In [1]:
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
import time
pd.set_option('display.max_rows', 200)
%matplotlib inline

# Import & Merge Data 

Imports 3 datasets used for project: (1) Master Proceedings as processed by Dunn, (2) Judge Bios, (3) Appeals

In [2]:
# define folder where data resides 
DATAFOLDER = "~/Documents/data-science-coursework/nyu-ml/project/"

### Master Proceedings (processed by Sagent/Dunn)

In [3]:
master_dunn = pd.read_csv(os.path.join(DATAFOLDER, 
                                       'data/AsylumAdj/data_for_model/_decision_scheduling_merge_final_converted.csv'), 
                          encoding='latin-1', low_memory=False) # gets UnicodeDecodeError otherwise 
master_dunn.rename(columns={'dec_type_string': 'original_dec_type_string', 
                            'dec_string': 'original_dec_string',
                            'grant': 'original_granted'}, 
                   inplace=True)
master_dunn.drop('Unnamed: 0', axis=1, inplace=True)
print(master_dunn.info()) 
print(master_dunn.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602500 entries, 0 to 602499
Columns: 181 entries, idncase to last_hearing_on_comp_date
dtypes: bool(1), float64(72), int64(38), object(70)
memory usage: 828.0+ MB
None
[u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', 'original_dec_type_string', u'_mdectype', u'outcome_recorded_in_field', 'original_dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'a

### Judge Bios 

In [4]:
judge_bio = pd.read_csv(os.path.join(DATAFOLDER, 'data/AsylumAdj/data/cleaned_judge_bios.csv')) 
judge_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Male_judge               367 non-null float64
Year_Appointed_SLR       367 non-null float64
Year_College_SLR         367 non-null float64
Year_Law_school_SLR      367 non-null float64
Government_Years_SLR     367 non-null float64
Govt_nonINS_SLR          367 non-null float64
INS_Years_SLR            367 non-null float64
Military_Years_SLR       367 non-null float64
NGO_Years_SLR            367 non-null float64
Privateprac_Years_SLR    367 non-null float64
Academia_Years_SLR       367 non-null float64
ij_code                  367 non-null object
dtypes: float64(11), object(1)
memory usage: 34.5+ KB


### Appeals

In [5]:
# import main table 
tblAppeal = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/tblAppeal.csv'), low_memory=False) 
tblAppeal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870388 entries, 0 to 870387
Data columns (total 17 columns):
idnAppeal             870388 non-null int64
idncase               868758 non-null float64
idnProceeding         776380 non-null float64
strAppealCategory     870388 non-null object
strAppealType         870388 non-null object
datAppealFiled        870226 non-null object
strFiledBy            870378 non-null object
datAttorneyE27        521008 non-null object
datBIADecision        847196 non-null object
strBIADecision        847180 non-null object
strBIADecisionType    822882 non-null object
strCaseType           824855 non-null object
strLang               773811 non-null object
strNat                777004 non-null object
strProceedingIHP      367364 non-null object
strCustody            666356 non-null object
strProbono            1880 non-null object
dtypes: float64(2), int64(1), object(14)
memory usage: 112.9+ MB


In [6]:
# import lookup tables 
bia_appeal_category = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'), 
                                    sheetname='BIA Appeal Category', skip_footer=7)
bia_appeal_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                sheetname='BIA Appeal Type', skip_footer=3)
bia_decision_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision type', skip_footer=2)
bia_decision_code = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision code', skip_footer=2)

In [7]:
# join them 
tblAppeal_df = tblAppeal.merge(bia_appeal_category, how='left', left_on='strAppealCategory', right_on='Code')\
                        .rename(columns={'Description': 'strAppealCategoryDesc'}).drop('Code', axis=1)\
                        .merge(bia_appeal_type, how='left', left_on='strAppealType', right_on='Code')\
                        .rename(columns={'Description': 'strAppealTypeDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_code, how='left', left_on='strBIADecision', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_type, how='left', left_on='strBIADecisionType', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionTypeDesc'}).drop('Code', axis=1)

In [8]:
# drop appeals with no case number, proceeding number, or decision 
tblAppeal.dropna(subset=['idncase', 'idnProceeding', 'strBIADecision'], inplace=True) 
tblAppeal = tblAppeal[tblAppeal['idnProceeding'] != 0] # drop zeros 

In [9]:
# some strBIADecision don't have corresponding code translations; also drop 
print(tblAppeal_df[pd.isnull(tblAppeal_df['strBIADecisionDesc'])]['strBIADecision'].value_counts()) 
tblAppeal_df.dropna(subset=['strBIADecisionDesc'], inplace=True)

DSO    5994
DED    2108
CPG    1704
ABC    1617
APD    1170
GRS     433
DNS     422
ADM     355
DMO     238
AFD     106
CPC      90
MB       89
SNC      84
RET      80
ADD      41
REV      24
TPD      16
CGR      16
WPD       5
SUP       4
          4
Name: strBIADecision, dtype: int64


In [10]:
print(tblAppeal_df.info()) 
tblAppeal_df.sample(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 832580 entries, 0 to 869457
Data columns (total 21 columns):
idnAppeal                 832580 non-null int64
idncase                   831001 non-null float64
idnProceeding             740863 non-null float64
strAppealCategory         832580 non-null object
strAppealType             832580 non-null object
datAppealFiled            832426 non-null object
strFiledBy                832570 non-null object
datAttorneyE27            494891 non-null object
datBIADecision            832580 non-null object
strBIADecision            832580 non-null object
strBIADecisionType        808914 non-null object
strCaseType               787158 non-null object
strLang                   738429 non-null object
strNat                    741485 non-null object
strProceedingIHP          358900 non-null object
strCustody                634772 non-null object
strProbono                1841 non-null object
strAppealCategoryDesc     828892 non-null object
strAppea

Unnamed: 0,idnAppeal,idncase,idnProceeding,strAppealCategory,strAppealType,datAppealFiled,strFiledBy,datAttorneyE27,datBIADecision,strBIADecision,...,strCaseType,strLang,strNat,strProceedingIHP,strCustody,strProbono,strAppealCategoryDesc,strAppealTypeDesc,strBIADecisionDesc,strBIADecisionTypeDesc
795227,4969777,6391909.0,5039448.0,IJ,Case Appeal,2011-12-22 00:00:00,A,2011-12-22 00:00:00,2013-04-04 00:00:00,DIS,...,RMV,WEN,CH,,N,,Appeal from Immigration Judge decision,Case Appeal,Dismiss Appeal/Affirm IJ's Decision,Regular Case
409991,4576055,4460027.0,3040111.0,IJ,Case Appeal,2003-04-30 00:00:00,A,2003-04-30 00:00:00,2004-12-01 00:00:00,DIS,...,RMV,CRE,HA,,N,,Appeal from Immigration Judge decision,Case Appeal,Dismiss Appeal/Affirm IJ's Decision,Percuriam Case
808991,4983649,5532732.0,3986666.0,IJ,MTR BIA,2012-05-21 00:00:00,A,2012-05-21 00:00:00,2012-06-22 00:00:00,TER,...,RMV,MAN,CH,,N,,Appeal from Immigration Judge decision,MTR BIA,Termination,Regular Case


### Merge Them 

In [11]:
# convert appeal indexes to integers for joins 
tblAppeal['idncase'] = tblAppeal['idncase'].astype(int) 
tblAppeal['idnProceeding'] = tblAppeal['idnProceeding'].astype(int) 

In [12]:
# joins datasets 
df = master_dunn.merge(tblAppeal_df, how='left', 
                       left_on=['idncase', 'idnproceeding'], right_on=['idncase', 'idnProceeding'])
df = df.merge(judge_bio, how='left', on='ij_code')
print(df.columns.tolist())
df.info()

[u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', 'original_dec_type_string', u'_mdectype', u'outcome_recorded_in_field', 'original_dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'appl_dec', u'_mappln', u'application_type_string', u'_mapplcode', u'application_dec_string', u'_mappldec', u'cityid', u'judgeid', u'natid', u'comp_year', u'comp_month', u'comp_day', u'comp_date', u'osc_year', u'osc_mont

# Summary Stats

In [13]:
# add/reformat common variables between appeals and non-appeals before splitting them 

df['judge_missing_bio'] = np.where(pd.isnull(df['Male_judge']), 1, 0)
df['ij_code_nat'] = df['ij_code'] + '_' + df['nat_string']

df['datAppealFiled_dt'] = pd.to_datetime(df['datAppealFiled'], errors='coerce')
df['datAppealFiled_year'] = df['datAppealFiled_dt'].dt.year
df['datAppealFiled_month'] = df['datAppealFiled_dt'].dt.month 
df['datAppealFiled_year_month'] = (df['datAppealFiled_year'] - 1970) + df['datAppealFiled_month']

df['datBIADecision_dt'] = pd.to_datetime(df['datBIADecision'], errors='coerce')
df['datBIADecision_year'] = df['datBIADecision_dt'].dt.year
df['datBIADecision_month'] = df['datBIADecision_dt'].dt.month 
df['datBIADecision_year_month'] = (df['datBIADecision_year'] - 1970) + df['datBIADecision_month']

In [14]:
# check % of original proceedings that were granted 
original_cases_total = df['idnproceeding'].nunique() 
original_cases_granted = df[df['original_granted'] == 1]['idnproceeding'].nunique() 
original_cases_granted_pct = float(original_cases_granted) / original_cases_total
print("Of the {} original proceedings, {} ({:.1f}%) were granted asylum.".format(original_cases_total, 
                                                                                 original_cases_granted,
                                                                                 100 * original_cases_granted_pct))

Of the 602500 original proceedings, 213731 (35.5%) were granted asylum.


In [15]:
# check % of denied proceedings that appealed 
denied_cases = df[df['original_granted'] == 0].copy() 
denied_cases['appealed'] = np.where(pd.notnull(denied_cases['idnAppeal']), 1, 0) 
denied_cases_total = denied_cases['idnproceeding'].nunique() 
denied_cases_appealed = denied_cases[denied_cases['appealed'] == 1]['idnproceeding'].nunique() 
denied_cases_appealed_pct = float(denied_cases_appealed) / denied_cases_total
print("Of the {} denied proceedings, {} ({:.1f}%) appealed the decision.".format(denied_cases_total, 
                                                                                 denied_cases_appealed,
                                                                                 100 * denied_cases_appealed_pct))

Of the 388769 denied proceedings, 260049 (66.9%) appealed the decision.


# Finalize Scope of Appeals

In this section we will: 
- Drop appeals outside relevant scope, defined to be ('Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA') 
- Deduplicate multiple appeals tied to the same proceeding (by taking the appeal with the last BIA Decision) 
- Drop appeals without mandatory features ('datAppealFiled_year', 'case_type_string') 
- Group appeal outcomes into 'positive' vs. 'negative' binary labels; a small subset deemed to be 'neutral' (e.g. dismissal due to incomplete paperwork) is also dropped. 

Note that we also implicitly dropped appeals made by government by subsetting appeals from the denied proceedings (i.e. government is likely to contest verdicts in favor of respondents rather than the opposite). 

In [17]:
# check appeal and case types 
appeals = denied_cases[denied_cases['appealed'] == 1]
appeals.groupby(['strAppealTypeDesc', 'case_type_string']).size().unstack().fillna(0) 

case_type_string,ASYLUM ONLY CASE,DEPORTATION,EXCLUSION,REMOVAL,WITHHOLDING ONLY
strAppealTypeDesc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Appeal of IJ MTR,91.0,2615.0,476.0,2927.0,6.0
Bond Appeal,10.0,524.0,2.0,2154.0,4.0
Bond MTR Reconsider,0.0,0.0,0.0,12.0,0.0
Bond MTR Reinstate,0.0,0.0,0.0,4.0,0.0
Bond MTR Reopen,0.0,0.0,0.0,10.0,0.0
Case Appeal,3718.0,64680.0,17779.0,167200.0,770.0
Circuit Court Remand,176.0,588.0,134.0,6804.0,22.0
Interlocutory Appeal,4.0,53.0,23.0,377.0,7.0
MTR BIA,1215.0,21485.0,5667.0,58336.0,82.0


In [18]:
# filter for relevant appeal types 
selected_appeal_types = ['Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA'] 
appeals = appeals[appeals['strAppealTypeDesc'].isin(selected_appeal_types)] 
print("After filtering for relevant appeal types, {} rows remain".format(len(appeals)))

After filtering for relevant appeal types, 355238 rows remain


In [19]:
# de-duplicate multiple appeals (each case-proceeding should be unique) by retaining the last appeal 
appeals = appeals.sort_values(by=['idncase', 'idnProceeding', 'datBIADecision_dt'], 
                              ascending=[True, True, False])
appeals.drop_duplicates(subset=['idncase', 'idnProceeding'], keep='first', inplace=True)
print("After deduplicating multiple appeals, {} rows remain".format(len(appeals)))

After deduplicating multiple appeals, 259029 rows remain


In [20]:
# drop appeals without mandatory features 
mandatory_features = ['datAppealFiled_year', 'case_type_string'] 
appeals.dropna(subset=mandatory_features, inplace=True)
print("After dropping appeals without mandatory features, {} rows remain".format(len(appeals)))

After dropping appeals without mandatory features, 258975 rows remain


In [21]:
# designate appeal decision type 
positive_labels = ['Background Check Remand', 'Grant With No Remand', 'Granted', 'Remand', 
                   'Sustain', 'Temporary Protected Status', 'Termination']
negative_labels = ['Denied', "Dismiss Appeal/Affirm IJ's Decision", 'Dismissed (Grant V/D 30 days)', 
                   'Dismissed (Voluntary Departure Granted)', 'Rejection', 'SUMMARY AFFIRMANCE/VD', 
                   'Summary Affirmance', 'Summary Dismiss', 'Summary Dismissal (O) Other', 
                   'Summary Dismissal (a) inad reason on appeal', 'Summary Dismissal - Both (a) & (e)']
appeals['granted'] = np.where(appeals['strBIADecisionDesc'].isin(positive_labels), 1, 
                              np.where(appeals['strBIADecisionDesc'].isin(negative_labels), 0, None))
appeals.dropna(subset=['granted'], inplace=True)
print("After dropping appeals with neutral outcomes, {} rows remain".format(len(appeals)))

After dropping appeals with neutral outcomes, 242466 rows remain


In [22]:
# summarize appeal outcomes 
total_appeals = len(appeals)
successful_appeals = appeals['granted'].sum() 
successful_appeals_pct = float(successful_appeals) / total_appeals
print("Of the {} appeals, {} ({:.1f}%) were successful.".format(total_appeals, successful_appeals, 
                                                                successful_appeals_pct * 100))

Of the 242466 appeals, 78522 (32.4%) were successful.


# Additional Feature Engineering

#### Group nationalities and judges with few samples  

In [None]:
def get_feature_values_to_retain(df, feature_name, min_samples): 
    """ Returns a list of feature values that meet min_samples """
    distinct_values = df[feature_name].value_counts() 
    retain_values = distinct_values[distinct_values >= min_samples].index.tolist() 
    print("{} distinct values of {} will be retained as unique values, remaining {} will be grouped as other.".format(
        len(retain_values), feature_name, len(distinct_values) - len(retain_values)))
    return retain_values 

In [None]:
# apply to judges 
ij_code_to_retain = get_feature_values_to_retain(appeals, feature_name='ij_code', min_samples=50)
appeals['ij_code_grouped'] = np.where(appeals['ij_code'].isin(ij_code_to_retain), appeals['ij_code'], 'other')
non_appeals['ij_code_grouped'] = np.where(non_appeals['ij_code'].isin(ij_code_to_retain), non_appeals['ij_code'], 'other')

In [None]:
# apply to nationalities 
nat_string_to_retain = get_feature_values_to_retain(appeals, feature_name='nat_string', min_samples=50)
appeals['nat_grouped'] = np.where(appeals['nat_string'].isin(nat_string_to_retain), appeals['nat_string'], 'other')
non_appeals['nat_grouped'] = np.where(non_appeals['nat_string'].isin(nat_string_to_retain), non_appeals['nat_string'], 'other')

In [None]:
# apply to judge-nationalities 
ij_code_nat_to_retain = get_feature_values_to_retain(appeals, feature_name='ij_code_nat', min_samples=50)
appeals['ij_code_nat_grouped'] = np.where(appeals['ij_code_nat'].isin(ij_code_nat_to_retain), appeals['ij_code_nat'], 'other')
non_appeals['ij_code_nat_grouped'] = np.where(non_appeals['ij_code_nat'].isin(ij_code_nat_to_retain), non_appeals['ij_code_nat'], 'other')

In [None]:
# apply to hearing_loc_code
hearing_loc_code_to_retain = get_feature_values_to_retain(appeals, feature_name='hearing_loc_code', min_samples=50)
appeals['hearing_loc_code_grouped'] = np.where(appeals['hearing_loc_code'].isin(hearing_loc_code_to_retain), appeals['hearing_loc_code'], 'other')
non_appeals['hearing_loc_code_grouped'] = np.where(non_appeals['hearing_loc_code'].isin(hearing_loc_code_to_retain), non_appeals['hearing_loc_code'], 'other')

#### Judge Experience 

As proxied by two variables:  
- years_since_appointed = YEAR(Original proceeding decision) - YEAR(Judge Appointment) 
- years_since_law_school = YEAR(Original proceeding decision) - YEAR(Law School) 

In [None]:
def get_time_delta(df, feature_year_before, feature_year_after, default_value=-1): 
    """ Computes difference between feature_year_before and feature_year_after, 
        filling NaNs and negative values with -1 """
    time_delta = df[feature_year_after] - df[feature_year_before]
    time_delta = np.where((time_delta < 0) | pd.isnull(time_delta), default_value, time_delta)
    return time_delta

In [None]:
# years since judge appointment 
appeals['years_since_judge_appointment'] = get_time_delta(appeals, 'Year_Appointed_SLR', 'comp_year')
non_appeals['years_since_judge_appointment'] = get_time_delta(non_appeals, 'Year_Appointed_SLR', 'comp_year')

In [None]:
# years since law school 
appeals['years_since_law_school'] = get_time_delta(appeals, 'Year_Law_school_SLR', 'comp_year')
non_appeals['years_since_law_school'] = get_time_delta(non_appeals, 'Year_Law_school_SLR', 'comp_year')

#### Average Appeal Grant Rate from Preceeding N Appeals

In [None]:
def break_into_chunks(data, dimension, max_chunk): 
    """ Returns a dictionary of lists to instruct breaking up dataset into suitable chunks, 
        where resulting rows from self-join on ij_code does not exceed max_df_rows """
    dimensions = pd.DataFrame(data.groupby(dimension).size().sort_values(ascending=False)) 
    dimensions = dimensions.rename(columns={0: 'rows'}).reset_index() 
    dimensions['self_join'] = dimensions['rows'] ** 2
    
    # stop if dimension has too many rows exceeding max_chunk 
    exceeds_max_chunk = dimensions[dimensions['self_join'] > max_chunk]
    if len(exceeds_max_chunk) > 0: 
        print(exceeds_max_chunk)
        raise ValueError('Dimension has too many rows!') 
    else: pass 
    
    dimensions['self_join_cumulative'] = dimensions['self_join'].cumsum() 
    dimensions['chunk'] = np.floor(dimensions['self_join_cumulative'] / max_chunk).astype(int)
    chunk_assignments = dimensions.groupby('chunk')[dimension].apply(list).to_dict()
    print("Split {} labels in {} dimension into {} chunks...".format(len(dimensions), dimension, 
                                                                                  len(chunk_assignments)))
    return chunk_assignments

def compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n): 
    """ Run compute for a given chunk of data """
    df = data_chunk.merge(ref_chunk, how='left', on=dimension)
    results = df[df['datBIADecision_dt'] < df['datAppealFiled_dt']].groupby('idnAppeal').apply(
        lambda f: f.head(last_n)['granted'].mean()) 
    return results 

def compute_last_n_decisions(data, dimension, new_feature_name, max_chunk=50000000, last_n=10): 
    """ Unified method to compute last n decisions """
    
    # get chunk assignments 
    chunk_assignments = break_into_chunks(data, dimension, max_chunk)
    
    # initialize empty list 
    results = [] 
    start = time.time() 
    
    # loop through each chunk 
    for chunk, selected in chunk_assignments.iteritems(): 
        start_chunk = time.time() 
        data_variables = ['idnAppeal', 'datAppealFiled_dt'] + [dimension]
        ref_variables = ['datBIADecision_dt', 'granted'] + [dimension] 
        data_chunk = data[data[dimension].isin(selected)][data_variables]
        ref_chunk = data[data[dimension].isin(selected)][ref_variables].sort_values(
            by=[dimension] + ['datBIADecision_dt'], ascending=[True, False])  
        result = compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n)
        results.append(result)
        print("Chunk {} completed in {} seconds".format(chunk, time.time() - start_chunk))
        
    print("DONE: Last {} decisions computed for {} dimension in {} seconds".format(last_n, dimension, 
                                                                             time.time() - start))
    
    return pd.DataFrame(pd.concat(results), columns=[new_feature_name])  

In [None]:
appeals_subset = appeals[['idnAppeal', 'datAppealFiled_dt', 'datBIADecision_dt', 'granted', 
                          'ij_code_grouped', 'nat_grouped', 'ij_code_nat']].copy()

In [None]:
last_10_appeal_grant_by_judge = compute_last_n_decisions(appeals_subset, 'ij_code_grouped', 
                                                         'last_10_appeal_grant_by_judge', 
                                                         max_chunk=50000000, last_n=10)
appeals = appeals.merge(last_10_appeal_grant_by_judge, how='left', left_on='idnAppeal', right_index=True)

In [None]:
last_10_appeal_grant_by_judge_nat = compute_last_n_decisions(appeals_subset, 'ij_code_nat', 
                                                             'last_10_appeal_grant_by_judge_nat', 
                                                             max_chunk=50000000, last_n=10)
appeals = appeals.merge(last_10_appeal_grant_by_judge_nat, how='left', left_on='idnAppeal', right_index=True)

# Output to CSV 

In [None]:
print(appeals.columns.tolist())

In [None]:
# subset variables to keep 
variables_to_keep = ['idncase', 'idnproceeding', 'idnAppeal', 'nat_string', 'ij_code', 'attorney_flag',
                     'lawyer', 'defensive', 'affirmative', 'oral', 'written', 'base_city_code', 'base_city_state', 
                     'hearing_loc_city', 'hearing_loc_code', 'hearing_loc_state', 'venue_change', 
                     'comp_year', 'comp_month', 'comp_day', 
                     'osc_year', 'osc_month', 'osc_day', 'input_year', 'input_month', 'input_day',
                     'adj_time_start', 'case_type_string', 'original_dec_type_string', 'original_dec_string',
                     'deport', 'relief_granted', 'remove', 'terminated', 'voluntary_departure', 
                     'deport_form', 'voluntary_form', 'deportation_proceeding', 
                     'exclusion_proceeding', 'removal_proceeding', 'asylum_only_proceeding', 
                     'withholding_only_proceeding', 'strCustody', 'strProbono', 
                     'datAppealFiled_year', 'datAppealFiled_month', 'datAppealFiled_year_month', 
                     'datBIADecision_dt', 'datBIADecision_year', 'datBIADecision_month', 
                     'datBIADecision_year_month', 'ij_code_grouped', 'nat_grouped', 'ij_code_nat', 
                     'ij_code_nat_grouped', 'hearing_loc_code_grouped', 
                     'years_since_judge_appointment', 'years_since_law_school', 
                     'last_10_appeal_grant_by_judge', 'last_10_appeal_grant_by_judge_nat'] 

In [None]:
# output to csv 
appeals_fp = os.path.join(DATAFOLDER, 'data_for_model/appeals_data_2018-05-13-v6.csv') 
appeals[variables_to_keep + ['granted']].to_csv(appeals_fp, encoding='utf-8', index=False)
appeals.info() 

In [None]:
# MAYBE TODO: output non-appeals for prediction as well 

# Ignore Below

In [None]:
# helper methods for Bayesian adjustment 

# def calibrate_beta_priors(prior_mean, effective_sample_size): 
#     """
#     Takes aggregate rate and return Beta priors (alpha, beta) with prior mean approximating aggregate rate
#     with respect to effective sample size chosen 
#     """
    
#     rounded_rate = np.round(prior_mean, 1)
#     alpha = int(rounded_rate * effective_sample_size) 
#     beta = effective_sample_size - alpha 
    
#     return alpha, beta

# def compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total): 
#     """ 
#     Takes Beta priors (alpha, beta) along with observed data (num_total, num_positives) 
#     and returns posterior mean 
#     """
    
#     updated_alpha = alpha_prior + num_positives 
#     updated_beta = beta_prior + num_total - num_positives 
    
#     posterior_mean = float(updated_alpha) / (updated_alpha + updated_beta)
    
#     return posterior_mean 

# def compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total): 
#     """ 
#     Takes Beta priors (alpha, beta) along with observed data (num_total, num_positives) 
#     and returns posterior mean 
#     """
    
#     updated_alpha = alpha_prior + num_positives 
#     updated_beta = beta_prior + num_total - num_positives 
    
#     posterior_mean = float(updated_alpha) / (updated_alpha + updated_beta)
    
#     return posterior_mean

# def get_beta_adj_rate(prior_mean, num_positives, num_total, prior_effective_size=10): 
#     """ 
#     Takes aggregate mean as a float (from 0 to 1), num_total (integer), and num_positives (integer) 
#     and return the 'Beta-adjusted' rate. 
#     Example: if in total 30% of Chinese nationality cases were granted, and a specific judge saw 20 cases 
#     and granted 14 of them, input aggregate_mean=0.3, num_total=20, and num_positives=14 
#     """
    
#     if type(prior_mean) is not float: 
#         raise ValueError("Please enter a float for prior mean!")
        
#     if prior_mean < 0 or prior_mean > 1: 
#         raise ValueError("Prior mean must be between 0 and 1!")
            
#     alpha_prior, beta_prior = calibrate_beta_priors(prior_mean, prior_effective_size)
#     posterior_mean = compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total)
    
#     return posterior_mean

In [None]:
# get_beta_adj_rate(prior_mean=.3, num_positives=14, num_total=20, prior_effective_size=0)

In [None]:
# def last_year_grant_rate_by_feature(data, feature, last_year_grant_rate, prior_effective_size=10):
#     df = data.groupby([feature, 'datBIADecision_dt_year', 'granted'])\
#              .size().unstack().fillna(0).reset_index()\
#              .rename(columns={0:'not_granted', 1:'granted'})
#     df = df.merge(last_year_grant_rate, how='left', left_on='datBIADecision_dt_year', right_index=True)\
#            .rename(columns={'datBIADecision_dt_year': 'datBIADecision_dt_year_'}) 
#     new_feature_name = 'last_year_adj_grant_rate_by_{}'.format(feature)
#     df[new_feature_name] = df.apply(
#         lambda r: get_beta_adj_rate(r['last_year_grant_rate'], r['granted'], 
#                                     r['granted'] + r['not_granted'], prior_effective_size), axis=1)
#     results = data.merge(df[['datBIADecision_dt_year_', feature, new_feature_name]], how='left', 
#                          left_on = ['datAppealFiled_dt_year_before', feature],
#                          right_on = ['datBIADecision_dt_year_', feature])\
#                   .drop('datBIADecision_dt_year_', axis=1)  
                    
#     return results 

In [None]:
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'ij_code_grouped', last_year_grant_rate,
#                                                          prior_effective_size=10)

In [None]:
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'nat_string_grouped', last_year_grant_rate,
#                                                          prior_effective_size=10)

In [None]:
# appeals_with_last_year['ij_code_nat'] = appeals_with_last_year['ij_code_grouped'] + '_' + appeals_with_last_year['nat_string_grouped']
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'ij_code_nat', last_year_grant_rate,
#                                                          prior_effective_size=10)