In [1]:
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
import time
pd.set_option('display.max_rows', 200)
%matplotlib inline

# Import & Merge Data 

Imports 3 datasets used for project: (1) Master Proceedings as processed by Dunn, (2) Judge Bios, (3) Appeals

In [2]:
# define folder where data resides 
DATAFOLDER = "~/Documents/data-science-coursework/nyu-ml/project/"

### Master Proceedings (processed by Sagent/Dunn)

In [3]:
master_dunn = pd.read_csv(os.path.join(DATAFOLDER, 
                                       'data/AsylumAdj/data_for_model/_decision_scheduling_merge_final_converted.csv'), 
                          encoding='latin-1', low_memory=False) # gets UnicodeDecodeError otherwise 
master_dunn.rename(columns={'dec_type_string': 'original_dec_type_string', 
                            'dec_string': 'original_dec_string',
                            'grant': 'original_granted'}, 
                   inplace=True)
master_dunn.drop('Unnamed: 0', axis=1, inplace=True)
print(master_dunn.info()) 
print(master_dunn.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602500 entries, 0 to 602499
Columns: 181 entries, idncase to last_hearing_on_comp_date
dtypes: bool(1), float64(72), int64(38), object(70)
memory usage: 828.0+ MB
None
[u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', 'original_dec_type_string', u'_mdectype', u'outcome_recorded_in_field', 'original_dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'a

### Judge Bios 

In [4]:
judge_bio = pd.read_csv(os.path.join(DATAFOLDER, 'data/AsylumAdj/data/cleaned_judge_bios.csv')) 
judge_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Male_judge               367 non-null float64
Year_Appointed_SLR       367 non-null float64
Year_College_SLR         367 non-null float64
Year_Law_school_SLR      367 non-null float64
Government_Years_SLR     367 non-null float64
Govt_nonINS_SLR          367 non-null float64
INS_Years_SLR            367 non-null float64
Military_Years_SLR       367 non-null float64
NGO_Years_SLR            367 non-null float64
Privateprac_Years_SLR    367 non-null float64
Academia_Years_SLR       367 non-null float64
ij_code                  367 non-null object
dtypes: float64(11), object(1)
memory usage: 34.5+ KB


### Appeals

In [5]:
# import main table 
tblAppeal = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/tblAppeal.csv'), low_memory=False) 
tblAppeal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870388 entries, 0 to 870387
Data columns (total 17 columns):
idnAppeal             870388 non-null int64
idncase               868758 non-null float64
idnProceeding         776380 non-null float64
strAppealCategory     870388 non-null object
strAppealType         870388 non-null object
datAppealFiled        870226 non-null object
strFiledBy            870378 non-null object
datAttorneyE27        521008 non-null object
datBIADecision        847196 non-null object
strBIADecision        847180 non-null object
strBIADecisionType    822882 non-null object
strCaseType           824855 non-null object
strLang               773811 non-null object
strNat                777004 non-null object
strProceedingIHP      367364 non-null object
strCustody            666356 non-null object
strProbono            1880 non-null object
dtypes: float64(2), int64(1), object(14)
memory usage: 112.9+ MB


In [6]:
# import lookup tables (Python 2.7)
bia_appeal_category = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'), 
                                    sheetname='BIA Appeal Category', skip_footer=7)
bia_appeal_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                sheetname='BIA Appeal Type', skip_footer=3)
bia_decision_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision type', skip_footer=2)
bia_decision_code = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision code', skip_footer=2)

In [7]:
# join them 
tblAppeal_df = tblAppeal.merge(bia_appeal_category, how='left', left_on='strAppealCategory', right_on='Code')\
                        .rename(columns={'Description': 'strAppealCategoryDesc'}).drop('Code', axis=1)\
                        .merge(bia_appeal_type, how='left', left_on='strAppealType', right_on='Code')\
                        .rename(columns={'Description': 'strAppealTypeDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_code, how='left', left_on='strBIADecision', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_type, how='left', left_on='strBIADecisionType', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionTypeDesc'}).drop('Code', axis=1)

In [8]:
# drop appeals with no case number, proceeding number, or decision 
tblAppeal.dropna(subset=['idncase', 'idnProceeding', 'strBIADecision'], inplace=True) 
tblAppeal = tblAppeal[tblAppeal['idnProceeding'] != 0] # drop zeros 

In [9]:
# some strBIADecision don't have corresponding code translations; also drop 
print(tblAppeal_df[pd.isnull(tblAppeal_df['strBIADecisionDesc'])]['strBIADecision'].value_counts()) 
tblAppeal_df.dropna(subset=['strBIADecisionDesc'], inplace=True)

DSO    5994
DED    2108
CPG    1704
ABC    1617
APD    1170
GRS     433
DNS     422
ADM     355
DMO     238
AFD     106
CPC      90
MB       89
SNC      84
RET      80
ADD      41
REV      24
TPD      16
CGR      16
WPD       5
SUP       4
          4
Name: strBIADecision, dtype: int64


In [10]:
print(tblAppeal_df.info()) 
tblAppeal_df.sample(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 832580 entries, 0 to 869457
Data columns (total 21 columns):
idnAppeal                 832580 non-null int64
idncase                   831001 non-null float64
idnProceeding             740863 non-null float64
strAppealCategory         832580 non-null object
strAppealType             832580 non-null object
datAppealFiled            832426 non-null object
strFiledBy                832570 non-null object
datAttorneyE27            494891 non-null object
datBIADecision            832580 non-null object
strBIADecision            832580 non-null object
strBIADecisionType        808914 non-null object
strCaseType               787158 non-null object
strLang                   738429 non-null object
strNat                    741485 non-null object
strProceedingIHP          358900 non-null object
strCustody                634772 non-null object
strProbono                1841 non-null object
strAppealCategoryDesc     828892 non-null object
strAppea

Unnamed: 0,idnAppeal,idncase,idnProceeding,strAppealCategory,strAppealType,datAppealFiled,strFiledBy,datAttorneyE27,datBIADecision,strBIADecision,...,strCaseType,strLang,strNat,strProceedingIHP,strCustody,strProbono,strAppealCategoryDesc,strAppealTypeDesc,strBIADecisionDesc,strBIADecisionTypeDesc
109478,109608,2231100.0,348696.0,IJ,MTR BIA,1991-10-15 00:00:00,A,,1991-11-15 00:00:00,DEN,...,DEP,ENG,NI,*,D,,Appeal from Immigration Judge decision,MTR BIA,Denied,Regular Case
572276,4744359,5495056.0,3909424.0,IJ,Case Appeal,2006-06-19 00:00:00,A,,2006-09-01 00:00:00,WDL,...,RMV,ENG,MX,,R,,Appeal from Immigration Judge decision,Case Appeal,Withdrawl of Appeal,Percuriam Case
55008,55068,2968321.0,1298558.0,IJ,Case Appeal,1994-05-03 00:00:00,A,,2000-11-15 00:00:00,DIS,...,DEP,ENG,IR,,,,Appeal from Immigration Judge decision,Case Appeal,Dismiss Appeal/Affirm IJ's Decision,Regular Case


### Merge Them 

In [11]:
# convert appeal indexes to integers for joins 
tblAppeal['idncase'] = tblAppeal['idncase'].astype(int) 
tblAppeal['idnProceeding'] = tblAppeal['idnProceeding'].astype(int) 

In [12]:
# joins datasets 
df = master_dunn.merge(tblAppeal_df, how='left', 
                       left_on=['idncase', 'idnproceeding'], right_on=['idncase', 'idnProceeding'])
df = df.merge(judge_bio, how='left', on='ij_code')
print(df.columns.tolist())
df.info()

[u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', 'original_dec_type_string', u'_mdectype', u'outcome_recorded_in_field', 'original_dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'appl_dec', u'_mappln', u'application_type_string', u'_mapplcode', u'application_dec_string', u'_mappldec', u'cityid', u'judgeid', u'natid', u'comp_year', u'comp_month', u'comp_day', u'comp_date', u'osc_year', u'osc_mont

# Summarize Funnel Stats

In [13]:
# add/reformat common variables between appeals and non-appeals before splitting them 

df['judge_missing_bio'] = np.where(pd.isnull(df['Male_judge']), 1, 0)
df['ij_code_nat'] = df['ij_code'] + '_' + df['nat_string']
df['comp_dt'] = pd.to_datetime(dict(year=df['comp_year'], month=df['comp_month'], day=df['comp_day']))
df['comp_year_month'] = 12 * (df['comp_year'] - 1970) + df['comp_month']

df['datAppealFiled_dt'] = pd.to_datetime(df['datAppealFiled'], errors='coerce')
df['datAppealFiled_year'] = df['datAppealFiled_dt'].dt.year
df['datAppealFiled_month'] = df['datAppealFiled_dt'].dt.month 
df['datAppealFiled_year_month'] = 12 * (df['datAppealFiled_year'] - 1970) + df['datAppealFiled_month']

df['datBIADecision_dt'] = pd.to_datetime(df['datBIADecision'], errors='coerce')
df['datBIADecision_year'] = df['datBIADecision_dt'].dt.year
df['datBIADecision_month'] = df['datBIADecision_dt'].dt.month 
df['datBIADecision_year_month'] = 12 * (df['datBIADecision_year'] - 1970) + df['datBIADecision_month']

In [14]:
# check % of original proceedings that were granted 
original_cases_total = df['idnproceeding'].nunique() 
original_cases_granted = df[df['original_granted'] == 1]['idnproceeding'].nunique() 
original_cases_granted_pct = float(original_cases_granted) / original_cases_total
print("Of the {} original proceedings, {} ({:.1f}%) were granted asylum.".format(original_cases_total, 
                                                                                 original_cases_granted,
                                                                                 100 * original_cases_granted_pct))

Of the 602500 original proceedings, 213731 (35.5%) were granted asylum.


In [15]:
# check % of denied proceedings that appealed 
denied_cases = df[df['original_granted'] == 0].copy() 
denied_cases['appealed'] = np.where(pd.notnull(denied_cases['idnAppeal']), 1, 0) 
denied_cases_total = denied_cases['idnproceeding'].nunique() 
denied_cases_appealed = denied_cases[denied_cases['appealed'] == 1]['idnproceeding'].nunique() 
denied_cases_appealed_pct = float(denied_cases_appealed) / denied_cases_total
print("Of the {} denied proceedings, {} ({:.1f}%) appealed the decision.".format(denied_cases_total, 
                                                                                 denied_cases_appealed,
                                                                                 100 * denied_cases_appealed_pct))

Of the 388769 denied proceedings, 260049 (66.9%) appealed the decision.


In [16]:
# get appeals vs. non-appeals datasets 
appeals = denied_cases[denied_cases['appealed'] == 1].copy()
non_appeals = denied_cases[denied_cases['appealed'] == 0].copy()

# Finalize Scope of Appeals

In this section we will: 
- Drop appeals outside relevant scope, defined to be ('Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA') 
- Deduplicate multiple appeals tied to the same proceeding (by taking the appeal with the last BIA Decision) 
- Drop appeals without mandatory features ('datAppealFiled_year', 'case_type_string') 
- Group appeal outcomes into 'positive' vs. 'negative' binary labels; a small subset deemed to be 'neutral' (e.g. dismissal due to incomplete paperwork) is also dropped. 

Note that we also implicitly dropped appeals made by government by subsetting appeals from the denied proceedings (i.e. government is likely to contest verdicts in favor of respondents rather than the opposite). 

In [17]:
# check appeal and case types 
appeals.groupby(['strAppealTypeDesc', 'case_type_string']).size().unstack().fillna(0) 

case_type_string,ASYLUM ONLY CASE,DEPORTATION,EXCLUSION,REMOVAL,WITHHOLDING ONLY
strAppealTypeDesc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Appeal of IJ MTR,91.0,2615.0,476.0,2927.0,6.0
Bond Appeal,10.0,524.0,2.0,2154.0,4.0
Bond MTR Reconsider,0.0,0.0,0.0,12.0,0.0
Bond MTR Reinstate,0.0,0.0,0.0,4.0,0.0
Bond MTR Reopen,0.0,0.0,0.0,10.0,0.0
Case Appeal,3718.0,64680.0,17779.0,167200.0,770.0
Circuit Court Remand,176.0,588.0,134.0,6804.0,22.0
Interlocutory Appeal,4.0,53.0,23.0,377.0,7.0
MTR BIA,1215.0,21485.0,5667.0,58336.0,82.0


In [18]:
# filter for relevant appeal types 
selected_appeal_types = ['Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA'] 
appeals = appeals[appeals['strAppealTypeDesc'].isin(selected_appeal_types)] 
print("After filtering for relevant appeal types, {} rows remain".format(len(appeals)))

After filtering for relevant appeal types, 355238 rows remain


In [19]:
# de-duplicate multiple appeals (each case-proceeding should be unique) by retaining the last appeal 
appeals = appeals.sort_values(by=['idncase', 'idnProceeding', 'datBIADecision_dt'], 
                              ascending=[True, True, False])
appeals.drop_duplicates(subset=['idncase', 'idnProceeding'], keep='first', inplace=True)
print("After deduplicating multiple appeals, {} rows remain".format(len(appeals)))

After deduplicating multiple appeals, 259029 rows remain


In [20]:
# drop appeals without mandatory features 
mandatory_features = ['datAppealFiled_year', 'case_type_string'] 
appeals.dropna(subset=mandatory_features, inplace=True)
print("After dropping appeals without mandatory features, {} rows remain".format(len(appeals)))

After dropping appeals without mandatory features, 258975 rows remain


In [21]:
# designate appeal decision type 
positive_labels = ['Background Check Remand', 'Grant With No Remand', 'Granted', 'Remand', 
                   'Sustain', 'Temporary Protected Status', 'Termination']
negative_labels = ['Denied', "Dismiss Appeal/Affirm IJ's Decision", 'Dismissed (Grant V/D 30 days)', 
                   'Dismissed (Voluntary Departure Granted)', 'Rejection', 'SUMMARY AFFIRMANCE/VD', 
                   'Summary Affirmance', 'Summary Dismiss', 'Summary Dismissal (O) Other', 
                   'Summary Dismissal (a) inad reason on appeal', 'Summary Dismissal - Both (a) & (e)']
appeals['granted'] = np.where(appeals['strBIADecisionDesc'].isin(positive_labels), 1, 
                              np.where(appeals['strBIADecisionDesc'].isin(negative_labels), 0, None))
appeals.dropna(subset=['granted'], inplace=True)
appeals['granted'] = appeals['granted'].astype(int)
print("After dropping appeals with neutral outcomes, {} rows remain".format(len(appeals)))

After dropping appeals with neutral outcomes, 242466 rows remain


In [22]:
# summarize appeal outcomes 
total_appeals = len(appeals)
successful_appeals = appeals['granted'].sum() 
successful_appeals_pct = float(successful_appeals) / total_appeals
print("Of the {} appeals, {} ({:.1f}%) were successful.".format(total_appeals, successful_appeals, 
                                                                successful_appeals_pct * 100))

Of the 242466 appeals, 78522 (32.4%) were successful.


# Additional Feature Engineering

#### Group nationalities and judges with few samples  

In [23]:
def get_feature_values_to_retain(df, feature_name, min_samples): 
    """ Returns a list of feature values that meet min_samples """
    distinct_values = df[feature_name].value_counts() 
    retain_values = distinct_values[distinct_values >= min_samples].index.tolist() 
    print("{} distinct values of {} will be retained as unique values, remaining {} will be grouped as other.".format(
        len(retain_values), feature_name, len(distinct_values) - len(retain_values)))
    return retain_values 

In [24]:
# apply to judges 
ij_code_to_retain = get_feature_values_to_retain(appeals, feature_name='ij_code', min_samples=50)
appeals['ij_code_grouped'] = np.where(appeals['ij_code'].isin(ij_code_to_retain), appeals['ij_code'], 'other')
non_appeals['ij_code_grouped'] = np.where(non_appeals['ij_code'].isin(ij_code_to_retain), non_appeals['ij_code'], 'other')

368 distinct values of ij_code will be retained as unique values, remaining 53 will be grouped as other.


In [25]:
# apply to nationalities 
nat_string_to_retain = get_feature_values_to_retain(appeals, feature_name='nat_string', min_samples=50)
appeals['nat_grouped'] = np.where(appeals['nat_string'].isin(nat_string_to_retain), appeals['nat_string'], 'other')
non_appeals['nat_grouped'] = np.where(non_appeals['nat_string'].isin(nat_string_to_retain), non_appeals['nat_string'], 'other')

129 distinct values of nat_string will be retained as unique values, remaining 88 will be grouped as other.


In [26]:
# apply to judge-nationalities 
ij_code_nat_to_retain = get_feature_values_to_retain(appeals, feature_name='ij_code_nat', min_samples=50)
appeals['ij_code_nat_grouped'] = np.where(appeals['ij_code_nat'].isin(ij_code_nat_to_retain), appeals['ij_code_nat'], 'other')
non_appeals['ij_code_nat_grouped'] = np.where(non_appeals['ij_code_nat'].isin(ij_code_nat_to_retain), non_appeals['ij_code_nat'], 'other')

861 distinct values of ij_code_nat will be retained as unique values, remaining 18189 will be grouped as other.


In [27]:
# apply to lang 
lang_to_retain = get_feature_values_to_retain(appeals, feature_name='lang', min_samples=50)
appeals['lang_grouped'] = np.where(appeals['lang'].isin(lang_to_retain), appeals['lang'], 'other')
non_appeals['lang_grouped'] = np.where(non_appeals['lang'].isin(lang_to_retain), non_appeals['lang'], 'other')

91 distinct values of lang will be retained as unique values, remaining 206 will be grouped as other.


#### Judge Experience 

As proxied by two variables:  
- years_since_appointed = YEAR(Original proceeding decision) - YEAR(Judge Appointment) 
- years_since_law_school = YEAR(Original proceeding decision) - YEAR(Law School) 

In [28]:
def get_time_delta(df, before, after, default_value=-1): 
    """ Computes difference between feature_year_before and feature_year_after, 
        filling NaNs and negative values with -1 """
    try: 
        if (df[before].dtype == 'float' or df[before].dtype == 'int') or (df[after].dtype == 'float' or df[after].dtype == 'int'): 
            time_delta = df[after] - df[before]
        elif df[before].dtype == 'datetime64[ns]' and df[after].dtype == 'datetime64[ns]': 
            time_delta = (df[after] - df[before]).dt.days
        time_delta = np.where((time_delta < 0) | pd.isnull(time_delta), default_value, time_delta)
    except: 
        raise ValueError("Please use same datatype for 'before' and 'after'.") 
    
    return time_delta

In [29]:
# years since judge appointment 
appeals['years_since_judge_appointment'] = get_time_delta(appeals, 'Year_Appointed_SLR', 'comp_year')
non_appeals['years_since_judge_appointment'] = get_time_delta(non_appeals, 'Year_Appointed_SLR', 'comp_year')

In [30]:
# years since law school 
appeals['years_since_law_school'] = get_time_delta(appeals, 'Year_Law_school_SLR', 'comp_year')
non_appeals['years_since_law_school'] = get_time_delta(non_appeals, 'Year_Law_school_SLR', 'comp_year')

#### Time Elapsed Between OSC vs. Input vs. Comp vs. Appeal dates

In [31]:
# osc is when charge is filed, input date is when proceeding began, and comp date is when decision/ruling was made
appeals['appeal_days_elapsed_since_comp_date'] = get_time_delta(appeals, "comp_dt", "datAppealFiled_dt")
appeals['comp_days_elasped_since_input_date'] = get_time_delta(appeals, "input_date", "comp_date")
appeals['input_days_elapsed_since_osc_date'] = get_time_delta(appeals, "osc_date", "input_date")
non_appeals['comp_days_elasped_since_input_date'] = get_time_delta(non_appeals, "input_date", "comp_date")
non_appeals['input_days_elapsed_since_osc_date'] = get_time_delta(non_appeals, "osc_date", "input_date")

In [32]:
# Since non-appeals don't have appeal dates, we assume they would have filed 28 days (median of appeals) after comp date
non_appeals['appeal_days_elapsed_since_comp_date'] = appeals['appeal_days_elapsed_since_comp_date'].median()
non_appeals['datAppealFiled_dt'] = non_appeals['comp_dt'] + pd.to_timedelta(non_appeals['appeal_days_elapsed_since_comp_date'], unit='D')
non_appeals['datAppealFiled_year'] = non_appeals['datAppealFiled_dt'].dt.year
non_appeals['datAppealFiled_month'] = non_appeals['datAppealFiled_dt'].dt.month 
non_appeals['datAppealFiled_year_month'] = (non_appeals['datAppealFiled_year'] - 1970) + non_appeals['datAppealFiled_month']

#### Mismatch between Base and Hearing Locations

In [33]:
def check_hearing_loc_match_base(row): 
    """ Checks whether base and hearing location are the same, different city, or different state """
    if pd.isnull(row['base_city_state']) | pd.isnull(row['hearing_loc_state']):
        return 'missing_info'
    elif row['base_city_code'] == row['hearing_loc_code']:
        return 'same_city'
    elif row['base_city_state'] == row['hearing_loc_state']:
        return 'diff_city_same_state'
    else: 
        return 'diff_state'
    
appeals['hearing_loc_match_base'] = appeals.apply(check_hearing_loc_match_base, axis=1)
non_appeals['hearing_loc_match_base'] = non_appeals.apply(check_hearing_loc_match_base, axis=1)
appeals['hearing_loc_match_base'].value_counts()

same_city               221358
diff_city_same_state     17859
diff_state                3237
missing_info                12
Name: hearing_loc_match_base, dtype: int64

#### Average Appeal Grant Rate from Preceeding N Appeals

In [34]:
def break_into_chunks(data, dimension, max_chunk): 
    """ Returns a dictionary of lists to instruct breaking up dataset into suitable chunks, 
        where resulting rows from self-join on ij_code does not exceed max_df_rows """
    dimensions = pd.DataFrame(data.groupby(dimension).size().sort_values(ascending=False)) 
    dimensions = dimensions.rename(columns={0: 'rows'}).reset_index() 
    dimensions['self_join'] = dimensions['rows'] ** 2
    
    # stop if dimension has too many rows exceeding max_chunk 
    exceeds_max_chunk = dimensions[dimensions['self_join'] > max_chunk]
    if len(exceeds_max_chunk) > 0: 
        print(exceeds_max_chunk)
        raise ValueError('Dimension has too many rows!') 
    else: pass 
    
    dimensions['self_join_cumulative'] = dimensions['self_join'].cumsum() 
    dimensions['chunk'] = np.floor(dimensions['self_join_cumulative'] / max_chunk).astype(int)
    chunk_assignments = dimensions.groupby('chunk')[dimension].apply(list).to_dict()
    print("Split {} labels in {} dimension into {} chunks...".format(len(dimensions), dimension, 
                                                                                  len(chunk_assignments)))
    return chunk_assignments

def compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n): 
    """ Run compute for a given chunk of data """
    df = data_chunk.merge(ref_chunk, how='left', on=dimension)
    results = df[df['datBIADecision_dt'] < df['datAppealFiled_dt']].groupby('idnproceeding').apply(
        lambda f: f.head(last_n)['granted'].mean()) 
    return results 

def compute_last_n_decisions(data, ref, dimension, new_feature_name, max_chunk=50000000, last_n=10): 
    """ Unified method to compute last n decisions """
    
    # get chunk assignments 
    chunk_assignments = break_into_chunks(data, dimension, max_chunk)
    
    # initialize empty list 
    results = [] 
    start = time.time() 
    
    # loop through each chunk 
    for chunk, selected in chunk_assignments.iteritems(): 
        start_chunk = time.time() 
        data_variables = ['idnproceeding', 'datAppealFiled_dt'] + [dimension]
        ref_variables = ['datBIADecision_dt', 'granted'] + [dimension] 
        data_chunk = data[data[dimension].isin(selected)][data_variables]
        ref_chunk = ref[ref[dimension].isin(selected)][ref_variables].sort_values(
            by=[dimension] + ['datBIADecision_dt'], ascending=[True, False])  
        result = compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n)
        results.append(result)
        print("Chunk {} completed in {} seconds".format(chunk, time.time() - start_chunk))
        
    print("DONE: Last {} decisions computed for {} dimension in {} seconds".format(last_n, dimension, 
                                                                             time.time() - start))
    
    return pd.DataFrame(pd.concat(results), columns=[new_feature_name])  

def add_last_n_decisions(data, ref, dimension, new_feature_name, last_n=10, max_chunk=50000000):
    """ Takes full dataframe, adds last n decisions as a new column, returns new df """ 
    last_n_grant_rate = compute_last_n_decisions(data, ref, dimension, new_feature_name, max_chunk, last_n)
    df = data.merge(last_n_grant_rate, how='left', left_on='idnproceeding', right_index=True)
    return df 

In [35]:
# last 10 by judge, for appeals 
appeals = add_last_n_decisions(data=appeals, ref=appeals, dimension='ij_code_grouped', 
                               new_feature_name='last_10_appeal_grant_by_judge', last_n=10, max_chunk=50000000)

Split 369 labels in ij_code_grouped dimension into 7 chunks...
Chunk 0 completed in 18.727699995 seconds
Chunk 1 completed in 22.1483209133 seconds
Chunk 2 completed in 20.451210022 seconds
Chunk 3 completed in 28.0813579559 seconds
Chunk 4 completed in 27.8097729683 seconds
Chunk 5 completed in 31.5132110119 seconds
Chunk 6 completed in 7.71881604195 seconds
DONE: Last 10 decisions computed for ij_code_grouped dimension in 156.451397896 seconds


In [36]:
# last 10 by judge, for non-appeals  
non_appeals = add_last_n_decisions(data=non_appeals, ref=appeals, dimension='ij_code_grouped', 
                                   new_feature_name='last_10_appeal_grant_by_judge', last_n=10, max_chunk=50000000)

Split 369 labels in ij_code_grouped dimension into 3 chunks...
Chunk 0 completed in 15.2609629631 seconds
Chunk 1 completed in 58.1031141281 seconds
Chunk 2 completed in 4.01359295845 seconds
DONE: Last 10 decisions computed for ij_code_grouped dimension in 77.3780281544 seconds


In [37]:
# last 10 by judge+nat, for appeals 
appeals = add_last_n_decisions(data=appeals, ref=appeals, dimension='ij_code_nat', 
                               new_feature_name='last_10_appeal_grant_by_judge_nat', last_n=10, max_chunk=50000000)

Split 19050 labels in ij_code_nat dimension into 2 chunks...
Chunk 0 completed in 32.949960947 seconds
Chunk 1 completed in 37.6315829754 seconds
DONE: Last 10 decisions computed for ij_code_nat dimension in 70.5818359852 seconds


In [38]:
# last 10 by judge+nat, for non-appeals 
non_appeals = add_last_n_decisions(data=non_appeals, ref=appeals, dimension='ij_code_nat', 
                                   new_feature_name='last_10_appeal_grant_by_judge_nat', last_n=10, max_chunk=50000000)

Split 15364 labels in ij_code_nat dimension into 1 chunks...
Chunk 0 completed in 29.716960907 seconds
DONE: Last 10 decisions computed for ij_code_nat dimension in 29.7170619965 seconds


# Output to CSV 

### Features to Retain 

In [39]:
## ID features 
id_features = ['idncase', 'idnproceeding', 'idnAppeal']

In [40]:
## Respondent features 
respondent_features = ['nat_grouped', 'lang_grouped']

# excluded: 'ij_code_nat', 'nat_string', 'lang' 

In [41]:
## Judge features 
judge_features = ['ij_code_grouped', 'Male_judge', 'Year_Appointed_SLR', 'Year_College_SLR', 'Year_Law_school_SLR', 
                  'Government_Years_SLR', 'Govt_nonINS_SLR', 'INS_Years_SLR', 'Military_Years_SLR', 'NGO_Years_SLR', 
                  'Privateprac_Years_SLR', 'Academia_Years_SLR', 'judge_missing_bio',
                  'years_since_judge_appointment', 'years_since_law_school', 
                  'last_10_appeal_grant_by_judge', 'last_10_appeal_grant_by_judge_nat']

# excluded: 'ij_code', 'judgeid', 'judge_name_caps' 

In [42]:
## Proceeding features 
proceeding_features = ['lawyer', 'defensive', 'affirmative', 'oral', 'written',
                       'case_type_string', 'original_dec_string']

# excluded: 'deportation_proceeding'-'exclusion_proceeding'-'removal_proceeding'-'asylum_only_proceeding'-
# 'withholding_only_proceeding' (all included in case_type_string) original_dec_type_string' (same as oral/written),
#  'deport', 'relief_granted', 'remove', 'terminated', 'voluntary_departure' (same as original_dec_string categorical)
# 'deport_form', 'voluntary_form' (not sure what these are), 'interpreter_code (doesn't vary)

In [43]:
## Appeal features
appeal_features = ['strCustody', 'strProbono']
# excluded 

In [44]:
## Location features 
location_features = ['base_city_code', 'hearing_loc_match_base'] 
# excluded: 'base_city_zip5' (same as base_city_code), 'base_city_state' (slightly less granular than base_city_code),
# 'venue_change' (rarely populated), 'hearing_loc_city' (too many), 'hearing_loc_code', 'hearing_loc_state'

In [45]:
## Time features 
time_features = ['datAppealFiled_year', 'datAppealFiled_year_month', 'comp_year', 'comp_year_month', 
                 'comp_days_elasped_since_input_date', 'input_days_elapsed_since_osc_date', 
                 'appeal_days_elapsed_since_comp_date']

# excluded 'appl_year', 'appl_recd_date', 'adj_time_start',  
# 'osc_year', 'osc_month', 'osc_day', 'osc_date',  input_year', 'input_month', 'input_day', 'input_date'
# 'comp_month', 'comp_day', 'datAppealFiled_year_month', 'datBIADecision_dt', 'datBIADecision_month'

In [46]:
# Features to keep 
features_to_keep = id_features + respondent_features + judge_features + proceeding_features\
    + appeal_features + location_features + time_features
print(features_to_keep)

['idncase', 'idnproceeding', 'idnAppeal', 'nat_grouped', 'lang_grouped', 'ij_code_grouped', 'Male_judge', 'Year_Appointed_SLR', 'Year_College_SLR', 'Year_Law_school_SLR', 'Government_Years_SLR', 'Govt_nonINS_SLR', 'INS_Years_SLR', 'Military_Years_SLR', 'NGO_Years_SLR', 'Privateprac_Years_SLR', 'Academia_Years_SLR', 'judge_missing_bio', 'years_since_judge_appointment', 'years_since_law_school', 'last_10_appeal_grant_by_judge', 'last_10_appeal_grant_by_judge_nat', 'lawyer', 'defensive', 'affirmative', 'oral', 'written', 'case_type_string', 'original_dec_string', 'strCustody', 'strProbono', 'base_city_code', 'hearing_loc_match_base', 'datAppealFiled_year', 'datAppealFiled_year_month', 'comp_year', 'comp_year_month', 'comp_days_elasped_since_input_date', 'input_days_elapsed_since_osc_date', 'appeal_days_elapsed_since_comp_date']


In [47]:
# output appeals dataset to csv 
appeals_fp = os.path.join(DATAFOLDER, 'data_for_model/appeals_data_final.csv') 
appeals_final = appeals[features_to_keep + ['granted']]
appeals_final.to_csv(appeals_fp, encoding='utf-8', index=False)
appeals_final.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242466 entries, 498640 to 702259
Data columns (total 41 columns):
idncase                                242466 non-null int64
idnproceeding                          242466 non-null int64
idnAppeal                              242466 non-null float64
nat_grouped                            242466 non-null object
lang_grouped                           242466 non-null object
ij_code_grouped                        242466 non-null object
Male_judge                             226481 non-null float64
Year_Appointed_SLR                     226481 non-null float64
Year_College_SLR                       226481 non-null float64
Year_Law_school_SLR                    226481 non-null float64
Government_Years_SLR                   226481 non-null float64
Govt_nonINS_SLR                        226481 non-null float64
INS_Years_SLR                          226481 non-null float64
Military_Years_SLR                     226481 non-null float64
NGO_Years

In [48]:
# output non-appeals dataset to csv 
non_appeals_fp = os.path.join(DATAFOLDER, 'data_for_model/non_appeals_data_final.csv') 
non_appeals_final = non_appeals[features_to_keep]
non_appeals_final.to_csv(non_appeals_fp, encoding='utf-8', index=False)
non_appeals_final.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128720 entries, 8 to 702267
Data columns (total 40 columns):
idncase                                128720 non-null int64
idnproceeding                          128720 non-null int64
idnAppeal                              0 non-null float64
nat_grouped                            128720 non-null object
lang_grouped                           128720 non-null object
ij_code_grouped                        128720 non-null object
Male_judge                             119610 non-null float64
Year_Appointed_SLR                     119610 non-null float64
Year_College_SLR                       119610 non-null float64
Year_Law_school_SLR                    119610 non-null float64
Government_Years_SLR                   119610 non-null float64
Govt_nonINS_SLR                        119610 non-null float64
INS_Years_SLR                          119610 non-null float64
Military_Years_SLR                     119610 non-null float64
NGO_Years_SLR      