In [262]:
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
import time
pd.set_option('display.max_rows', 200)
%matplotlib inline

# Import Data 

In [263]:
# define folder where data resides 
DATAFOLDER = "~/Documents/data-science-coursework/nyu-ml/project/"

### Appeals

In [264]:
# main table 
tblAppeal = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/tblAppeal.csv'), low_memory=False) 
print(tblAppeal.info())
tblAppeal.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870388 entries, 0 to 870387
Data columns (total 17 columns):
idnAppeal             870388 non-null int64
idncase               868758 non-null float64
idnProceeding         776380 non-null float64
strAppealCategory     870388 non-null object
strAppealType         870388 non-null object
datAppealFiled        870226 non-null object
strFiledBy            870378 non-null object
datAttorneyE27        521008 non-null object
datBIADecision        847196 non-null object
strBIADecision        847180 non-null object
strBIADecisionType    822882 non-null object
strCaseType           824855 non-null object
strLang               773811 non-null object
strNat                777004 non-null object
strProceedingIHP      367364 non-null object
strCustody            666356 non-null object
strProbono            1880 non-null object
dtypes: float64(2), int64(1), object(14)
memory usage: 112.9+ MB
None


Unnamed: 0,idnAppeal,idncase,idnProceeding,strAppealCategory,strAppealType,datAppealFiled,strFiledBy,datAttorneyE27,datBIADecision,strBIADecision,strBIADecisionType,strCaseType,strLang,strNat,strProceedingIHP,strCustody,strProbono
843508,5018477,6213470.0,4846677.0,IJ,Case Appeal,2013-05-23 00:00:00,A,,,,,RMV,SP,HO,,N,
691238,4864853,2581024.0,789100.0,IJ,MTR BIA,2009-05-04 00:00:00,A,2009-05-04 00:00:00,2009-06-24 00:00:00,DEN,R,DEP,SP,GT,,D,
263641,4422054,3716986.0,2078008.0,IJ,Case Appeal,1999-06-07 00:00:00,A,1999-06-07 00:00:00,2002-04-23 00:00:00,SAV,P,RMV,SP,GT,,N,


In [265]:
# drop appeals with no case number, proceeding number, or decision 
tblAppeal.dropna(subset=['idncase', 'idnProceeding', 'strBIADecision'], inplace=True) 
tblAppeal = tblAppeal[tblAppeal['idnProceeding'] != 0] # drop zeros 

# convert indexes to integers 
tblAppeal['idncase'] = tblAppeal['idncase'].astype(int) 
tblAppeal['idnProceeding'] = tblAppeal['idnProceeding'].astype(int) 
tblAppeal.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 755222 entries, 0 to 869457
Data columns (total 17 columns):
idnAppeal             755222 non-null int64
idncase               755222 non-null int64
idnProceeding         755222 non-null int64
strAppealCategory     755222 non-null object
strAppealType         755222 non-null object
datAppealFiled        755060 non-null object
strFiledBy            755212 non-null object
datAttorneyE27        478717 non-null object
datBIADecision        755222 non-null object
strBIADecision        755222 non-null object
strBIADecisionType    731103 non-null object
strCaseType           755180 non-null object
strLang               752329 non-null object
strNat                754061 non-null object
strProceedingIHP      365943 non-null object
strCustody            643730 non-null object
strProbono            1842 non-null object
dtypes: int64(3), object(14)
memory usage: 103.7+ MB


In [266]:
# code lookup tables 
bia_appeal_category = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'), 
                                    sheetname='BIA Appeal Category', skip_footer=7)
bia_appeal_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                sheetname='BIA Appeal Type', skip_footer=3)
bia_decision_type = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision type', skip_footer=2)
bia_decision_code = pd.read_excel(os.path.join(DATAFOLDER, 'data/raw/BIA Appeal Data File code translations.xlsx'),
                                  sheetname='BIA decision code', skip_footer=2)

In [267]:
# join them 
tblAppeal_df = tblAppeal.merge(bia_appeal_category, how='left', left_on='strAppealCategory', right_on='Code')\
                        .rename(columns={'Description': 'strAppealCategoryDesc'}).drop('Code', axis=1)\
                        .merge(bia_appeal_type, how='left', left_on='strAppealType', right_on='Code')\
                        .rename(columns={'Description': 'strAppealTypeDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_code, how='left', left_on='strBIADecision', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionDesc'}).drop('Code', axis=1)\
                        .merge(bia_decision_type, how='left', left_on='strBIADecisionType', right_on='Code')\
                        .rename(columns={'Description': 'strBIADecisionTypeDesc'}).drop('Code', axis=1)
print(tblAppeal_df.info()) 
tblAppeal_df.sample(10).T

<class 'pandas.core.frame.DataFrame'>
Int64Index: 755222 entries, 0 to 755221
Data columns (total 21 columns):
idnAppeal                 755222 non-null int64
idncase                   755222 non-null int64
idnProceeding             755222 non-null int64
strAppealCategory         755222 non-null object
strAppealType             755222 non-null object
datAppealFiled            755060 non-null object
strFiledBy                755212 non-null object
datAttorneyE27            478717 non-null object
datBIADecision            755222 non-null object
strBIADecision            755222 non-null object
strBIADecisionType        731103 non-null object
strCaseType               755180 non-null object
strLang                   752329 non-null object
strNat                    754061 non-null object
strProceedingIHP          365943 non-null object
strCustody                643730 non-null object
strProbono                1842 non-null object
strAppealCategoryDesc     752952 non-null object
strAppealTyp

Unnamed: 0,81920,115830,150286,356147,589261,669562,205905,129893,584013,185872
idnAppeal,82052,116099,171914,4559209,4820369,4915148,4387641,150831,4814526,4364449
idncase,4346399,2198332,3402061,2988044,5812896,3909147,3399395,2086684,3815569,2712199
idnProceeding,2899168,302892,1705913,1321390,4311119,2316128,1702191,159911,2197267,968869
strAppealCategory,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ
strAppealType,Case Appeal,MTR BIA,Case Appeal,MTR BIA,Case Appeal,Case Appeal,Appeal of IJ MTR,Case Appeal,Case Appeal,Case Appeal
datAppealFiled,1995-04-05 00:00:00,1991-08-30 00:00:00,1996-07-03 00:00:00,2002-12-23 00:00:00,2008-03-19 00:00:00,2010-08-02 00:00:00,1998-06-15 00:00:00,1996-08-08 00:00:00,2008-01-28 00:00:00,1997-10-23 00:00:00
strFiledBy,A,A,A,A,A,A,A,A,I,A
datAttorneyE27,1995-07-10 00:00:00,,,2002-12-23 00:00:00,2008-03-19 00:00:00,2010-08-02 00:00:00,1998-06-15 00:00:00,1996-08-08 00:00:00,,1997-10-23 00:00:00
datBIADecision,2000-12-13 00:00:00,1994-10-12 00:00:00,1997-08-21 00:00:00,2003-01-28 00:00:00,2008-05-27 00:00:00,2012-01-11 00:00:00,2000-02-14 00:00:00,1997-09-11 00:00:00,2010-02-22 00:00:00,2000-07-26 00:00:00
strBIADecision,DIS,GRN,DIS,DEN,SAF,REM,DIS,DSO,BCR,REM


In [268]:
# some strBIADecision don't have corresponding code translations; to remove 
print(tblAppeal_df[pd.isnull(tblAppeal_df['strBIADecisionDesc'])]['strBIADecision'].value_counts()) 
tblAppeal_df.dropna(subset=['strBIADecisionDesc'], inplace=True)

DSO    5987
DED    2107
CPG    1704
ABC    1606
APD    1170
GRS     431
DNS     421
ADM     355
DMO     238
CPC      90
MB       88
SNC      82
AFD      54
RET      52
ADD      18
CGR      16
TPD      16
REV      13
WPD       5
SUP       4
          2
Name: strBIADecision, dtype: int64


### Master Proceedings

In [269]:
master = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/master.csv')) 
print(master.info())
master.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6084437 entries, 0 to 6084436
Data columns (total 16 columns):
idncase             float64
nat                 object
case_type           object
c_asy_type          object
idnproceeding       object
base_city_code      object
hearing_loc_code    object
dec_type            object
dec_code            object
other_comp          object
osc_date            object
input_date          object
comp_date           object
attorney_flag       float64
ij_code             object
tracid              float64
dtypes: float64(3), object(13)
memory usage: 742.7+ MB
None


Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
5459507,6726740.0,NS,RMV,E,5475424,WAS,WAD,O,X,,07MAR2011,01APR2011,16FEB2012,,RHC,313.0
4332256,5793791.0,MX,RMV,,4287520,HOD,HUN,W,X,,17FEB2007,28JUN2007,30OCT2007,,HR,288.0
2358215,3890454.0,MX,RMV,,2288396,LOS,SPD,7,X,,06AUG2003,08AUG2003,13AUG2003,,RCP,209.0
1063490,2858389.0,GT,DEP,,1160140,FLO,FLO,O,D,,13JAN1993,22JAN1993,10FEB1993,,JWR,177.0
2205798,3769264.0,HO,RMV,,2140926,ELP,EPD,7,X,,26OCT1997,03NOV1997,12NOV1997,,BAZ,184.0


In [270]:
# check missing data 
master.isnull().sum()

idncase                  14
nat                   16775
case_type                15
c_asy_type          4449366
idnproceeding             0
base_city_code           24
hearing_loc_code       1516
dec_type            1519549
dec_code            1674748
other_comp          4744870
osc_date              33353
input_date            29744
comp_date            335076
attorney_flag       3044591
ij_code               39598
tracid               341914
dtype: int64

In [271]:
# look up tables 
master_case_type = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/master_case_type.csv'))
master_decision_type = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/master_decision_type.csv'))

master_decision_on_proceeding = pd.read_csv(os.path.join(DATAFOLDER, 'data/raw/master_decision_on_proceeding.csv'))
master_decision_on_proceeding.drop(master_decision_on_proceeding.tail(2).index, inplace=True)
master_decision_on_proceeding.drop(['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'], axis=1, inplace=True)

In [272]:
# add column to indicate whether decision was listed in dec_code or other_comp 
master['dec_col'] = np.where(pd.notnull(master['dec_code']), 'C', 'O') 
master['dec_judge'] = np.where(pd.notnull(master['dec_code']), master['dec_code'], master['other_comp']) 

In [273]:
# join to look up tables
master_df = master.merge(master_decision_type, how='left', left_on='dec_type', right_on='strCode')\
                  .rename(columns={'strDescription': 'dec_type_str'}).drop('strCode', axis=1)\
                  .merge(master_case_type, how='left', left_on='case_type', right_on='strCode')\
                  .rename(columns={'strDescription': 'case_type_str'}).drop('strCode', axis=1)\
                  .merge(master_decision_on_proceeding, how='left', left_on=['case_type', 'dec_judge', 'dec_col'], 
                         right_on=['strCaseType', 'strDecCode', 'strDecType'])\
                  .rename(columns={'strDecDescription': 'dec_code_str'}).drop(['strCaseType', 'strDecCode', 'strDecType'], axis=1)
master_df.sample(3)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,...,input_date,comp_date,attorney_flag,ij_code,tracid,dec_col,dec_judge,dec_type_str,case_type_str,dec_code_str
1594662,3277111.0,MX,RMV,,62314,ELO,EAZ,O,X,,...,16JUL1997,24JUL1997,,SHK,48.0,C,X,oral decision,Removal,Remove
1442569,3156832.0,ES,DEP,E,1526067,WAS,WAS,,,A,...,29APR1999,24FEB2000,1.0,JMB,1.0,O,A,,Deportation,Administrative Closing - Other
2687926,4123392.0,CO,RMV,I,2605607,ORL,ORL,O,X,,...,19JUL2001,03JUL2003,,ROS,170.0,C,X,oral decision,Removal,Remove


### Master Proceedings (processed by Sagent/Dunn)

In [274]:
master_dunn = pd.read_csv(os.path.join(DATAFOLDER, 
                                       'data/AsylumAdj/data_for_model/_decision_scheduling_merge_final_converted.csv'), 
                          encoding='latin-1', low_memory=False) # gets UnicodeDecodeError otherwise 
master_dunn['in_dunn'] = 1 
master_dunn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602500 entries, 0 to 602499
Columns: 183 entries, Unnamed: 0 to in_dunn
dtypes: bool(1), float64(72), int64(40), object(70)
memory usage: 837.2+ MB


# Merge Datasets (master, dunn, judge bio)

In [275]:
judge_bio = pd.read_csv(os.path.join(DATAFOLDER, 'data/AsylumAdj/data/cleaned_judge_bios.csv')) 
judge_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Male_judge               367 non-null float64
Year_Appointed_SLR       367 non-null float64
Year_College_SLR         367 non-null float64
Year_Law_school_SLR      367 non-null float64
Government_Years_SLR     367 non-null float64
Govt_nonINS_SLR          367 non-null float64
INS_Years_SLR            367 non-null float64
Military_Years_SLR       367 non-null float64
NGO_Years_SLR            367 non-null float64
Privateprac_Years_SLR    367 non-null float64
Academia_Years_SLR       367 non-null float64
ij_code                  367 non-null object
dtypes: float64(11), object(1)
memory usage: 34.5+ KB


In [389]:
df = master_dunn.merge(tblAppeal_df, how='left', 
                       left_on=['idncase', 'idnproceeding'], right_on=['idncase', 'idnProceeding'])
df = df.merge(judge_bio, how='left', on='ij_code')

In [390]:
# add/reformat a few variables 
df['judge_missing_bio'] = np.where(pd.isnull(df['Male_judge']), 1, 0)
df['ij_code_nat'] = df['ij_code'] + '_' + df['nat_string']

df['datAppealFiled_dt'] = pd.to_datetime(df['datAppealFiled'], errors='coerce')
df['datAppealFiled_year'] = df['datAppealFiled_dt'].dt.year
df['datAppealFiled_month'] = df['datAppealFiled_dt'].dt.month 
df['datAppealFiled_year_month'] = (df['datAppealFiled_year'] - 1970) + df['datAppealFiled_month']

df['datBIADecision_dt'] = pd.to_datetime(df['datBIADecision'], errors='coerce')
df['datBIADecision_year'] = df['datBIADecision_dt'].dt.year
df['datBIADecision_month'] = df['datBIADecision_dt'].dt.month 
df['datBIADecision_year_month'] = (df['datBIADecision_year'] - 1970) + df['datBIADecision_month']

In [391]:
print(df.columns.tolist())

['Unnamed: 0', u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', u'dec_type_string', u'_mdectype', u'outcome_recorded_in_field', u'dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'appl_dec', u'_mappln', u'application_type_string', u'_mapplcode', u'application_dec_string', u'_mappldec', u'cityid', u'judgeid', u'natid', u'comp_year', u'comp_month', u'comp_day', u'comp_date', u'osc_year', u'osc_month'

# Determine scope of appeal case types

In [392]:
df['appealed'] = np.where(pd.notnull(df['idnAppeal']), 1, 0) 
df['appealed'].value_counts() 

1    369933
0    332335
Name: appealed, dtype: int64

In [393]:
# check appeal and case types 
df.groupby(['strAppealTypeDesc', 'case_type_string']).size().unstack().fillna(0) 

case_type_string,ASYLUM ONLY CASE,DEPORTATION,EXCLUSION,REMOVAL,WITHHOLDING ONLY
strAppealTypeDesc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Appeal of IJ MTR,93.0,2653.0,478.0,3026.0,9.0
Bond Appeal,10.0,569.0,7.0,2388.0,6.0
Bond MTR Reconsider,0.0,0.0,0.0,12.0,0.0
Bond MTR Reinstate,0.0,0.0,0.0,4.0,0.0
Bond MTR Reopen,0.0,0.0,0.0,10.0,0.0
Case Appeal,3878.0,66912.0,18341.0,173865.0,842.0
Circuit Court Remand,186.0,604.0,145.0,7058.0,26.0
Interlocutory Appeal,7.0,68.0,41.0,486.0,7.0
MTR BIA,1231.0,21833.0,5739.0,59309.0,87.0


In [394]:
# filter for relevant appeal types 
selected_appeal_types = ['Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA'] 
df['appealed'] = np.where(df['strAppealTypeDesc'].isin(selected_appeal_types), df['appealed'], 0) 
df['appealed'].value_counts() 

1    366927
0    335341
Name: appealed, dtype: int64

In [395]:
# exclude government appeals (defined as original proceeding not granted)
df.rename(columns={'grant': 'original_granted'}, inplace=True)
df['appealed'] = np.where(df['original_granted'] == 0, df['appealed'], 0) 
df['appealed'].value_counts() 

1    355238
0    347030
Name: appealed, dtype: int64

In [396]:
# de-duplicate multiple appeals (each case-proceeding should be unique) by retaining the last appeal 
df = df.sort_values(by=['idncase', 'idnProceeding', 'datBIADecision_dt'], 
                    ascending=[True, True, False])
df.drop_duplicates(subset=['idncase', 'idnProceeding'], keep='first', inplace=True)
df['appealed'].value_counts() 

0    340595
1    258782
Name: appealed, dtype: int64

In [397]:
# determine appeal decision type 
positive_labels = ['Background Check Remand', 'Grant With No Remand', 'Granted', 'Remand', 
                   'Sustain', 'Temporary Protected Status', 'Termination']
negative_labels = ['Denied', "Dismiss Appeal/Affirm IJ's Decision", 'Dismissed (Grant V/D 30 days)', 
                   'Dismissed (Voluntary Departure Granted)', 'Rejection', 'SUMMARY AFFIRMANCE/VD', 
                   'Summary Affirmance', 'Summary Dismiss', 'Summary Dismissal (O) Other', 
                   'Summary Dismissal (a) inad reason on appeal', 'Summary Dismissal - Both (a) & (e)']

df['appeal_decision'] = np.where(df['appealed'] == 0, 'did_not_appeal', 
                                 np.where(df['strBIADecisionDesc'].isin(positive_labels), 'appeal_successful', 
                                          np.where(df['strBIADecisionDesc'].isin(negative_labels), 'appeal_denied', 
                                                   'appeal_neutral')))
df['appeal_decision'].value_counts()

did_not_appeal       340595
appeal_denied        163829
appeal_successful     78482
appeal_neutral        16471
Name: appeal_decision, dtype: int64

In [398]:
# exclude appeals with neutral decision outcomes 
df = df[df['appeal_decision'] != 'appeal_neutral'].copy()  
df['appealed'].value_counts() 

0    340595
1    242311
Name: appealed, dtype: int64

# Split into Appeals vs. Non-Appeals

In [399]:
# split into those who appealed vs. those who didn't 
appeals = df[df['appealed'] == 1].copy() 
non_appeals = df[df['appealed'] == 0].copy() 

In [400]:
# for those that appealed, drop those without mandatory features, and assign binary response label 
mandatory_features = ['idnAppeal', 'datAppealFiled_year', 'case_type_string'] 
appeals.dropna(subset=mandatory_features, inplace=True)
appeals['granted'] = np.where(appeals['appeal_decision'] == 'appeal_successful', 1, 0) 
appeals['granted'].astype(int)
print("{} of appeal were successful".format(float(appeals['granted'].sum()) / len(appeals)))

0.323922034822 of appeal were successful


# Additional Feature Engineering

#### Group nationalities and judges with few samples  

In [401]:
def get_feature_values_to_retain(df, feature_name, min_samples): 
    """ Returns a list of feature values that meet min_samples """
    distinct_values = df[feature_name].value_counts() 
    retain_values = distinct_values[distinct_values >= min_samples].index.tolist() 
    print("{} distinct values of {} will be retained as unique values, remaining {} will be grouped as other.".format(
        len(retain_values), feature_name, len(distinct_values) - len(retain_values)))
    return retain_values 

In [402]:
# apply to judges 
ij_code_to_retain = get_feature_values_to_retain(appeals, feature_name='ij_code', min_samples=50)
appeals['ij_code_grouped'] = np.where(appeals['ij_code'].isin(ij_code_to_retain), appeals['ij_code'], 'other')
non_appeals['ij_code_grouped'] = np.where(non_appeals['ij_code'].isin(ij_code_to_retain), non_appeals['ij_code'], 'other')

368 distinct values of ij_code will be retained as unique values, remaining 53 will be grouped as other.


In [403]:
# apply to nationalities 
nat_string_to_retain = get_feature_values_to_retain(appeals, feature_name='nat_string', min_samples=50)
appeals['nat_grouped'] = np.where(appeals['nat_string'].isin(ij_code_to_retain), appeals['nat_string'], 'other')
non_appeals['nat_grouped'] = np.where(non_appeals['nat_string'].isin(ij_code_to_retain), non_appeals['nat_string'], 'other')

128 distinct values of nat_string will be retained as unique values, remaining 89 will be grouped as other.


#### Judge Experience 

As proxied by two variables:  
- years_since_appointed = YEAR(Original proceeding decision) - YEAR(Judge Appointment) 
- years_since_law_school = YEAR(Original proceeding decision) - YEAR(Law School) 

In [404]:
def get_time_delta(df, feature_year_before, feature_year_after, default_value=-1): 
    """ Computes difference between feature_year_before and feature_year_after, 
        filling NaNs and negative values with -1 """
    time_delta = df[feature_year_after] - df[feature_year_before]
    time_delta = np.where((time_delta < 0) | pd.isnull(time_delta), default_value, time_delta)
    return time_delta

In [405]:
# years since judge appointment 
appeals['years_since_judge_appointment'] = get_time_delta(appeals, 'Year_Appointed_SLR', 'comp_year')
non_appeals['years_since_judge_appointment'] = get_time_delta(non_appeals, 'Year_Appointed_SLR', 'comp_year')

In [406]:
# years since law school 
appeals['years_since_law_school'] = get_time_delta(appeals, 'Year_Law_school_SLR', 'comp_year')
non_appeals['years_since_law_school'] = get_time_delta(non_appeals, 'Year_Law_school_SLR', 'comp_year')

#### Average Appeal Grant Rate from Preceeding N Appeals

In [427]:
def break_into_chunks(data, dimension, max_chunk): 
    """ Returns a dictionary of lists to instruct breaking up dataset into suitable chunks, 
        where resulting rows from self-join on ij_code does not exceed max_df_rows """
    dimensions = pd.DataFrame(data.groupby(dimension).size().sort_values(ascending=False)) 
    dimensions = dimensions.rename(columns={0: 'rows'}).reset_index() 
    dimensions['self_join'] = dimensions['rows'] ** 2
    
    # stop if dimension has too many rows exceeding max_chunk 
    exceeds_max_chunk = dimensions[dimensions['self_join'] > max_chunk]
    if len(exceeds_max_chunk) > 0: 
        print(exceeds_max_chunk)
        raise ValueError('Dimension has too many rows!') 
    else: pass 
    
    dimensions['self_join_cumulative'] = dimensions['self_join'].cumsum() 
    dimensions['chunk'] = np.floor(dimensions['self_join_cumulative'] / max_chunk).astype(int)
    chunk_assignments = dimensions.groupby('chunk')[dimension].apply(list).to_dict()
    print("Split {} labels in {} dimension into {} chunks...".format(len(dimensions), dimension, 
                                                                                  len(chunk_assignments)))
    return chunk_assignments

def compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n): 
    """ Run compute for a given chunk of data """
    df = data_chunk.merge(ref_chunk, how='left', on=dimension)
    results = df[df['datBIADecision_dt'] < df['datAppealFiled_dt']].groupby('idnAppeal').apply(
        lambda f: f.head(last_n)['granted'].mean()) 
    return results 

def compute_last_n_decisions(data, dimension, new_feature_name, max_chunk=50000000, last_n=10): 
    """ Unified method to compute last n decisions """
    
    # get chunk assignments 
    chunk_assignments = break_into_chunks(data, dimension, max_chunk)
    
    # initialize empty list 
    results = [] 
    start = time.time() 
    
    # loop through each chunk 
    for chunk, selected in chunk_assignments.iteritems(): 
        start_chunk = time.time() 
        data_variables = ['idnAppeal', 'datAppealFiled_dt'] + [dimension]
        ref_variables = ['datBIADecision_dt', 'granted'] + [dimension] 
        data_chunk = data[data[dimension].isin(selected)][data_variables]
        ref_chunk = data[data[dimension].isin(selected)][ref_variables].sort_values(
            by=[dimension] + ['datBIADecision_dt'], ascending=[True, False])  
        result = compute_last_n_decisions_by_chunk(data_chunk, ref_chunk, dimension, last_n)
        results.append(result)
        print("Chunk {} completed in {} seconds".format(chunk, time.time() - start_chunk))
        
    print("DONE: Last {} decisions computed for {} dimension in {} seconds".format(last_n, dimension, 
                                                                             time.time() - start))
    
    return pd.DataFrame(pd.concat(results), columns=[new_feature_name])  

In [408]:
appeals_subset = appeals[['idnAppeal', 'datAppealFiled_dt', 'datBIADecision_dt', 'granted', 
                          'ij_code', 'nat_string', 'ij_code_nat']].copy()

In [409]:
last_10_appeal_grant_by_judge = compute_last_n_decisions(appeals_subset, 'ij_code', 'last_10_appeal_grant_by_judge', 
                                                         max_chunk=50000000, last_n=10)
appeals = appeals.merge(last_10_appeal_grant_by_judge, how='left', left_on='idnAppeal', right_index=True)

Split 421 labels in ij_code dimension into 7 chunks...
Chunk 0 completed in 12.7054569721 seconds
Chunk 1 completed in 16.6242520809 seconds
Chunk 2 completed in 20.1490130424 seconds
Chunk 3 completed in 18.7814161777 seconds
Chunk 4 completed in 19.2745049 seconds
Chunk 5 completed in 27.7283520699 seconds
Chunk 6 completed in 7.37954998016 seconds
DONE: Last 10 decisions computed for ij_code dimension in 122.643368006 seconds


In [429]:
last_10_appeal_grant_by_judge_nat = compute_last_n_decisions(appeals_subset, 'ij_code_nat', 
                                                             'last_10_appeal_grant_by_judge_nat', 
                                                             max_chunk=50000000, last_n=10)
appeals = appeals.merge(last_10_appeal_grant_by_judge_nat, how='left', left_on='idnAppeal', right_index=True)

Split 19026 labels in ij_code_nat dimension into 2 chunks...
Chunk 0 completed in 27.8502919674 seconds
Chunk 1 completed in 36.710722208 seconds
DONE: Last 10 decisions computed for ij_code_nat dimension in 64.5612590313 seconds


# Output to CSV 

In [431]:
print(appeals.columns.tolist())

['Unnamed: 0', u'idncase', u'idnproceeding', u'nat', u'case_type', u'c_asy_type', u'base_city_code', u'hearing_loc_code', u'dec_type', u'dec_code', u'other_comp', u'attorney_flag', u'ij_code', u'tracid', u'case_type_string', u'_mcase', u'dec_type_string', u'_mdectype', u'outcome_recorded_in_field', u'dec_string', u'_mdecproceeddec', u'_mdecproceedoth', u'nat_string', u'_mnat', u'base_city_street', u'base_city_string', u'base_city_state', u'base_city_zip5', u'base_city_zip4', u'base_city_phone', u'_mbasecity', u'hearing_loc_string1', u'hearing_loc_string2', u'hearing_loc_street', u'hearing_loc_city', u'hearing_loc_state', u'hearing_loc_zip5', u'hearing_loc_phone', u'_mhearingloc', u'judge_name_caps', u'_mlookupijcode', u'idncode', u'idnproceedingappln', u'appl_code', u'appl_dec', u'_mappln', u'application_type_string', u'_mapplcode', u'application_dec_string', u'_mappldec', u'cityid', u'judgeid', u'natid', u'comp_year', u'comp_month', u'comp_day', u'comp_date', u'osc_year', u'osc_month'

In [432]:
# subset variables to keep 
variables_to_keep = ['idncase', 'idnproceeding', 'idnAppeal', 'granted', 'nat_string', 'ij_code', 'attorney_flag',
                     'lawyer', 'defensive', 'affirmative', 'oral', 'written', 'base_city_code', 'base_city_state', 
                     'hearing_loc_city', 'hearing_loc_code', 'hearing_loc_state', 'venue_change', 
                     'comp_year', 'comp_month', 'comp_day', 
                     'osc_year', 'osc_month', 'osc_day', 'input_year', 'input_month', 'input_day',
                     'adj_time_start', 'case_type_string', 'dec_type_string', 'dec_string',
                     'deport', 'relief_granted', 'remove', 'terminated', 'voluntary_departure', 
                     'deport_form', 'voluntary_form', 'deportation_proceeding', 
                     'exclusion_proceeding', 'removal_proceeding', 'asylum_only_proceeding', 
                     'withholding_only_proceeding', 'strCustody', 'strProbono', 
                     'datAppealFiled_year', 'datAppealFiled_month', 'datAppealFiled_year_month', 
                     'datBIADecision_dt', 'datBIADecision_year', 'datBIADecision_month', 
                     'datBIADecision_year_month', 'ij_code_grouped', 'nat_grouped', 
                     'years_since_judge_appointment', 'years_since_law_school', 
                     'last_10_appeal_grant_by_judge', 'last_10_appeal_grant_by_judge_nat'] 

In [433]:
# output to csv 
appeals_fp = os.path.join(DATAFOLDER, 'data_for_model/appeals_data_2018-05-13-v2.csv') 
appeals[variables_to_keep + ['granted']].to_csv(appeals_fp, encoding='utf-8', index=False)
appeals.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242262 entries, 498640 to 702259
Columns: 233 entries, Unnamed: 0 to last_10_appeal_grant_by_judge_nat
dtypes: bool(1), datetime64[ns](2), float64(95), int64(43), object(92)
memory usage: 430.9+ MB


In [434]:
# MAYBE TODO: output non-appeals for prediction as well 

# Ignore Below

In [435]:
# helper methods for Bayesian adjustment 

# def calibrate_beta_priors(prior_mean, effective_sample_size): 
#     """
#     Takes aggregate rate and return Beta priors (alpha, beta) with prior mean approximating aggregate rate
#     with respect to effective sample size chosen 
#     """
    
#     rounded_rate = np.round(prior_mean, 1)
#     alpha = int(rounded_rate * effective_sample_size) 
#     beta = effective_sample_size - alpha 
    
#     return alpha, beta

# def compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total): 
#     """ 
#     Takes Beta priors (alpha, beta) along with observed data (num_total, num_positives) 
#     and returns posterior mean 
#     """
    
#     updated_alpha = alpha_prior + num_positives 
#     updated_beta = beta_prior + num_total - num_positives 
    
#     posterior_mean = float(updated_alpha) / (updated_alpha + updated_beta)
    
#     return posterior_mean 

# def compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total): 
#     """ 
#     Takes Beta priors (alpha, beta) along with observed data (num_total, num_positives) 
#     and returns posterior mean 
#     """
    
#     updated_alpha = alpha_prior + num_positives 
#     updated_beta = beta_prior + num_total - num_positives 
    
#     posterior_mean = float(updated_alpha) / (updated_alpha + updated_beta)
    
#     return posterior_mean

# def get_beta_adj_rate(prior_mean, num_positives, num_total, prior_effective_size=10): 
#     """ 
#     Takes aggregate mean as a float (from 0 to 1), num_total (integer), and num_positives (integer) 
#     and return the 'Beta-adjusted' rate. 
#     Example: if in total 30% of Chinese nationality cases were granted, and a specific judge saw 20 cases 
#     and granted 14 of them, input aggregate_mean=0.3, num_total=20, and num_positives=14 
#     """
    
#     if type(prior_mean) is not float: 
#         raise ValueError("Please enter a float for prior mean!")
        
#     if prior_mean < 0 or prior_mean > 1: 
#         raise ValueError("Prior mean must be between 0 and 1!")
            
#     alpha_prior, beta_prior = calibrate_beta_priors(prior_mean, prior_effective_size)
#     posterior_mean = compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total)
    
#     return posterior_mean

In [436]:
# get_beta_adj_rate(prior_mean=.3, num_positives=14, num_total=20, prior_effective_size=0)

In [437]:
# def last_year_grant_rate_by_feature(data, feature, last_year_grant_rate, prior_effective_size=10):
#     df = data.groupby([feature, 'datBIADecision_dt_year', 'granted'])\
#              .size().unstack().fillna(0).reset_index()\
#              .rename(columns={0:'not_granted', 1:'granted'})
#     df = df.merge(last_year_grant_rate, how='left', left_on='datBIADecision_dt_year', right_index=True)\
#            .rename(columns={'datBIADecision_dt_year': 'datBIADecision_dt_year_'}) 
#     new_feature_name = 'last_year_adj_grant_rate_by_{}'.format(feature)
#     df[new_feature_name] = df.apply(
#         lambda r: get_beta_adj_rate(r['last_year_grant_rate'], r['granted'], 
#                                     r['granted'] + r['not_granted'], prior_effective_size), axis=1)
#     results = data.merge(df[['datBIADecision_dt_year_', feature, new_feature_name]], how='left', 
#                          left_on = ['datAppealFiled_dt_year_before', feature],
#                          right_on = ['datBIADecision_dt_year_', feature])\
#                   .drop('datBIADecision_dt_year_', axis=1)  
                    
#     return results 

In [438]:
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'ij_code_grouped', last_year_grant_rate,
#                                                          prior_effective_size=10)

In [439]:
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'nat_string_grouped', last_year_grant_rate,
#                                                          prior_effective_size=10)

In [203]:
# appeals_with_last_year['ij_code_nat'] = appeals_with_last_year['ij_code_grouped'] + '_' + appeals_with_last_year['nat_string_grouped']
# appeals_with_last_year = last_year_grant_rate_by_feature(appeals_with_last_year, 'ij_code_nat', last_year_grant_rate,
#                                                          prior_effective_size=10)