In [1]:
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
pd.set_option('display.max_rows', 200)
%matplotlib inline

# Import Data 

In [2]:
# define folder where data resides
# The DATAFOLDER environment variable, when set, overrides the default below so the
# notebook can be run on another machine without editing this cell.
DATAFOLDER = os.environ.get(
    'DATAFOLDER', "/Users/snuffles753/Documents/NYU-GSAS/ds1003/term-project/data")

### Appeals

In [3]:
# main table: raw BIA appeals, one row per appeal
appeal_csv_path = os.path.join(DATAFOLDER, 'raw/tblAppeal.csv')
tblAppeal = pd.read_csv(appeal_csv_path, low_memory=False)

# schema / non-null counts, followed by a random 5-row preview
print(tblAppeal.info())
tblAppeal.sample(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870388 entries, 0 to 870387
Data columns (total 17 columns):
idnAppeal             870388 non-null int64
idncase               868758 non-null float64
idnProceeding         776380 non-null float64
strAppealCategory     870388 non-null object
strAppealType         870388 non-null object
datAppealFiled        870226 non-null object
strFiledBy            870378 non-null object
datAttorneyE27        521008 non-null object
datBIADecision        847196 non-null object
strBIADecision        847180 non-null object
strBIADecisionType    822882 non-null object
strCaseType           824855 non-null object
strLang               773811 non-null object
strNat                777004 non-null object
strProceedingIHP      367364 non-null object
strCustody            666356 non-null object
strProbono            1880 non-null object
dtypes: float64(2), int64(1), object(14)
memory usage: 112.9+ MB
None


Unnamed: 0,idnAppeal,idncase,idnProceeding,strAppealCategory,strAppealType,datAppealFiled,strFiledBy,datAttorneyE27,datBIADecision,strBIADecision,strBIADecisionType,strCaseType,strLang,strNat,strProceedingIHP,strCustody,strProbono
755592,4929780,6619491.0,5329854.0,IJ,Case Appeal,2010-12-20 00:00:00,A,,2011-04-07 00:00:00,DIS,R,RMV,ENG,TD,,D,
633550,4806524,5446775.0,3849450.0,IJ,Case Appeal,2007-11-09 00:00:00,A,2007-11-09 00:00:00,2008-10-28 00:00:00,DIS,R,RMV,ENG,GY,,N,
851686,5026732,6998155.0,5918959.0,IJ,Case Appeal,2013-08-14 00:00:00,A,2014-01-24 00:00:00,,,,RMV,SP,MX,,R,
376757,4541621,3684725.0,2038727.0,IJ,Case Appeal,2002-08-16 00:00:00,A,2002-08-23 00:00:00,2004-04-01 00:00:00,SAV,P,RMV,SP,MX,,N,
300827,4461804,2070088.0,130773.0,IJ,Case Appeal,2000-06-20 00:00:00,A,2000-06-20 00:00:00,2001-08-30 00:00:00,REM,P,RMV,SP,DR,,N,


In [4]:
# keep only appeals that have a case number, a proceeding number and a decision,
# and whose proceeding number is not the placeholder 0
required_cols = ['idncase', 'idnProceeding', 'strBIADecision']
tblAppeal = tblAppeal.dropna(subset=required_cols)
tblAppeal = tblAppeal[tblAppeal['idnProceeding'] != 0]

# the id columns were loaded as float64 (see info() above); cast them to integers
for id_col in ['idncase', 'idnProceeding']:
    tblAppeal[id_col] = tblAppeal[id_col].astype(int)
tblAppeal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 755222 entries, 0 to 869457
Data columns (total 17 columns):
idnAppeal             755222 non-null int64
idncase               755222 non-null int64
idnProceeding         755222 non-null int64
strAppealCategory     755222 non-null object
strAppealType         755222 non-null object
datAppealFiled        755060 non-null object
strFiledBy            755212 non-null object
datAttorneyE27        478717 non-null object
datBIADecision        755222 non-null object
strBIADecision        755222 non-null object
strBIADecisionType    731103 non-null object
strCaseType           755180 non-null object
strLang               752329 non-null object
strNat                754061 non-null object
strProceedingIHP      365943 non-null object
strCustody            643730 non-null object
strProbono            1842 non-null object
dtypes: int64(3), object(14)
memory usage: 103.7+ MB


In [5]:
# parse the date strings into datetime columns named '<column>_dt';
# errors='coerce' turns unparseable values into NaT instead of raising
for raw_date_col in ['datAppealFiled', 'datBIADecision']:
    tblAppeal[raw_date_col + '_dt'] = pd.to_datetime(tblAppeal[raw_date_col], errors='coerce')

# calendar year in which the appeal was filed
tblAppeal['datAppealFiled_year'] = tblAppeal['datAppealFiled_dt'].dt.year

In [6]:
# code lookup tables
# All four lookups are sheets of the same workbook; the trailing rows of each sheet are
# skipped via `skipfooter`. The older `skip_footer` spelling was deprecated and then removed
# from pandas (1.0), and `skipfooter` matches the read_csv call used for
# master_decision_on_proceeding further down.
bia_codes_xlsx = os.path.join(DATAFOLDER, 'raw/BIA Appeal Data File code translations.xlsx')
bia_appeal_category = pd.read_excel(bia_codes_xlsx, sheet_name='BIA Appeal Category', skipfooter=7)
bia_appeal_type = pd.read_excel(bia_codes_xlsx, sheet_name='BIA Appeal Type', skipfooter=3)
bia_decision_type = pd.read_excel(bia_codes_xlsx, sheet_name='BIA decision type', skipfooter=2)
bia_decision_code = pd.read_excel(bia_codes_xlsx, sheet_name='BIA decision code', skipfooter=2)

In [7]:
# join them
# Each lookup is left-joined on its 'Code' column; its 'Description' column is kept under a
# '<key>Desc'-style name and the redundant 'Code' column is dropped.
# Order matters only for the resulting column order.
appeal_lookups = [
    (bia_appeal_category, 'strAppealCategory', 'strAppealCategoryDesc'),
    (bia_appeal_type, 'strAppealType', 'strAppealTypeDesc'),
    (bia_decision_code, 'strBIADecision', 'strBIADecisionDesc'),
    (bia_decision_type, 'strBIADecisionType', 'strBIADecisionTypeDesc'),
]
tblAppeal_df = tblAppeal
for lookup_table, code_col, desc_col in appeal_lookups:
    tblAppeal_df = (tblAppeal_df
                    .merge(lookup_table, how='left', left_on=code_col, right_on='Code')
                    .rename(columns={'Description': desc_col})
                    .drop('Code', axis=1))

print(tblAppeal_df.info())
tblAppeal_df.sample(n=10).T

<class 'pandas.core.frame.DataFrame'>
Int64Index: 755222 entries, 0 to 755221
Data columns (total 24 columns):
idnAppeal                 755222 non-null int64
idncase                   755222 non-null int64
idnProceeding             755222 non-null int64
strAppealCategory         755222 non-null object
strAppealType             755222 non-null object
datAppealFiled            755060 non-null object
strFiledBy                755212 non-null object
datAttorneyE27            478717 non-null object
datBIADecision            755222 non-null object
strBIADecision            755222 non-null object
strBIADecisionType        731103 non-null object
strCaseType               755180 non-null object
strLang                   752329 non-null object
strNat                    754061 non-null object
strProceedingIHP          365943 non-null object
strCustody                643730 non-null object
strProbono                1842 non-null object
datAppealFiled_dt         755060 non-null datetime64[ns]
datB

Unnamed: 0,175472,189305,546519,425731,370956,121696,153888,599889,307097,245452
idnAppeal,4351629,4368656,4772306,4634353,4575124,142278,176630,4832069,4505799,4433087
idncase,2728804,3346811,3160459,4204889,2460851,3391137,2230263,4253340,2741719,3654949
idnProceeding,992706,1475910,1530611,2722818,639709,1692213,347018,2783482,1011063,2003316
strAppealCategory,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ,IJ
strAppealType,Case Appeal,Case Appeal,Circuit Court Remand,Case Appeal,Appeal of IJ MTR,Case Appeal,MTR BIA,MTR BIA,MTR BIA,Case Appeal
datAppealFiled,1997-06-20 00:00:00,1997-12-03 00:00:00,2007-02-01 08:21:06.170000000,2004-05-20 00:00:00,2003-04-24 00:00:00,1996-06-06 00:00:00,1996-09-30 00:00:00,2008-06-24 00:00:00,2001-10-15 00:00:00,1999-09-16 00:00:00
strFiledBy,A,A,O,A,A,A,A,B,A,A
datAttorneyE27,1997-06-20 00:00:00,2001-01-30 00:00:00,2007-03-09 00:00:00,2004-05-20 00:00:00,2004-01-28 00:00:00,1996-06-06 00:00:00,1996-09-30 00:00:00,2008-06-24 00:00:00,,1999-09-16 00:00:00
datBIADecision,1998-11-09 00:00:00,2002-05-28 00:00:00,2007-06-26 00:00:00,2005-08-24 00:00:00,2004-05-27 00:00:00,1997-05-30 00:00:00,1997-03-26 00:00:00,2008-08-27 00:00:00,2001-11-26 00:00:00,2002-04-08 00:00:00
strBIADecision,DSO,DIS,REM,SAF,SAF,D30,DEN,REM,DEN,SAV


In [8]:
# some strBIADecision codes have no entry in the translation table:
# show how often each untranslated code occurs, then remove those rows
# NOTE(review): in the output recorded under this cell every code (DIS, REM, DEN, ...) is listed
# as untranslated and the frame is left with 0 rows, which suggests the join on 'Code' matched
# nothing in that run -- TODO confirm the lookup keys before relying on this filter.
untranslated = tblAppeal_df['strBIADecisionDesc'].isnull()
print(tblAppeal_df.loc[untranslated, 'strBIADecision'].value_counts())
tblAppeal_df = tblAppeal_df.dropna(subset=['strBIADecisionDesc'])
tblAppeal_df.info()

DIS    212374
REM    108917
DEN     97799
SAF     59244
D30     47006
WDL     28587
DVD     28381
SAV     27481
OTH     19796
GRN     17526
TPS     16235
NJU     15166
SUS     14421
TER      9831
SUD      9468
BCR      8522
DSO      5987
CON      5695
DUT      5160
MBD      3302
DED      2107
GNR      1850
CPG      1704
ABC      1606
REJ      1319
APD      1170
SED      1072
SND       723
SAD       523
GRS       431
DNS       421
ADM       355
OTS       260
DMO       238
CPC        90
MB         88
SNC        82
SOD        60
AFD        54
RET        52
ADR        45
ADD        18
TPD        16
CGR        16
REV        13
WPD         5
SUP         4
            2
Name: strBIADecision, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 24 columns):
idnAppeal                 0 non-null int64
idncase                   0 non-null int64
idnProceeding             0 non-null int64
strAppealCategory         0 non-null object
strAppealType             0

### Master Proceedings

In [9]:
# Load the raw master proceedings table.
# low_memory=False (as already used for tblAppeal) makes pandas infer each column's dtype from
# the whole file rather than chunk by chunk.
# NOTE(review): the stderr fragment recorded under this cell looks like the tail of a
# mixed-dtype warning from the chunked read -- confirm it disappears with this setting.
master = pd.read_csv(os.path.join(DATAFOLDER, 'raw/master.csv'), low_memory=False)
print(master.info())
master.sample(5)

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6084437 entries, 0 to 6084436
Data columns (total 16 columns):
idncase             float64
nat                 object
case_type           object
c_asy_type          object
idnproceeding       object
base_city_code      object
hearing_loc_code    object
dec_type            object
dec_code            object
other_comp          object
osc_date            object
input_date          object
comp_date           object
attorney_flag       float64
ij_code             object
tracid              float64
dtypes: float64(3), object(13)
memory usage: 742.7+ MB
None


Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
3630089,5191154.0,HO,RMV,,3534609,HOU,HOU,W,X,,10APR2005,22APR2005,09JUN2005,,WKZ,71.0
2901931,4273828.0,CH,RMV,E,2807588,SFR,SFX,,,C,24SEP2002,05DEC2002,30DEC2002,1.0,PG,264.0
2148894,3723623.0,PL,RMV,I,2086317,NYC,NYC,O,X,,19APR1999,27APR1999,04JAN2000,1.0,BAN,152.0
355091,2324279.0,JM,EXC,,466852,OAK,BOP,O,T,,25NOV1996,19DEC1996,10FEB1997,,SHK,48.0
554111,2479829.0,AF,EXC,E,662810,LOS,LOS,O,O,,09JAN1990,16JAN1990,16MAR1990,1.0,LRM,


In [10]:
# number of missing values in each column of the master table
master.isna().sum()

idncase                  14
nat                   16775
case_type                15
c_asy_type          4449366
idnproceeding             0
base_city_code           24
hearing_loc_code       1516
dec_type            1519549
dec_code            1674748
other_comp          4744870
osc_date              33353
input_date            29744
comp_date            335076
attorney_flag       3044591
ij_code               39598
tracid               341914
dtype: int64

In [11]:
# look up tables for the master proceedings codes
master_case_type = pd.read_csv(os.path.join(DATAFOLDER, 'raw/master_case_type.csv'))
master_decision_type = pd.read_csv(os.path.join(DATAFOLDER, 'raw/master_decision_type.csv'))

# this file has two footer rows (skipfooter needs the python engine) and three
# 'Unnamed' columns that are discarded
decision_on_proceeding_path = os.path.join(DATAFOLDER, 'raw/master_decision_on_proceeding.csv')
master_decision_on_proceeding = (
    pd.read_csv(decision_on_proceeding_path, skipfooter=2, engine='python')
      .drop(['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'], axis=1)
)

In [12]:
# dec_col marks where the decision was recorded: 'C' when dec_code is filled, 'O' otherwise
# dec_judge holds the decision itself: dec_code when filled, else the value of other_comp
master_has_dec_code = pd.notnull(master['dec_code'])
master['dec_col'] = np.where(master_has_dec_code, 'C', 'O')
master['dec_judge'] = np.where(master_has_dec_code, master['dec_code'], master['other_comp'])

In [13]:
# join to look up tables, one lookup at a time (all left joins, so no master row is lost)

# 1) decision type code -> description
master_df = (master
             .merge(master_decision_type, how='left', left_on='dec_type', right_on='strCode')
             .rename(columns={'strDescription': 'dec_type_str'})
             .drop('strCode', axis=1))

# 2) case type code -> description
master_df = (master_df
             .merge(master_case_type, how='left', left_on='case_type', right_on='strCode')
             .rename(columns={'strDescription': 'case_type_str'})
             .drop('strCode', axis=1))

# 3) decision description, keyed by (case type, decision code, column the code came from)
proceeding_lookup_keys = ['strCaseType', 'strDecCode', 'strDecType']
master_df = (master_df
             .merge(master_decision_on_proceeding, how='left',
                    left_on=['case_type', 'dec_judge', 'dec_col'],
                    right_on=proceeding_lookup_keys)
             .rename(columns={'strDecDescription': 'dec_code_str'})
             .drop(proceeding_lookup_keys, axis=1))
master_df.sample(n=5)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,...,input_date,comp_date,attorney_flag,ij_code,tracid,dec_col,dec_judge,dec_type_str,case_type_str,dec_code_str
5932077,7094963.0,HO,RMV,,6145003,WAS,WAS,,,,...,18APR2013,,,WI,3.0,O,,,Removal,
970008,2785538.0,RP,DEP,,1069379,LOS,LOS,O,D,,...,19MAR1993,25JUL1994,1.0,TF,91.0,C,D,oral decision,Deportation,Deport
1647366,3318726.0,CH,RMV,I,12164,NYC,NYC,W,X,,...,22SEP1998,19MAY1999,1.0,SH,146.0,C,X,written decision,Removal,Remove
1016698,2821450.0,MX,RMV,,1114095,SFR,SFR,O,X,,...,13DEC2002,10MAR2004,1.0,PAW,206.0,C,X,oral decision,Removal,Remove
5954348,7112129.0,MX,RMV,,6024410,DAL,OKT,O,X,,...,19NOV2012,04FEB2013,1.0,RWK,401.0,C,X,oral decision,Removal,Remove


### Master Proceedings (processed data from Daniel Chen)

In [None]:
import sys

# One-off conversion of the Stata file to CSV, streamed chunk by chunk so the whole file is
# never held in memory at once. The resulting CSV is loaded in the next cell.
stata_path = os.path.join(DATAFOLDER, 'decision_sched_merge_adjdate.dta')
csv_path = os.path.join(DATAFOLDER, 'decision_sched_merge_adjdate.csv')
chunk_size = 100*1000

try:
    # the reader is used as a context manager so the .dta handle is always closed;
    # the CSV is opened with 'w' (not 'a') so that re-running this cell replaces the file
    # instead of appending a second copy of the data, header included
    with pd.read_stata(stata_path, chunksize=chunk_size) as reader, open(csv_path, 'w') as f:
        for chunk_number, chunk in enumerate(reader):
            # write the header row only once, with the first chunk
            chunk.to_csv(f, header=(chunk_number == 0))
            # progress marker, one per chunk written
            print('.')
            sys.stdout.flush()
except KeyboardInterrupt:
    # interrupting is still allowed (best-effort, no traceback), but say explicitly that the
    # CSV on disk is truncated so it is not mistaken for a complete conversion
    print('Interrupted: {} is incomplete'.format(csv_path))

In [None]:
# load the CSV produced by the Stata conversion cell
dchen_csv_path = os.path.join(DATAFOLDER, 'decision_sched_merge_adjdate.csv')
master_dchen = pd.read_csv(dchen_csv_path, low_memory=False)
master_dchen.info()

In [None]:
# number of missing values in each column of master_dchen
master_dchen.isna().sum()

In [None]:
# three random rows, transposed so that each column of master_dchen is shown as a row
master_dchen.sample(n=3).transpose()

In [None]:
# dec_col marks where the decision was recorded: 'C' when dec_code is filled, 'O' otherwise
# dec_judge holds the decision itself: dec_code when filled, else the value of other_comp
dchen_has_dec_code = pd.notnull(master_dchen['dec_code'])
master_dchen['dec_col'] = np.where(dchen_has_dec_code, 'C', 'O')
master_dchen['dec_judge'] = np.where(dchen_has_dec_code,
                                     master_dchen['dec_code'],
                                     master_dchen['other_comp'])

In [None]:
# join to look up tables, one lookup at a time (all left joins, so no row is lost)

# 1) decision type code -> description
master_dchen_df = (master_dchen
                   .merge(master_decision_type, how='left', left_on='dec_type', right_on='strCode')
                   .rename(columns={'strDescription': 'dec_type_str'})
                   .drop('strCode', axis=1))

# 2) case type code -> description
master_dchen_df = (master_dchen_df
                   .merge(master_case_type, how='left', left_on='case_type', right_on='strCode')
                   .rename(columns={'strDescription': 'case_type_str'})
                   .drop('strCode', axis=1))

# 3) decision description, keyed by (case type, decision code, column the code came from)
dchen_lookup_keys = ['strCaseType', 'strDecCode', 'strDecType']
master_dchen_df = (master_dchen_df
                   .merge(master_decision_on_proceeding, how='left',
                          left_on=['case_type', 'dec_judge', 'dec_col'],
                          right_on=dchen_lookup_keys)
                   .rename(columns={'strDecDescription': 'dec_code_str'})
                   .drop(dchen_lookup_keys, axis=1))
master_dchen_df.sample(n=3).T

### Master Proceedings (processed by Sagent/Dunn)

In [None]:
# load the processed master proceedings file
dunn_csv_path = os.path.join(
    DATAFOLDER, 'AsylumAdj/data_for_model/_decision_scheduling_merge_final_converted.csv')
# encoding='latin-1' is required: reading with the default encoding gets a UnicodeDecodeError
master_dunn = pd.read_csv(dunn_csv_path, encoding='latin-1', low_memory=False)
master_dunn.info()

In [None]:
# dunn has far fewer proceedings than the original
# (print as a function so the cell runs on Python 3; the counts are passed in the order the
# message names them: original master first, Dunn's second -- they were swapped before)
print("Original master has {} data points, Dunn's has {}".format(len(master), len(master_dunn)))

In [None]:
# list every column available in master_dchen
# (print as a function, matching the other print calls in this notebook and Python 3)
print(master_dchen.columns.tolist())

In [None]:
# number of rows per distinct value of master_dchen's 'asylum_only_proceeding' column
master_dchen['asylum_only_proceeding'].value_counts()

In [None]:
# number of rows per distinct value of master_dunn's 'case_type_string' column
master_dunn['case_type_string'].value_counts()

In [None]:
# number of rows per distinct value of master_dunn's 'asylum' column
master_dunn['asylum'].value_counts()

In [None]:
# determine what % of master proceedings that Dunn retained
# master_dunn_pairs is a membership list of (idncase, idnproceeding); it is de-duplicated so
# that the left joins using it can never multiply rows
# (presumably the pairs are already unique in Dunn's file -- verify).
master_dunn_pairs = master_dunn[['idncase', 'idnproceeding']].drop_duplicates().copy()
master_dunn_pairs['in_dunn'] = 1

master_dunn_delta = master_df[['idncase', 'idnproceeding', 'case_type_str']].merge(
    master_dunn_pairs, how='left', on=['idncase', 'idnproceeding'])

# proceedings with no match in Dunn's data get in_dunn = 0; assign the filled column back
# instead of calling fillna(inplace=True) on a column selection, which is not guaranteed to
# modify the frame (it does nothing under pandas Copy-on-Write)
master_dunn_delta['in_dunn'] = master_dunn_delta['in_dunn'].fillna(0)

# per case type: number of proceedings ('size') and share retained by Dunn ('mean')
master_dunn_delta.groupby(['case_type_str'])['in_dunn'].agg(['size', 'mean'])

# Determine scope of appeal case types

In [None]:
# flag each appeal whose (case, proceeding) pair appears in Dunn's data ...
appeals_in_scope = tblAppeal_df.merge(master_dunn_pairs, how='left', 
                                      left_on=['idncase', 'idnProceeding'], right_on=['idncase', 'idnproceeding']) 
# ... and attach the proceeding's case type from master_dchen
appeals_in_scope = appeals_in_scope.merge(master_dchen[['idncase', 'idnproceeding', 'case_type_string']], 
                                          how='left', left_on=['idncase', 'idnProceeding'], 
                                          right_on=['idncase', 'idnproceeding']) 

# Fill the gaps left by the two left joins. The filled columns are assigned back rather than
# filled with inplace=True on a column selection, which is not guaranteed to modify the frame
# (it does nothing under pandas Copy-on-Write).
appeals_in_scope['in_dunn'] = appeals_in_scope['in_dunn'].fillna(0)
appeals_in_scope['case_type_string'] = appeals_in_scope['case_type_string'].fillna('NOT_MATCHED_TO_MASTER')

In [None]:
# distribution of appeal vs. proceeding types - all
# rows: appeal type, columns: proceeding case type, cells: number of appeals (0 where none)
(appeals_in_scope
 .groupby(['strAppealTypeDesc', 'case_type_string'])
 .size()
 .unstack()
 .fillna(0))

In [None]:
# distribution of appeal vs. proceeding types - what dunn included
# same table as above, restricted to appeals flagged in_dunn == 1
appeals_kept_by_dunn = appeals_in_scope[appeals_in_scope['in_dunn'] == 1]
(appeals_kept_by_dunn
 .groupby(['strAppealTypeDesc', 'case_type_string'])
 .size()
 .unstack()
 .fillna(0))

#### Final Criteria: the appeal type is one of ['Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA'] and the proceeding is found in Dunn's dataset

In [None]:
# filter relevant appeal types
selected_appeal_types = ['Appeal of IJ MTR', 'Case Appeal', 'Circuit Court Remand', 'Interlocutory Appeal', 'MTR BIA'] 

# Build each condition separately before combining them: `&` binds tighter than `==`, so the
# previous un-parenthesised form evaluated (isin(...) & in_dunn) == 1 rather than
# isin(...) & (in_dunn == 1).
is_selected_type = appeals_in_scope['strAppealTypeDesc'].isin(selected_appeal_types)
is_in_dunn = appeals_in_scope['in_dunn'] == 1
appeals_in_scope = appeals_in_scope[is_selected_type & is_in_dunn].copy() 
appeals_in_scope.info()

# Assign Labels 

In [None]:
# many different labels
# rows: BIA decision description, columns: appeal type, cells: number of appeals (0 where none)
(appeals_in_scope
 .groupby(['strBIADecisionDesc', 'strAppealTypeDesc'])
 .size()
 .unstack()
 .fillna(0))

In [None]:
# assign positive and negative labels
# decisions counted as a grant (label 1)
granted_decisions = ['Background Check Remand', 'Grant With No Remand', 'Granted', 'Remand',
                     'Sustain', 'Temporary Protected Status']
# decisions counted as a denial (label 0)
denied_decisions = ['Denied', "Dismiss Appeal/Affirm IJ's Decision", 'Dismissed (Grant V/D 30 days)',
                    'Dismissed (Voluntary Departure Granted)', 'Rejection', 'SUMMARY AFFIRMANCE/VD',
                    'Summary Affirmance', 'Summary Dismiss', 'Summary Dismissal (O) Other',
                    'Summary Dismissal (a) inad reason on appeal', 'Summary Dismissal - Both (a) & (e)']

# description -> label; any description in neither list maps to a missing value
label_by_decision = {}
for decision_desc in denied_decisions:
    label_by_decision[decision_desc] = 0
for decision_desc in granted_decisions:
    label_by_decision[decision_desc] = 1
appeals_in_scope['granted'] = appeals_in_scope['strBIADecisionDesc'].map(label_by_decision)

# drop neutral labels where appeals were dismissed due to administrative/procedural failures
appeals_in_scope.dropna(subset=['granted'], inplace=True)
appeals_in_scope['granted'] = appeals_in_scope['granted'].astype(int)
appeals_in_scope.info()

# Dedup for unique (idncase, idnproceeding)

~24% of remaining proceedings have multiple appeals 

In [None]:
# number of distinct appeals filed for each proceeding ...
appeals_per_proceeding = (appeals_in_scope
                          .groupby(['idnProceeding'])['idnAppeal']
                          .nunique())
# ... and the share of proceedings having 1, 2, 3, ... appeals
appeals_per_proceeding.value_counts(normalize=True)

Most multiple appeals seem to involve 'MTR' after initial 'Case Appeal'. Should we use the last instance of 'MTR' as the final decision?

In [None]:
# examples of proceedings with multiple appeals
# random.sample needs a real sequence; a pandas Index is rejected by Python 3's random.sample,
# so the candidate proceeding ids are converted to a list first
multi_appeal_proceedings = appeals_per_proceeding[appeals_per_proceeding > 1].index.tolist()
sample_index = random.sample(multi_appeal_proceedings, 1)
# show every appeal (before scope filtering) of the sampled proceeding, one column per appeal
tblAppeal_df[tblAppeal_df['idnProceeding'].isin(sample_index)].T 

In [None]:
# check original proceedings for a given case
# (the case id is a hand-picked example)
example_case_dchen = 2241734
master_dchen_df.loc[master_dchen_df['idncase'] == example_case_dchen].T

~2% of cases have multiple proceedings 

In [None]:
# number of distinct proceedings (with an in-scope appeal) for each case ...
proceedings_per_case = (appeals_in_scope
                        .groupby(['idncase'])['idnProceeding']
                        .nunique())
# ... and the share of cases having 1, 2, 3, ... proceedings
proceedings_per_case.value_counts(normalize=True)

In [None]:
# examples of cases with multiple proceedings
# random.sample needs a real sequence; a pandas Index is rejected by Python 3's random.sample,
# so the candidate case ids are converted to a list first
multi_proceeding_cases = proceedings_per_case[proceedings_per_case > 1].index.tolist()
sample_index = random.sample(multi_proceeding_cases, 1)
# show the sampled case's in-scope appeals in filing order, one column per appeal
appeals_in_scope[appeals_in_scope['idncase'].isin(sample_index)].sort_values(by='datAppealFiled_dt').T 

In [None]:
# check original proceedings for a given case
# (the case id is a hand-picked example)
example_case_master = 5612569
master_df.loc[master_df['idncase'] == example_case_master].T

**For each unique (case, proceeding) pair with more than 1 appeal, keep the appeal with the latest BIA decision date and drop the rest**

In [None]:
# Within each (case, proceeding) pair, order appeals by BIA decision date, newest first,
# then keep only the first row of each pair.
dedup_keys = ['idncase', 'idnProceeding']
appeals_deduped = (appeals_in_scope
                   .sort_values(by=dedup_keys + ['datBIADecision_dt'],
                                ascending=[True, True, False])
                   .drop_duplicates(subset=dedup_keys, keep='first'))
appeals_deduped.info()

**31% of appeals were granted**

In [None]:
# check % granted in final dataset
# normalize=True returns the share of each label rather than raw counts, which is what the
# percentage quoted in the heading above refers to
appeals_deduped['granted'].value_counts(normalize=True)

# Merge Datasets

#### Add variables from master-chen

In [None]:
# variables in processed data by Prof. Chen
dchen_columns = master_dchen.columns.tolist()
print(dchen_columns)

In [None]:
# number of rows per distinct value of master_dchen's 'grantordeny_chicago' column
master_dchen.groupby(['grantordeny_chicago']).size()

In [None]:
# number of missing values in each column of master_dchen
master_dchen.isna().sum()

In [None]:
# four random rows, transposed so that each column of master_dchen is shown as a row
master_dchen.sample(n=4).transpose()

In [None]:
# Columns of master_dchen carried over into the modelling dataset, grouped by theme.
# The first two entries are the join keys used when merging with the appeals.
dchen_selected_features = (
    # keys, nationality, judge, representation
    ['idncase', 'idnproceeding', 'nat_string', 'ij_code', 'attorney_flag',
     'lawyer', 'defensive', 'affirmative', 'oral', 'written']
    # locations
    + ['base_city_code', 'base_city_state',
       'hearing_loc_city', 'hearing_loc_code', 'hearing_loc_state', 'venue_change']
    # date parts
    + ['comp_year', 'comp_month', 'comp_day',
       'osc_year', 'osc_month', 'osc_day',
       'input_year', 'input_month', 'input_day']
    # timing and data-quality flags
    + ['adj_time_start', 'flag_earlystarttime',
       'flag_mismatch_base_city', 'flag_mismatch_hearing', 'flag_datemismatch']
    # case / decision descriptions
    + ['case_type_string', 'dec_type_string', 'dec_string']
    # outcome and proceeding-type columns
    + ['deport', 'relief_granted', 'remove', 'terminated', 'voluntary_departure',
       'deport_form', 'voluntary_form', 'deportation_proceeding',
       'exclusion_proceeding', 'removal_proceeding', 'asylum_only_proceeding',
       'withholding_only_proceeding']
)

In [None]:
# merge with master proceedings
# Left join, so every de-duplicated appeal is kept even without a match in master_dchen.
appeals_with_master = appeals_deduped.merge(
    master_dchen[dchen_selected_features], how='left',
    left_on=['idncase', 'idnProceeding'], right_on=['idncase', 'idnproceeding'])

# Columns present on both sides come back with _x (appeals side) / _y (master side) suffixes:
# discard the listed _y columns and restore the plain names of the _x ones.
# NOTE(review): master_dchen[dchen_selected_features] also contributes its own 'idnproceeding'
# column, so after this rename there may be two columns with that name -- verify.
appeals_with_master = appeals_with_master.drop(['idnproceeding_y', 'case_type_string_y'], axis=1)
appeals_with_master = appeals_with_master.rename(columns={'idnproceeding_x': 'idnproceeding',
                                                          'case_type_string_x': 'case_type_string'})
appeals_with_master.info()

In [None]:
# number of merged appeals per value of 'case_type_string'
appeals_with_master.groupby(['case_type_string']).size()

#### Add hearing sessions data

In [None]:
# TODO

#### Add judge bios data 

In [None]:
# TODO 

#### Output dataset for model training

In [None]:
# define features to output for training 
output_features = ['idnAppeal', 'nat_string', 'strCustody', 'strProbono', 'case_type_string',
                   'ij_code', 'lawyer', 'defensive', 'affirmative', 'oral', 'written', 
                   'comp_year', 'osc_year', 'input_year', 'datAppealFiled_year', 'granted']
appeals_final = appeals_with_master[output_features].copy()

# define mandatory features, without which data will be dropped 
mandatory_features = ['idnAppeal', 'granted', 'datAppealFiled_year', 'case_type_string'] 
appeals_final.dropna(subset=mandatory_features, inplace=True)

# generate output 
# The path is relative to the notebook's working directory; to_csv does not create missing
# folders, so make sure the target folder exists before writing.
output_path = 'data_for_model/data_for_model_2018-03-23.csv'
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)
appeals_final.to_csv(output_path, index=False)