In [3]:
import sys
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
import datetime as datetime
pd.set_option('display.max_rows', 200)
%matplotlib inline

In [4]:
# define folder where data resides 
DATAFOLDER = "/Users/Colin/Google_Drive/ML_DSGA_1003/asylum_project"

## Read in Dunn Data

In [6]:
master_dunn = pd.read_csv(os.path.join(DATAFOLDER, 
                                       'raw/_decision_scheduling_merge_final_converted.csv'), 
                          encoding='latin-1', low_memory=False) # gets UnicodeDecodeError otherwise 
master_dunn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602500 entries, 0 to 602499
Columns: 182 entries, Unnamed: 0 to last_hearing_on_comp_date
dtypes: bool(1), float64(72), int64(39), object(70)
memory usage: 832.6+ MB


## Generate Prior Percentages

In [16]:
# prior percentages for aggregate
agt_prior = master_dunn['grant'].mean()
# overall grant percent
agt_prior

0.3547402489626556

In [10]:
# prior percentages for nationality
nat_priors = master_dunn.groupby(['nat']).mean()['grant']
nat_priors.head()

nat
??    0.338384
AB    0.000000
AC    0.125000
AF    0.647856
AG    0.370103
Name: grant, dtype: float64

In [12]:
# prior percentages for affirmative vs defensive
affirm_priors = master_dunn.groupby(['affirmative']).mean()['grant']
affirm_priors.head()

affirmative
0.0    0.278850
1.0    0.428644
Name: grant, dtype: float64

In [23]:
# make dt col
master_dunn['input_dt'] = master_dunn[['input_year', 'input_month', 'input_day']]\
.apply(lambda x : '{}{}{}'.format(str(x[0]).split('.')[0],str(x[1]).split('.')[0],str(x[2]).split('.')[0]), axis=1)\
.apply(pd.to_datetime, format='%Y%m%d', errors='coerce')

In [32]:
# get day of the week
master_dunn['dow'] = master_dunn['input_dt'].apply(lambda x: x.weekday())

In [35]:
# prior percentage for DOW
dow_priors = master_dunn.groupby(['dow']).mean()['grant']
dow_priors

dow
0.0    0.348473
1.0    0.349997
2.0    0.345343
3.0    0.346134
4.0    0.345900
5.0    0.297635
6.0    0.321844
Name: grant, dtype: float64

In [47]:
# round adj_time_start down
def round_down(num, divisor):
    return num - (num%divisor)

In [49]:
# round time down to the hour
# 8:59 -> 8:00 for instance
master_dunn['rounded_time'] = master_dunn['adj_time_start'].apply(lambda x: round_down(x, 100))

In [52]:
# prior percentage for time
time_priors = master_dunn.groupby(['rounded_time']).mean()['grant']
time_priors

rounded_time
800     0.348306
900     0.325267
1000    0.356149
1100    0.485942
1200    0.470093
1300    0.349177
1400    0.383951
1500    0.410481
1600    0.633899
1700    0.421875
1800    0.294521
1900    0.403409
Name: grant, dtype: float64

In [56]:
# language prior
language_priors = master_dunn.groupby(['lang']).mean()['grant']
language_priors.head()

lang
???    0.231884
AAR    1.000000
ACC    0.000000
ACE    0.333333
ACH    0.428571
Name: grant, dtype: float64

In [54]:
# lawyer prior
lawyer_priors = master_dunn.groupby(['lawyer']).mean()['grant']
lawyer_priors

lawyer
0    0.095926
1    0.395803
Name: grant, dtype: float64

child
0    0.35474
Name: grant, dtype: float64