In [5]:
import sys
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt 
import os
import datetime as datetime
pd.set_option('display.max_rows', 200)
%matplotlib inline

In [6]:
# define folder where data resides 
DATAFOLDER = "/Users/Colin/Google_Drive/ML_DSGA_1003/asylum_project"

## Read in Dunn Data

In [7]:
master_dunn = pd.read_csv(os.path.join(DATAFOLDER, 
                                       'raw/_decision_scheduling_merge_final_converted.csv'), 
                          encoding='latin-1', low_memory=False) # gets UnicodeDecodeError otherwise 
master_dunn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602500 entries, 0 to 602499
Columns: 182 entries, Unnamed: 0 to last_hearing_on_comp_date
dtypes: bool(1), float64(72), int64(39), object(70)
memory usage: 832.6+ MB


## Generate Prior Percentages

In [34]:
# prior percentages for aggregate
agt_priors = master_dunn['grant'].mean()
# overall grant percent
agt_priors
agt_counts = master_dunn.groupby(['grant']).count()['idncase']
agt_counts

grant
0    388769
1    213731
Name: idncase, dtype: int64

In [27]:
# prior percentages for nationality
nat_priors = master_dunn.groupby(['nat']).mean()['grant']
nat_priors.head()
nat_counts = master_dunn.groupby(['nat', 'grant']).count()['idncase']
nat_counts

nat  grant
??   0          131
     1           67
AB   0            1
AC   0           21
     1            3
AF   0         1872
     1         3444
AG   0          611
     1          359
AL   0         5845
     1         6308
AM   0         3770
     1         3254
AN   0            8
     1            8
AO   0          224
     1          128
AR   0         1104
     1          152
AS   0           40
     1            4
AU   0           30
     1            5
AZ   0          321
     1          462
BA   0           68
     1           19
BB   0           20
     1            2
BC   0           10
     1            3
BD   0            4
     1            3
BE   0           53
     1           35
BF   0          157
     1           21
BG   0         4249
     1         2017
BH   0          174
     1           26
BI   0          274
     1          209
BL   0          366
     1          131
BM   0          634
     1         1785
BN   0           61
     1           50
BO   0   

In [28]:
# prior percentages for affirmative vs defensive
affirm_priors = master_dunn.groupby(['affirmative']).mean()['grant']
affirm_priors.head()
affirm_counts = master_dunn.groupby(['affirmative', 'grant']).count()['idncase']
affirm_counts

affirmative  grant
0.0          0        197756
             1         76467
1.0          0        180196
             1        135187
Name: idncase, dtype: int64

In [11]:
# make dt col
master_dunn['input_dt'] = master_dunn[['input_year', 'input_month', 'input_day']]\
.apply(lambda x : '{}{}{}'.format(str(x[0]).split('.')[0],str(x[1]).split('.')[0],str(x[2]).split('.')[0]), axis=1)\
.apply(pd.to_datetime, format='%Y%m%d', errors='coerce')

In [12]:
# get day of the week
master_dunn['dow'] = master_dunn['input_dt'].apply(lambda x: x.weekday())

In [25]:
# prior percentage for DOW
dow_priors = master_dunn.groupby(['dow']).mean()['grant']
dow_priors
dow_counts = master_dunn.groupby(['dow', 'grant']).count()['idncase']
dow_counts

dow  grant
0.0  0        68955
     1        36881
1.0  0        73035
     1        39326
2.0  0        68900
     1        36346
3.0  0        71852
     1        38036
4.0  0        65985
     1        34894
5.0  0         1069
     1          453
6.0  0         1456
     1          691
Name: idncase, dtype: int64

In [14]:
# round adj_time_start down
def round_down(num, divisor):
    return num - (num%divisor)

In [15]:
# round time down to the hour
# 8:59 -> 8:00 for instance
master_dunn['rounded_time'] = master_dunn['adj_time_start'].apply(lambda x: round_down(x, 100))

In [29]:
# prior percentage for time
time_priors = master_dunn.groupby(['rounded_time']).mean()['grant']
time_priors
time_counts = master_dunn.groupby(['rounded_time', 'grant']).count()['idncase']
time_counts

rounded_time  grant
800           0         70063
              1         37446
900           0         96366
              1         46455
1000          0         44127
              1         24409
1100          0          5156
              1          4874
1200          0          2897
              1          2570
1300          0        129536
              1         69498
1400          0         28337
              1         17661
1500          0          9697
              1          6752
1600          0          2242
              1          3882
1700          0            37
              1            27
1800          0           206
              1            86
1900          0           105
              1            71
Name: idncase, dtype: int64

In [30]:
# language prior
language_priors = master_dunn.groupby(['lang']).mean()['grant']
language_priors.head()
language_counts = master_dunn.groupby(['lang', 'grant']).count()['idncase']
language_counts

lang  grant
???   0         954
      1         288
AAR   1           1
ACC   0           1
ACE   0           2
      1           1
ACH   0           4
      1           3
AFE   0           1
AFR   0           5
      1           3
AGC   0          40
      1           3
AGH   1           2
AKA   0          61
      1          18
AKU   0           1
      1           2
ALB   0        7680
      1        8211
ALG   0          18
      1          12
AMD   0           2
      1          15
AMH   0        2685
      1        4537
ANU   0           2
AR    0        6330
      1        6653
ARA   0          44
      1          43
ARK   0           4
      1          22
ARM   0        4466
      1        3759
ASH   0          44
      1          10
ASY   0         131
      1         291
AZJ   0          19
      1          39
BAG   0           5
      1           3
BAI   0           3
      1           8
BAJ   0           2
      1           5
BAM   0         349
      1         393
BAN   0 

In [31]:
# lawyer prior
lawyer_priors = master_dunn.groupby(['lawyer']).mean()['grant']
lawyer_priors
lawyer_counts = master_dunn.groupby(['lawyer', 'grant']).count()['idncase']
lawyer_counts

lawyer  grant
0       0         74587
        1          7914
1       0        314182
        1        205817
Name: idncase, dtype: int64

In [19]:
def calibrate_beta_priors(aggregate_mean): 
    """
    Takes aggregate rate and return Beta priors (alpha, beta) with prior mean approximating aggregate rate, 
    with effective sample size of 10 
    """
    
    rounded_rate = np.round(aggregate_mean, 1)
    alpha = int(rounded_rate * 10) 
    beta = 10 - alpha 
    
    return alpha, beta

In [20]:
def compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total): 
    """ 
    Takes Beta priors (alpha, beta) along with observed data (num_total, num_positives) 
    and returns posterior mean 
    """
    
    updated_alpha = alpha_prior + num_positives 
    updated_beta = beta_prior + num_total - num_positives 
    
    posterior_mean = float(updated_alpha) / (updated_alpha + updated_beta)
    
    return posterior_mean

In [21]:
def get_beta_adj_rate(aggregate_mean, num_positives, num_total): 
    """ 
    Takes aggregate mean as a float (from 0 to 1), num_total (integer), and num_positives (integer) 
    and return the 'Beta-adjusted' rate. 
    Example: if in total 30% of Chinese nationality cases were granted, and a specific judge saw 20 cases 
    and granted 14 of them, input aggregate_mean=0.3, num_total=20, and num_positives=14 
    """
    
    if type(aggregate_mean) is not float: 
        raise ValueError("Please enter a float for aggregate mean!")
        
    if aggregate_mean < 0 or aggregate_mean > 1: 
        raise ValueError("Aggregate mean must be between 0 and 1!")
            
    alpha_prior, beta_prior = calibrate_beta_priors(aggregate_mean)
    posterior_mean = compute_posterior_mean(alpha_prior, beta_prior, num_positives, num_total)
    
    return posterior_mean

In [36]:
# list of percent dfs
percent_list = [agt_priors, nat_priors, affirm_priors, dow_priors, time_priors, language_priors, lawyer_priors]
# list of count dfs
count_list = [agt_counts, nat_counts, affirm_counts, dow_counts, time_counts, language_counts, lawyer_counts]

In [75]:
total_counts = master_dunn.groupby(['ij_code']).count()['grant']
total_counts

ij_code
AA     1655
AAK     192
AAT     925
AAV    3022
ABM     313
ACB      75
ACH     181
ADM     146
ADP    1170
AED     489
AEG    1690
AJG       2
AJR    2798
ALP    1414
ALR     703
AMC     321
AMD     377
AMP       2
AND     132
AO     3360
ARA     388
ARD     148
ASE    5787
ASG    3040
ASL     628
ASM    1166
ATG    1061
AVP     617
BAN    4805
BAR    1362
BAT     357
BAZ     799
BHS    3336
BJE    2549
BJH    1954
BJP     223
BKS    3651
BLF    4866
BMB    5178
BMO     298
BMP    2285
BP1     472
BQM     124
BSC       9
BWD    1335
BWS    5700
CAB    1074
CAD     707
CAH      18
CAK    2086
CAL     180
CAS       6
CAW     629
CBA     925
CC     2143
CDB    1788
CEP    3014
CES    1795
CHC    2331
CJL     251
CJS    3431
CKA     254
CLR     206
CMH    3302
CMR    2351
CMW     164
CMZ    3162
CRH     977
DA     1015
DAL     296
DAM    2395
DAR     334
DB      222
DBP     168
DBS    4606
DCA    1428
DDB    1889
DDS    1248
DE        1
DFC     376
DH      581
DHP     216
DHS    1

In [76]:
granted_counts = master_dunn[master_dunn['grant'] == 1].groupby(['ij_code']).count()['grant']
granted_counts 

ij_code
AA      435
AAK     133
AAT     295
AAV     568
ABM     135
ACB      18
ACH     107
ADM      43
ADP     234
AED     216
AEG     938
AJR    1493
ALP     245
ALR      26
AMC      73
AMD      25
AMP       2
AND      40
AO      654
ARA      33
ARD      61
ASE    3125
ASG    1270
ASL     521
ASM     140
ATG     239
AVP     412
BAN    1388
BAR     425
BAT      41
BAZ     440
BHS     585
BJE     748
BJH    1404
BJP      28
BKS    1255
BLF    3715
BMB    1374
BMO     200
BMP     970
BP1     178
BQM      13
BWD     109
BWS    1025
CAB     197
CAD     239
CAH       1
CAK    1248
CAL      13
CAS       4
CAW      61
CBA     120
CC     1043
CDB     621
CEP    1227
CES     779
CHC     865
CJL     156
CJS     661
CKA     127
CLR      57
CMH    1179
CMR     441
CMW      64
CMZ    1251
CRH     168
DA      377
DAL     165
DAM     324
DAR     111
DB       52
DBP      64
DBS    3226
DCA     657
DDB     630
DDS     529
DFC     113
DH      275
DHP      70
DHS     165
DJC     729
DJS       3
DL      

In [78]:
merged_counts = pd.DataFrame({'total_count':total_counts, 'granted_count': granted_counts})
merged_counts

Unnamed: 0,granted_count,total_count
AA,435.0,1655
AAK,133.0,192
AAT,295.0,925
AAV,568.0,3022
ABM,135.0,313
ACB,18.0,75
ACH,107.0,181
ADM,43.0,146
ADP,234.0,1170
AED,216.0,489


In [79]:
merged_counts.to_csv('merged_counts.csv')