In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import project_env as pe
import sklearn as sklearn
from scipy import stats
%matplotlib inline

In [2]:
# Load the merged case-level table and the companion "simple" table.
# Rows whose NEXT_ARREST_TIME holds the literal text 'Delete' are discarded,
# after which the column is converted to a numeric dtype.
data = pd.read_csv('merged.csv', encoding="ISO-8859-1", low_memory=False)
data = data[data['NEXT_ARREST_TIME'] != 'Delete']
# Convert the whole column in one vectorized call instead of calling
# pd.to_numeric once per element through .apply (same values, much faster).
data['NEXT_ARREST_TIME'] = pd.to_numeric(data['NEXT_ARREST_TIME'])
data_simple = pd.read_csv('data_simple.csv', encoding="ISO-8859-1", low_memory=False)

In [3]:
# Bring the ADA_CODE column over from data_simple, matching on UNIQUE_ID.
# A left join keeps every row of `data`; rows with no match get a missing
# ADA_CODE.
data = data.merge(
    data_simple[['UNIQUE_ID', 'ADA_CODE']],
    how='left',
    on='UNIQUE_ID',
)

In [4]:
# 0/1 outcome flags derived from the screening disposition code.
# A code equal to 230 sets CHRG_ACCEPTED; any other value, including a
# missing code, sets CHRG_REJECTED, so the two flags are always complementary.
data['CHRG_ACCEPTED'] = data['SCREENING_DISP_CODE'].eq(230).astype(int)
data['CHRG_REJECTED'] = data['SCREENING_DISP_CODE'].ne(230).astype(int)

Cleaning Notes: Most of the bad values do not fit the pattern for the ADA codes. Of those that do, ABRE and
VSAL aren’t listed in the CODE table, but could be AGRE and VSOL, which are ADA codes. These are potential
typos. NARC is a valid code for a different type of variable (drug code), and could be MARC. It appears often
enough that it might not be a typo, and could, instead, indicate confusion on the part of the data entry personnel. In
other tables Judge codes have been found in fields that should contain substance codes, and vice-versa, so this does
occur elsewhere. The decision is, however, left to the user, in that the NARC values are considered invalid.
Unaccepted Values: NARC[560]; [Missing][124]; VSAL[6]; NONE[1]; S828672[1]; EWU385[1]; A561575[1];
ABRE[1]

In [14]:
# Keep only rows with a usable ADA_CODE: first discard rows where the code is
# missing, then discard the codes listed as unaccepted values.
invalid_ada_code = ['NARC', 'VSAL', 'NONE', 'S828672', 'EWU385', 'A561575', 'ABRE']
data = data.dropna(subset=['ADA_CODE'])
data = data[~data['ADA_CODE'].isin(invalid_ada_code)]

In [15]:
# Per-ADA_CODE totals of accepted and rejected cases.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of column names is deprecated and raises an error on current pandas.
by_sada = data.groupby('ADA_CODE')[['CHRG_ACCEPTED', 'CHRG_REJECTED']].sum()

In [16]:
# Case volume per ADA_CODE and the share of those cases that were accepted.
by_sada['NBR_CASES'] = by_sada['CHRG_ACCEPTED'].add(by_sada['CHRG_REJECTED'])
by_sada['CHRG_RATE'] = by_sada['CHRG_ACCEPTED'].div(by_sada['NBR_CASES'])

Max # of cases seen by a screener with charge rate = 1 is 14. <br>
Max # of cases seen by a screener with charge rate = 0 is 580.

In [24]:
# Summary statistics of the per-screener case counts.
# NOTE(review): the "over 100" / "over 500" labels are printed for counts
# computed with an inclusive comparison (>=), i.e. "at least" that many cases.
cases_seen = by_sada['NBR_CASES']
print('Total Screeners: ', len(by_sada))
print('Screeners that have seen over 100 cases: ', int((cases_seen >= 100).sum()))
print('Screeners that have seen over 500 cases: ', int((cases_seen >= 500).sum()))
print('Mode Cases Seen: ', stats.mode(cases_seen)[0])
print('Median Cases Seen: ', np.median(cases_seen))
print('Average Cases Seen: ', np.mean(cases_seen))

Total Screeners:  250
Screeners that have seen over 100 cases:  123
Screeners that have seen over 500 cases:  78
Mode Cases Seen:  [1]
Median Cases Seen:  92.5
Average Cases Seen:  818.744


# Rearrest rate from human screener decisions

In [51]:
# Cases where the charge was not accepted (CHRG_ACCEPTED flag is 0).
not_charged = data.loc[data['CHRG_ACCEPTED'].eq(0)]

In [54]:
# Share of not-charged cases with a rearrest inside each follow-up horizon.
# Cases are limited to ARREST_DATE_y <= 1999 - year before computing the rate
# (presumably so every case has a full `year`-year observation window;
# verify that ARREST_DATE_y holds the arrest year and 1999 is the data cutoff).
# Counts are taken over non-missing SCREENING_DISP_CODE values.
years = [1, 2, 3, 4, 5]
for year in years:
    data_trunc = not_charged.loc[not_charged['ARREST_DATE_y'].le(1999 - year)]
    # Rearrested strictly after day 0 and no later than `year` * 365 days.
    within_horizon = (
        data_trunc['NEXT_ARREST_TIME'].gt(0)
        & data_trunc['NEXT_ARREST_TIME'].le(year * 365)
    )
    total = data_trunc['SCREENING_DISP_CODE'].count()
    rearrest_cnt = data_trunc.loc[within_horizon, 'SCREENING_DISP_CODE'].count()
    rearrest_rate = rearrest_cnt / total
    print('%.0f%% were rearrested within %s years' % (rearrest_rate * 100, year))
    

30% were rearrested within 1 years
40% were rearrested within 2 years
47% were rearrested within 3 years
50% were rearrested within 4 years
53% were rearrested within 5 years


# Rearrest rate by leniency

Only considering screeners who have seen at least 100 cases (`NBR_CASES >= 100`).

In [26]:
# ADA codes of the screeners with at least 100 cases.
by_sada.loc[by_sada['NBR_CASES'].ge(100)].index

Index(['ADAI', 'ADIC', 'ALMB', 'ALOG', 'AMON', 'ANEL', 'AWAL', 'BBER', 'BDEA',
       'BLAN',
       ...
       'SRIC', 'SSCI', 'SSMI', 'SWOO', 'TBLO', 'TFAI', 'TSHA', 'VSOL', 'WEIK',
       'YBOL'],
      dtype='object', name='ADA_CODE', length=123)

In [46]:
# Case-level rows joined with their screener's case volume and charge rate.
# The inner join keeps only cases whose ADA_CODE belongs to a screener with
# at least 100 cases; by_sada is indexed by ADA_CODE, hence right_index=True.
sada = data.merge(
    by_sada.loc[by_sada['NBR_CASES'] >= 100, ['NBR_CASES', 'CHRG_RATE']],
    how='inner',
    left_on='ADA_CODE',
    right_index=True,
)

In [47]:
# Bin every case into one of 5 equal-frequency groups by its screener's
# charge rate; labels=False stores the integer bin number (0-4).
# The bins are computed over case rows, so each group holds roughly the same
# number of cases rather than the same number of screeners.
sada['quantile'] = pd.qcut(x=sada['CHRG_RATE'], q=5, labels=False)

In [48]:
# Average screener charge rate within each quantile bin (case-weighted).
sada['CHRG_RATE'].groupby(sada['quantile']).mean()

quantile
0    0.234714
1    0.357106
2    0.399153
3    0.464867
4    0.620685
Name: CHRG_RATE, dtype: float64

In [50]:
# Not-charged cases (CHRG_ACCEPTED flag is 0) among the quantile-labelled rows.
sada_not_charged = sada.loc[sada['CHRG_ACCEPTED'].eq(0)]

In [60]:
# Rearrest rate of not-charged cases, by charge-rate quantile (rows) and
# follow-up horizon in years (columns).
years = [1, 2, 3, 4, 5]
n_quantiles = 5  # must match the number of bins in the 'quantile' column

# Start from NaN so a quantile/horizon pair with no eligible cases stays
# visibly undefined instead of dividing by zero.
rearrest_rates = np.full((n_quantiles, len(years)), np.nan)

for q in range(n_quantiles):
    # The quantile filter does not depend on the horizon, so apply it once.
    quantile_cases = sada_not_charged[sada_not_charged['quantile'] == q]
    for col, year in enumerate(years):
        # Limit to cases with ARREST_DATE_y <= 1999 - year (presumably so each
        # case has a full `year`-year observation window; verify).
        data_trunc = quantile_cases[quantile_cases['ARREST_DATE_y'] <= 1999 - year]
        total = data_trunc['SCREENING_DISP_CODE'].count()
        if total == 0:
            continue
        # Rearrested strictly after day 0 and within `year` * 365 days.
        in_window = (
            (data_trunc['NEXT_ARREST_TIME'] > 0)
            & (data_trunc['NEXT_ARREST_TIME'] <= year * 365)
        )
        rearrest_cnt = data_trunc.loc[in_window, 'SCREENING_DISP_CODE'].count()
        rearrest_rates[q, col] = rearrest_cnt / total
    

In [65]:
# Present the rate matrix as a labelled table: one row per charge-rate
# quantile, one column per follow-up horizon.
pd.DataFrame(
    data=rearrest_rates,
    index=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
    columns=['1_year', '2_year', '3_year', '4_year', '5_year'],
)

Unnamed: 0,1_year,2_year,3_year,4_year,5_year
Q1,0.289281,0.37522,0.446824,0.476517,0.493578
Q2,0.332992,0.432681,0.479752,0.504869,0.532184
Q3,0.300173,0.414088,0.481058,0.525916,0.555955
Q4,0.290067,0.39794,0.46175,0.504,0.536414
Q5,0.273728,0.391407,0.447419,0.486601,0.517236
