In [1]:
import numpy as np
import pandas as pd

In [2]:
#***********************************************************
#* first step: drop the 21 duplicates that are problematic in the merge (the last merge
#* before saving "data_simple")
#***********************************************************

In [3]:
# Load the defendant table with every column read as a string, order it by
# defendant id and first address line, then keep only the first row found
# for each BOFI_NBR.
dfdn_cln = pd.read_table(
    "csv/Dfdn-cln.csv",
    sep='^',
    dtype='object',
    index_col=False,
).sort_values(by=['BOFI_NBR', 'ADDR_1'])
dfdn_nodup = dfdn_cln.drop_duplicates(subset='BOFI_NBR')
print('%s rows deleted' % (len(dfdn_cln) - len(dfdn_nodup)))
# NOTE: only 20 observations are dropped here, versus 21 in the Stata file.

20 rows deleted


In [4]:
# Keep only the defendant id plus the descriptive columns that are carried
# into the final merge.
dfdn_nodup = dfdn_nodup.loc[:, [
    'BOFI_NBR',
    'NAME',
    'RESTRICTED_FLAG',
    'JUVENILE_FLAG',
    'CRIMINAL_FLAG',
    'FBI_NBR',
    'DOB',
    'SEX',
    'RACE',
]]

In [5]:
#*********************************************************
#* bring sentences in, add plea bargaining (?):
#*********************************************************
#merge dsum-cln with sent-cln on bofi_nbr dfdn_seq_nbr sys_nbr
#delete if not in dsum-cln but in sent-cln
'''
Note: IDs
- bofi_nbr: defendant id
- sys_nbr: case id , it is the same for multiple offenders crime and for different charges
- case_nbr: it is essentially similar to sys_nbr except in 17 cases (8 times because case_nbr is missing)
- chrg_seq_nbr: suppose to identify the charges. Frequently there is only the nb 2 or 3 for eg. Probably because the others are dismissed...
- dfdt_seq_nbr: identify the defendant among a case with several defendant.

- disp_seq_nbr: ? (1 all the time?)
- phonic_key: ?
'''

'\nNote: IDs\n- bofi_nbr: defendant id\n- sys_nbr: case id , it is the same for multiple offenders crime and for different charges\n- case_nbr: it is essentially similar to sys_nbr except in 17 cases (8 times because case_nbr is missing)\n- chrg_seq_nbr: suppose to identify the charges. Frequently there is only the nb 2 or 3 for eg. Probably because the others are dismissed...\n- dfdt_seq_nbr: identify the defendant among a case with several defendant.\n\n- disp_seq_nbr: ? (1 all the time?)\n- phonic_key: ?\n'

In [6]:
# Load the Dsum-cln table (all columns as strings) and keep only the join
# keys and the police-report / screening fields used downstream.
dsum_cln = pd.read_table(
    "csv/Dsum-cln.csv",
    sep='^',
    dtype='object',
    index_col=False,
).loc[:, [
    'ADA_CODE',
    'AREG_SEQ_NBR',
    'BOFI_NBR',
    'DFDN_SEQ_NBR',
    'SCREENING_DISP_CODE',
    'SYS_NBR',
    'POLICE_RPT_DATE',
    'POLICE_RPT_DAYS',
    'SCREENING_DAYS',
    'SCREENING_DISP_DATE',
]]

In [7]:
# Start the merged data from dsum_cln. No copy is made: data_merged is a
# second name bound to the same DataFrame object as dsum_cln. The arrest
# register merge further down rebuilds data_merged from dsum_cln directly.
data_merged = dsum_cln #start merged data with dsum_cln

In [8]:
#Changed cell below to raw nbconvert for first decision node

In [9]:
#*********************************************************
#* bring arrest date and crime category
#*********************************************************
#merge in [arrest_date charge_type charge_cat charge_class] from areg-cln
#delete if not in data_merged but in areg-cln
'''
label var charge_type "AR=at arrest, IF=by information, IN-by indictment"
label var charge_cat 
label var charge_class "severity of the charge from 1 sever to 8 lenient"
'''

'\nlabel var charge_type "AR=at arrest, IF=by information, IN-by indictment"\nlabel var charge_cat \nlabel var charge_class "severity of the charge from 1 sever to 8 lenient"\n'

In [10]:
# Load the Areg-cln table (all columns as strings) and keep the join keys
# together with the arrest, charge and detention fields.
areg_clean = pd.read_table(
    "csv/Areg-cln.csv",
    sep='^',
    dtype='object',
    index_col=False,
).loc[:, [
    'ADA_CODE',
    'AREG_SEQ_NBR',
    'ARREST_CREDIT_CODE',
    'ARREST_DATE',
    'BOFI_NBR',
    'SYS_NBR',
    'CHARGE_CAT',
    'CHARGE_CLASS',
    'CHARGE_TYPE',
    'DFDN_SEQ_NBR',
    'HABITUAL_OFFENDER_FLAG',
    'FINAL_DETENTION_FLAG',
    'INITIAL_DETENTION_FLAG',
    'LEAD_CHARGE_CODE',
]]

In [17]:
# Left-join the arrest register onto dsum_cln: every dsum_cln row is kept,
# and areg_clean rows without a match on all five keys are discarded.
data_merged = dsum_cln.merge(
    areg_clean,
    how='left',
    on=['BOFI_NBR', 'DFDN_SEQ_NBR', 'SYS_NBR', 'ADA_CODE', 'AREG_SEQ_NBR'],
)
print('%s rows' % data_merged.shape[0])

280294 rows


In [13]:
#Changed code below to rawnbconvert for the first decision node

In [14]:
#*********************************************************
#* bring SCREENER ada characteristics
#*********************************************************
#merge in ada_cln on ada_code
#delete if not in data_merged

In [15]:
# Load the Ada-cln table (all columns as strings) and keep the ADA code plus
# the attorney characteristics that get attached to each case row.
ada_cln = pd.read_table(
    "csv/Ada-cln.csv",
    sep='^',
    dtype='object',
    index_col=False,
).loc[:, [
    'ADA_CODE',
    'BAR_ADMISSION',
    'DOB',
    'RACE',
    'SEX',
    'PARTY',
    'ALT_PARTY',
]]

In [19]:
# Left-join the ADA characteristics on ADA_CODE; every data_merged row is
# kept whether or not its ADA code appears in ada_cln.
data_merged = data_merged.merge(ada_cln, how='left', on='ADA_CODE')
print('%s rows' % data_merged.shape[0])

280294 rows


In [20]:
# Rename the ADA-derived columns so it is clear they describe the screener
# ADA. Names listed here that are absent from data_merged are left alone by
# rename(), so the lists may cover more columns than were merged in.
old_names = [
    'ADA_CODE', 'ADA_NAME', 'BAR_ADMISSION', 'DOB',
    'RACE', 'SEX', 'PARTY', 'ALT_PARTY',
    'LAW_SCHOOL', 'UNDERGRAD', 'UNVERIFIED_FLAG', 'INACTIVITY_DATE',
    'DECEASED_FLAG', 'DISCIPLINARY_ACTION_FLAG', 'INEL_DUES_OR_RULES_FLAG',
    'ADA_NOTES',
]  # removed TRIAL_ADA_CODE

new_names = [
    'SADA_CODE', 'SADA_NAME', 'SADA_BAR_ADMISSION', 'SADA_DOB',
    'SADA_RACE', 'SADA_SEX', 'SADA_PARTY', 'SADA_ALT_PARTY',
    'SADA_LAW_SCHOOL', 'SADA_UNDERGRAD', 'SADA_UNVERIFIED_FLAG', 'SADA_INACTIVITY_DATE',
    'SADA_DECEASED_FLAG', 'SADA_DISCIPLINARY_ACTION_FLAG', 'SADA_INEL_DUES_OR_RULES_FLAG',
    'SADA_NOTES',
]

# Pair each old name with the new name at the same position.
name_dict = dict(zip(old_names, new_names))

data_merged = data_merged.rename(columns=name_dict)

In [None]:
#*********************************************************
#* bring TRIAL ada characteristics
#*********************************************************
#merge in ada_cln on ada_code
#delete if not in data_merged

'''TRIAL_ADA_CODE was renamed to ADA_CODE.  We are now merging on TRIAL_ADA_CODE'''

In [None]:
#Changed code below to rawnbconvert for the first decision node

In [None]:
#*********************************************************
#* bring information on defendant
#*********************************************************
#merge in dfdn_nodup on bofi_nbr
#Inner join

'''DELETED VARIABLES IN STATA CODE ARE NOT IN DATA_MERGED TABLE, COULD BE FROM 
   JUDGE CHARACTERISTICS TABLE?'''

In [21]:
# The Stata version dropped the defendant-detail columns listed below at this
# point. The call is left disabled here; the note preceding this cell records
# that those columns are not present in data_merged.
#data_merged = data_merged.drop(['DFDN_NAME', 'ADDR_1', 'ADDR_2', 'CITY', 'STATE', 'ZIP', \
#                                'CUSTODY_CODE', 'HOME_PHONE_NBR', 'BUSINESS_PHONE_NBR', \
#                                'DFDN_SSN', 'RESTRICTED_FLAG', 'JUVENILE_FLAG', \
#                                'CRIMINAL_FLAG', 'ALIAS_FLAG', 'DR_LIC_NBR', 'DR_LIC_STATE', \
#                                'DOB', 'DOB_PLACE', 'DFDN_HEIGHT_FEET', 'DFDN_HEIGHT_INCHES', \
#                                'DFDN_WEIGHT', 'DFDN_SEX', 'DFDN_RACE', 'DFDN_SKIN_COLOR', \
#                                'DFDN_EYE_COLOR', 'DFDN_HAIR_COLOR', 'FINGERPRINT_CLASS', \
#                                'DFDN_SMT', 'ADD_DATE', 'ADD_TIME', 'ADD_USER', 'CHANGE_DATE', \
#                                'CHANGE_TIME', 'CHANGE_USER', 'PHONIC_KEY', 'FBI_NBR', \
#                                'CCN_NBR', 'DFDN_DOB'], axis=1)

# Guard the join: dfdn_nodup is expected to hold one row per BOFI_NBR. If that
# stops being true, the inner join below would silently repeat case rows, so
# fail loudly instead.
assert dfdn_nodup['BOFI_NBR'].is_unique, \
    'dfdn_nodup must contain at most one row per BOFI_NBR before merging'

# Inner join: only case rows whose BOFI_NBR is found in dfdn_nodup are kept.
# NOTE(review): pandas matches missing join keys to each other, so a case row
# with a null BOFI_NBR is kept when dfdn_nodup also holds a null-key row. The
# recorded count() output (BOFI_NBR 280293 non-null of 280294 rows) suggests
# this happens once -- confirm that is intended.
data_simple = pd.merge(
    data_merged,
    dfdn_nodup,
    on='BOFI_NBR',
    how='inner',
)
print('%s rows' % (data_simple.shape[0]))



280294 rows


In [22]:
# Write the merged table to data_simple.csv in the current working directory.
# to_csv is called with its defaults, so the row index is written as well.
# NOTE(review): the frame carries defendant NAME and DOB columns -- confirm
# the output file is stored and shared appropriately.
data_simple.to_csv('data_simple.csv')

In [23]:
# Number of distinct defendants (BOFI_NBR) under each screening disposition code.
data_simple.groupby('SCREENING_DISP_CODE')['BOFI_NBR'].nunique()

SCREENING_DISP_CODE
017        1
030        2
050       14
060        3
120        1
130        2
140        3
150        1
160        9
230    65802
240    60397
250      142
260    27795
280        8
310        1
320        1
560        2
Name: BOFI_NBR, dtype: int64

In [39]:
# Total number of rows in the final merged table.
data_simple.shape[0]

280294

In [40]:
# Non-null value count for every column of data_simple; comparing each figure
# with the total row count shows how many values are missing per column.
data_simple.count()

SADA_CODE                 253057
AREG_SEQ_NBR              280294
BOFI_NBR                  280293
DFDN_SEQ_NBR              280294
SCREENING_DISP_CODE       244779
SYS_NBR                   280294
POLICE_RPT_DATE           280294
POLICE_RPT_DAYS           280294
SCREENING_DAYS            280294
SCREENING_DISP_DATE       280294
ARREST_CREDIT_CODE        146792
ARREST_DATE               150979
CHARGE_CAT                 50555
CHARGE_CLASS              148545
CHARGE_TYPE               150448
HABITUAL_OFFENDER_FLAG    105433
FINAL_DETENTION_FLAG      150979
INITIAL_DETENTION_FLAG    140062
LEAD_CHARGE_CODE          150453
SADA_BAR_ADMISSION        249407
SADA_DOB                  243862
SADA_RACE                 241393
SADA_SEX                  251302
SADA_PARTY                224099
SADA_ALT_PARTY               994
NAME                      280294
RESTRICTED_FLAG           280294
JUVENILE_FLAG             280294
CRIMINAL_FLAG             280294
FBI_NBR                   107471
DOB       