In [1]:
# NOTE: This data merge requires that all source CSV files
# are in a folder named "source" that resides in the
# directory this script is run from. Individual CSV file names
# have not been changed from their version in the
# original data collection.

# The output of this script will be a dataset written to
# output/data_simple.csv (the "output" folder must already exist)
# that can be used for the prediction model.

import numpy as np
import pandas as pd

#############################################################
# Rename ADA attributes to distinguish them from defendants #
#############################################################

# The ADA table and the defendant table both use the column names below,
# so the ADA copies are given a "SADA_" prefix to tell them apart.
old_names = ['DOB', 'RACE', 'SEX']
new_names = ['SADA_' + column for column in old_names]

# Old-name -> new-name lookup, passed to DataFrame.rename(columns=...)
# once the ADA table has been merged in.
name_dict = {old: new for old, new in zip(old_names, new_names)}


##########################################
# Import and process relevant data files #
##########################################
# All four extracts are '^'-delimited. dtype='object' disables numeric type
# inference, and index_col=False stops pandas from using the first column
# as the row index.

# Defendant summary related to charges
dsum = pd.read_csv("source/Dsum-cln.csv", sep='^', dtype='object',
                   index_col=False)

# Arrest registry
areg = pd.read_csv("source/Areg-cln.csv", sep='^', dtype='object',
                   index_col=False)

# District Attorney information
ada = pd.read_csv("source/Ada-cln.csv", sep='^', dtype='object',
                  index_col=False)

# Defendant history: order the rows by BOFI_NBR and then ADDR_1, and keep
# only the first row found for each BOFI_NBR so that the table holds one
# record per BOFI_NBR.
dfdn = pd.read_csv("source/Dfdn-cln.csv", sep='^', dtype='object',
                   index_col=False)
dfdn = dfdn.sort_values(['BOFI_NBR', 'ADDR_1'])
dfdn = dfdn.drop_duplicates('BOFI_NBR')


#################################
# Select attributes of interest #
#################################

# Columns retained from each source table. Every name listed here must be
# present in the corresponding frame, otherwise the selection raises.
dsum_keep = [
    'ADA_CODE', 'BOFI_NBR', 'DFDN_SEQ_NBR',
    'SCREENING_DISP_CODE', 'SYS_NBR',
    'POLICE_RPT_DATE', 'POLICE_RPT_DAYS',
    'SCREENING_DAYS', 'SCREENING_DISP_DATE',
]
areg_keep = [
    'ARREST_CREDIT_CODE',
    'ARREST_DATE', 'ADD_DATE', 'BOFI_NBR',
    'SYS_NBR', 'CHARGE_CLASS', 'CHARGE_TYPE',
    'DFDN_SEQ_NBR', 'HABITUAL_OFFENDER_FLAG',
    'FINAL_DETENTION_FLAG', 'INITIAL_DETENTION_FLAG',
    'LEAD_CHARGE_CODE',
]
ada_keep = [
    'ADA_CODE', 'BAR_ADMISSION', 'DOB',
    'RACE', 'SEX', 'PARTY',
]
dfdn_keep = [
    'BOFI_NBR', 'JUVENILE_FLAG', 'CRIMINAL_FLAG',
    'FBI_NBR', 'DOB', 'SEX', 'RACE',
]

dsum_cln = dsum[dsum_keep]
areg_cln = areg[areg_keep]
ada_cln = ada[ada_keep]
dfdn_cln = dfdn[dfdn_keep]


######################################
# Merge all data into one data frame #
######################################

# Start from the defendant summary, attach the matching arrest-registry
# rows, then the ADA attributes. Both are left joins, so every summary row
# is kept even when no arrest or ADA record matches.
data_merged = (
    dsum_cln
    .merge(areg_cln, how='left',
           on=['BOFI_NBR', 'DFDN_SEQ_NBR', 'SYS_NBR'])
    .merge(ada_cln, how='left', on='ADA_CODE')
    # New ADA names applied here: at this point DOB/RACE/SEX can only have
    # come from the ADA table, so renaming the merged frame is unambiguous.
    .rename(columns=name_dict)
)

# Inner join: rows whose BOFI_NBR has no defendant-history record are dropped.
# NOTE(review): the null counts printed further down show one missing
# BOFI_NBR in the result, which suggests a null key on each side was
# matched by this join -- confirm whether that row should be kept.
data_simple = data_merged.merge(dfdn_cln, how='inner', on='BOFI_NBR')

###########################################
# Add ID column, sort, and export to file #
###########################################
# Sequential integer ID, one per merged row. The array is assigned directly
# rather than wrapped in a Series, so no index alignment takes place and
# every row receives an ID regardless of what the frame's index looks like.
data_simple['UNIQUE_ID'] = np.arange(len(data_simple))

# UNIQUE_ID first, every other column in alphabetical order. The ID column
# is placed explicitly instead of relying on it happening to sort last.
cols = ['UNIQUE_ID'] + sorted(
    col for col in data_simple.columns if col != 'UNIQUE_ID'
)

# DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0;
# .loc is the label-based replacement.
data_simple = data_simple.loc[:, cols]

# index=False keeps the row index out of the file; UNIQUE_ID already serves
# as the row identifier.
data_simple.to_csv('output/data_simple.csv', index=False)

In [2]:
# Number of missing values in each column of the merged data.
data_simple.isnull().sum(axis=0)

UNIQUE_ID                      0
ADA_CODE                   27237
ADD_DATE                       4
ARREST_CREDIT_CODE         10878
ARREST_DATE                    4
BAR_ADMISSION              30887
BOFI_NBR                       1
CHARGE_CLASS                4602
CHARGE_TYPE                  581
CRIMINAL_FLAG                  0
DFDN_SEQ_NBR                   0
DOB                            0
FBI_NBR                   172823
FINAL_DETENTION_FLAG           4
HABITUAL_OFFENDER_FLAG     60732
INITIAL_DETENTION_FLAG     15469
JUVENILE_FLAG                  0
LEAD_CHARGE_CODE             552
PARTY                      56195
POLICE_RPT_DATE                0
POLICE_RPT_DAYS                0
RACE                        4821
SADA_DOB                   36432
SADA_RACE                  38901
SADA_SEX                   28992
SCREENING_DAYS                 0
SCREENING_DISP_CODE        35515
SCREENING_DISP_DATE            0
SEX                         4775
SYS_NBR                        0
dtype: int

In [3]:
# Summary statistics for every column (include='all' covers the object
# columns too), transposed so that each column becomes one row.
data_simple.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
UNIQUE_ID,280294,,,,140146.0,80914.1,0.0,70073.2,140146.0,210220.0,280293.0
ADA_CODE,253057,302.0,TFAI,17094.0,,,,,,,
ADD_DATE,280290,4605.0,0,2027.0,,,,,,,
ARREST_CREDIT_CODE,269416,102.0,05,36905.0,,,,,,,
ARREST_DATE,280290,5577.0,0,8202.0,,,,,,,
BAR_ADMISSION,249407,87.0,19911011,29567.0,,,,,,,
BOFI_NBR,280293,124848.0,352166,165.0,,,,,,,
CHARGE_CLASS,275692,8.0,3,119568.0,,,,,,,
CHARGE_TYPE,279713,3.0,AR,155957.0,,,,,,,
CRIMINAL_FLAG,280294,2.0,Y,223882.0,,,,,,,
