In [57]:
import pandas as pd
import numpy as np

# Load the merged dataset from CSV (ISO-8859-1 encoded); the first CSV column
# becomes the row index. low_memory=False makes pandas parse each column in a
# single pass instead of in chunks.
merged_data = pd.read_csv('data_simple.csv', encoding = "ISO-8859-1", low_memory=False, index_col=0)

In [4]:
# List the column names available in the merged dataset.
merged_data.columns

Index(['ADA_CODE', 'ADD_DATE', 'ARREST_CREDIT_CODE', 'ARREST_DATE',
       'BAR_ADMISSION', 'BOFI_NBR', 'CHARGE_CLASS', 'CHARGE_TYPE',
       'CRIMINAL_FLAG', 'DFDN_SEQ_NBR', 'DOB', 'FBI_NBR',
       'FINAL_DETENTION_FLAG', 'HABITUAL_OFFENDER_FLAG',
       'INITIAL_DETENTION_FLAG', 'JUVENILE_FLAG', 'LEAD_CHARGE_CODE', 'PARTY',
       'POLICE_RPT_DATE', 'POLICE_RPT_DAYS', 'RACE', 'SADA_DOB', 'SADA_RACE',
       'SADA_SEX', 'SCREENING_DAYS', 'SCREENING_DISP_CODE',
       'SCREENING_DISP_DATE', 'SEX', 'SYS_NBR'],
      dtype='object')

In [5]:
# Names of the categorical columns that are cleaned and encoded below.
categorical = [
    'ARREST_CREDIT_CODE',
    'CHARGE_CLASS',
    'CHARGE_TYPE',
    'LEAD_CHARGE_CODE',
    'PARTY',
    'RACE',
    'SADA_RACE',
]

In [6]:
# Frequency table (value -> number of rows) for every categorical column,
# keyed by column name.
dict_counts = {
    columnname: merged_data[columnname].value_counts()
    for columnname in categorical
}

In [173]:
# Number of distinct ARREST_CREDIT_CODE values; dropna=False counts the missing
# value as one of them, exactly as len(Series.unique()) does.
merged_data['ARREST_CREDIT_CODE'].nunique(dropna=False)

103

In [7]:
# Count the missing values in each categorical column.
merged_data.loc[:, categorical].isna().sum()

ARREST_CREDIT_CODE    10878
CHARGE_CLASS           4602
CHARGE_TYPE             581
LEAD_CHARGE_CODE        552
PARTY                 56195
RACE                   4821
SADA_RACE             38901
dtype: int64

In [60]:
# Replace missing categories with the literal string 'NA' so that one-hot
# encoding later produces an explicit dummy column for "missing".
merged_cat_filled = merged_data.loc[:, categorical].fillna('NA')

# Sanity check: every column should now report zero nulls.
merged_cat_filled.isna().sum()

ARREST_CREDIT_CODE    0
CHARGE_CLASS          0
CHARGE_TYPE           0
LEAD_CHARGE_CODE      0
PARTY                 0
RACE                  0
SADA_RACE             0
dtype: int64

### Remove invalid values based on information in codebook

In [10]:
# Lead charge codes treated as invalid (per the codebook); they are replaced
# with 'NA' further down.
invalid_charge_codes = [
    '40:(979)296', '14:(24)30(', '14:(24)67(',
    '5:606', '13:34', 'F5:257',
    '14:(26)67(', '40:(979)1967', '4:664',
]

In [11]:
# Show the rows whose lead charge code is one of the invalid codes.
merged_cat_filled.loc[merged_cat_filled['LEAD_CHARGE_CODE'].isin(invalid_charge_codes)]

Unnamed: 0_level_0,ARREST_CREDIT_CODE,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE,PARTY,RACE,SADA_RACE
UNIQUE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5207,,6.0,,5:606,D,,W
163070,05,,,40:(979)1967,D,N,W
196149,AL,4.0,AR,F5:257,R,,W
202726,05,,AR,13:34,D,N,B
203134,18,,AR,4:664,R,O,W
221631,56,,AR,40:(979)296,D,N,B
241333,07,,AR,14:(24)30(,D,W,W
241335,07,,AR,14:(24)30(,D,W,W
242808,22,2.0,AR,40:(979)296,D,W,T
248777,08,,AR,14:(24)67(,R,W,W


In [12]:
# Blank out the invalid lead charge codes by mapping them to the 'NA' placeholder.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is chained
# assignment, which pandas 2.x warns about and which does not update the frame
# once Copy-on-Write is enabled.
merged_cat_filled['LEAD_CHARGE_CODE'] = (
    merged_cat_filled['LEAD_CHARGE_CODE'].replace(invalid_charge_codes, 'NA')
)

In [13]:
# Recode RACE: 'N' -> 'B' and 'O' -> 'A'.
#the codebook says that the value A should not be accepted, but this refers to "asian" elsewhere
#so I'm hesitant to throw out the A's
# The two recodes are independent (neither target is itself a source value), so
# a single mapping gives the same result as two sequential replaces.
# The result is assigned back rather than using replace(..., inplace=True) on
# the selected column, which is chained assignment and does not update the
# frame once pandas Copy-on-Write is enabled.
merged_cat_filled['RACE'] = merged_cat_filled['RACE'].replace({'N': 'B', 'O': 'A'})

In [14]:
# Treat the arrest credit code '00' as missing by mapping it to 'NA'.
# Assigned back to the column instead of replace(..., inplace=True) on the
# selected column, which is chained assignment and does not update the frame
# once pandas Copy-on-Write is enabled.
merged_cat_filled['ARREST_CREDIT_CODE'] = (
    merged_cat_filled['ARREST_CREDIT_CODE'].replace('00', 'NA')
)

### Import table Code to verify interpretation of each category
#### ARCDCD in Code table: Arrest credit code
#### CGCD table: Lead charge code
#### CSCLCD in Code table: Charge class

#### Lead Charge Code

In [16]:
# Load the lead-charge-code lookup table from its Stata file and write a CSV
# copy of it next to the notebook.
CGCD = pd.read_stata("cgcd-cln.dta")
CGCD.to_csv('cgcd-cln.csv')

In [17]:
# Preview the first rows of the charge-code lookup table.
CGCD.head()

Unnamed: 0,charge_code,charge_cat,charge_class,charge_desc,expiration_date
0,CJP13(13),,6,IN NEED OF SUPERVISION,0
1,CJP13(14),,6,IN NEED OF CARE,0
2,06:668,GEN,4,OBTN LOAN/CRED. UNION BY FRAUD,0
3,12:315,GEN,4,MAIL FRAUD,0
4,12:501,GEN,4,CHARITY FAILURE TO FILE,0


In [18]:
# Load the generic code lookup table (rows are distinguished by code_type)
# from its Stata file.
Code = pd.read_stata("code-cln.dta")

# Keep a CSV copy of the same table.
Code.to_csv('code-cln.csv')

In [19]:
# Preview the first rows of the generic code lookup table.
Code.head()

Unnamed: 0,code_type,code_code,short_desc,long_desc,add_date,add_time,add_user,change_date,change_time,change_user
0,ASDACD,ADOU,"DOUGLAS, A","DOUGLAS, ANDREW",0,0,,0,0,
1,ASDACD,AGOT,GOTTLIEB,"GOTTLIEB, ADRIAN",0,0,,0,0,
2,ASDACD,AGRE,"GREEN, A","GREEN, ALAN",0,0,,0,0,
3,ASDACD,ALAC,,"LACOUR, ADRIENNE",0,0,,0,0,
4,ASDACD,ALAM,"LAMBERT, A","LAMBERT, ANNE",0,0,,0,0,


In [24]:
# Attach the lookup table's charge_code, charge_class and charge_desc to every
# row by matching LEAD_CHARGE_CODE against CGCD.charge_code. It is a left join,
# so rows with no match keep nulls in the three added columns.
# NOTE(review): a column-on-column merge returns a fresh 0..n-1 index, so the
# original row index is not carried over. Re-running this cell on the merged
# frame would add suffixed duplicate columns.
merged_cat_filled = merged_cat_filled.merge(
    CGCD[['charge_code', 'charge_class', 'charge_desc']],
    how='left',
    left_on='LEAD_CHARGE_CODE',
    right_on='charge_code',
)

In [188]:
# Number of distinct lead charge codes that found no match in the lookup table.
merged_cat_filled.loc[merged_cat_filled['charge_code'].isna(), 'LEAD_CHARGE_CODE'].nunique(dropna=False)

93

In [26]:
# Number of rows whose lead charge code found no match in the lookup table.
int(merged_cat_filled['charge_code'].isna().sum())

766

In [189]:
# Number of distinct lead charge codes that did match the lookup table.
merged_cat_filled.loc[merged_cat_filled['charge_code'].notna(), 'LEAD_CHARGE_CODE'].nunique(dropna=False)

846

766 rows, covering 93 unique lead charge codes, have no match in the lookup table.
Excluding the 564 rows whose lead charge code is null ('NA') leaves 202 rows and 92 unique values.

In [28]:
# Distinct lead charge codes that have no counterpart in the lookup table
# (charge_code is null after the left join).
unmatched = merged_cat_filled.loc[
    merged_cat_filled['charge_code'].isna(), 'LEAD_CHARGE_CODE'
].unique()

In [42]:
#Find most common unmatched values
# Count rows per unmatched lead charge code and list the most frequent first.
# DataFrame.sort was deprecated and then removed from pandas; sort_values is
# the supported replacement (ascending=False matches the former ascending=0).
(
    merged_cat_filled[merged_cat_filled['LEAD_CHARGE_CODE'].isin(unmatched)]
    .groupby(['LEAD_CHARGE_CODE'])
    .count()
    .sort_values('PARTY', ascending=False)
)

  if __name__ == '__main__':


Unnamed: 0_level_0,ARREST_CREDIT_CODE,CHARGE_CLASS,CHARGE_TYPE,PARTY,RACE,SADA_RACE,charge_code,charge_class,charge_desc
LEAD_CHARGE_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,564,564,564,564,564,564,0,0,0
14:95 (A) (3,25,25,25,25,25,25,0,0,0
7:730.3,24,24,24,24,24,24,0,0,0
7:730.1,12,12,12,12,12,12,0,0,0
40:971.B,10,10,10,10,10,10,0,0,0
14:92 (A) (9,9,9,9,9,9,9,0,0,0
40:967 (C) (,7,7,7,7,7,7,0,0,0
7:730.6,5,5,5,5,5,5,0,0,0
14:(27) 110,4,4,4,4,4,4,0,0,0
40:966 (D) (,4,4,4,4,4,4,0,0,0


In [45]:
# The unmatched values can often be traced to their intended code (here the
# unmatched '14:95 (A) (3' is the lookup entry below minus its closing
# parenthesis), but this is not necessarily worth doing for so few rows.
CGCD.loc[CGCD['charge_code'] == '14:95 (A) (3)']

Unnamed: 0,charge_code,charge_cat,charge_class,charge_desc,expiration_date
530,14:95 (A) (3),,4,POSSESION OF BURGLARY TOOLS,0


In [47]:
# Show every lookup entry whose charge code contains the text '7:730'.
# regex=False matches the text literally instead of interpreting it as a
# regular expression; na=False treats a missing charge_code as "no match" so
# the boolean mask never contains NaN (which would make the indexing fail).
CGCD[CGCD['charge_code'].str.contains('7:730', regex=False, na=False)]

Unnamed: 0,charge_code,charge_cat,charge_class,charge_desc,expiration_date
932,7:730 (1),FINS,6,FAMILY IN NEED OF SERVICES/TRUANCY,0
933,7:730 (2),FINS,6,FAMILY IN NEED OF SERVICES/UNGOVERNABLE,0
934,7:730 (3),FINS,6,FAMILY IN NEED OF SERVICES/RUNAWAY,0
984,7:730 (4),FINS,6,FAMILY IN NEED OF SERVICES/ALCOHOL RELATED,0
985,7:730 (5),FINS,6,FAMILY IN NEED OF SERVICES/CHILD OFFENSES,0
986,7:730 (6),FINS,6,FAMILY IN NEED OF SERVICES/CHILD UNDER 10,0
987,7:730 (7),FINS,6,FAMILY IN NEED OF SERVICES/CARETAKER OFFENSES,0
1098,7:730,JUV,6,FAMILY IN NEED OF SERVICES,0


In [51]:
#Change Lead Charge Code to NA if not found in CGCD
# Series.mask overwrites the values where the condition is True and leaves all
# other values untouched.
merged_cat_filled['LEAD_CHARGE_CODE'] = merged_cat_filled['LEAD_CHARGE_CODE'].mask(
    merged_cat_filled['LEAD_CHARGE_CODE'].isin(unmatched), 'NA'
)

#### Instances where Charge Class of Lead Charge Code Does Not Match Charge Class from AREG table

In [53]:
# Keep only rows where both charge classes are present: CHARGE_CLASS is not the
# 'NA' placeholder, and the lookup table's charge_class is neither null nor an
# empty string.
non_null_cc = merged_cat_filled[
    merged_cat_filled['CHARGE_CLASS'].ne('NA')
    & merged_cat_filled['charge_class'].notna()
    & merged_cat_filled['charge_class'].ne('')
]

In [54]:
# Rows where the two charge classes disagree. CHARGE_CLASS is cast to int and
# then str so that a value such as 3.0 compares as '3' against the lookup
# table's charge_class.
non_null_cc[
    non_null_cc['CHARGE_CLASS'].astype(int).astype(str).ne(
        non_null_cc['charge_class'].astype(str)
    )
]

Unnamed: 0,ARREST_CREDIT_CODE,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE,PARTY,RACE,SADA_RACE,charge_code,charge_class,charge_desc
26,72,2,AR,40:967 (C) (2),R,B,W,40:967 (C) (2),3,POSS SCHEDULE 2 OTHER
72,72,3,AR,14:110 (A),R,B,W,14:110 (A),4,SIMPLE ESCAPE
197,31,3,AR,14:108,R,B,W,14:108,4,RESISTING AN OFFICER
242,05,2,AR,40:967 (C) (2),R,B,W,40:967 (C) (2),3,POSS SCHEDULE 2 OTHER
323,94,4,IF,40:969(C),,,,40:969(C),3,POSS SCHEDULE 4
372,,3,IF,14:110 (A),,B,,14:110 (A),4,SIMPLE ESCAPE
373,06,2,AR,40:967 (C) (2),R,B,W,40:967 (C) (2),3,POSS SCHEDULE 2 OTHER
701,22,2,AR,40:967 (C) (2),,B,,40:967 (C) (2),3,POSS SCHEDULE 2 OTHER
713,01,4,IF,40:969(C),,,,40:969(C),3,POSS SCHEDULE 4
723,22,4,AR,40:969(C),,W,W,40:969(C),3,POSS SCHEDULE 4


There are 3,017 instances where the charge class associated with the lead_charge_code in the lookup table differs from the charge class in the AREG table.

#### Arrest Credit Code

In [21]:
# Rows of the generic code table that describe arrest credit codes.
ARCDCD = Code[Code['code_type'].eq('ARCDCD')]

In [224]:
# Attach the arrest credit code's lookup entry (code_code, long_desc) to every
# row. It is a left join, so codes missing from ARCDCD keep nulls in the two
# added columns.
merged_cat_filled = merged_cat_filled.merge(
    ARCDCD[['code_code', 'long_desc']],
    how='left',
    left_on='ARREST_CREDIT_CODE',
    right_on='code_code',
)

In [226]:
# Distinct arrest credit codes that found no match in the ARCDCD lookup.
merged_cat_filled.loc[merged_cat_filled['code_code'].isna(), 'ARREST_CREDIT_CODE'].unique()

array(['NA'], dtype=object)

#### The only arrest credit code that doesn't align with the lookup table is "NA", so there don't appear to be any further data quality issues with this column

#### Charge Class (The Charge Class on AREG table)

In [237]:
# Rows of the generic code table that describe charge classes.
CSCLCD = Code[Code['code_type'].eq('CSCLCD')]

In [238]:
# Display the charge-class lookup rows.
CSCLCD

Unnamed: 0,code_type,code_code,short_desc,long_desc,add_date,add_time,add_user,change_date,change_time,change_user
884,CSCLCD,1.0,CLASS 1,CAPITAL,19920802,12000000,MAP,19920802,12000000,MAP
885,CSCLCD,2.0,CLASS 2,"FELONY, HARD LABOR",19920802,12000000,MAP,19920802,12000000,MAP
886,CSCLCD,3.0,CLASS 3,"FELONY, OPTIONAL",19920802,12000000,MAP,19920802,12000000,MAP
887,CSCLCD,4.0,CLASS 4,MISDEMEANOR,19920802,12000000,MAP,19920802,12000000,MAP
903,CSCLCD,,,CASE CLASS CODES,0,0,,0,0,
1658,CSCLCD,5.0,CLASS 5,ASSET FORFEITURE,19931229,12000000,RBW,19960702,15000000,DPS
1659,CSCLCD,6.0,CLASS 6,JUVENILE STATUS,19931229,12000000,RBW,19960701,15000000,DPS
1664,CSCLCD,7.0,CLASS 7,POLICE STATUS,19940301,12000000,RBW,19960702,15000000,DPS
2116,CSCLCD,8.0,CLASS 8,MULTIPLE CLASS CHARGES IN CASE,1141998,0,SMR,0,0,


In [239]:
# Distinct CHARGE_CLASS values present in the data, for comparison against the
# CSCLCD lookup rows.
merged_cat_filled['CHARGE_CLASS'].unique()

array([3.0, 4.0, 2.0, 'NA', 1.0, 6.0, 8.0, 5.0, 7.0], dtype=object)

#### All the values in the merged file could be mapped to something in CSCLCD, so no further data quality issues with this column

### One Hot Encoding

In [58]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [59]:
# Create features using one hot encoding.

# Columns to encode, in the order they occupy in the integer matrix X.
encode_columns = ['CHARGE_CLASS', 'CHARGE_TYPE', 'LEAD_CHARGE_CODE', 'PARTY',
                  'RACE', 'SADA_RACE', 'ARREST_CREDIT_CODE']

# Step 1: convert each column's categories into integer codes.
# A separate LabelEncoder is fitted and kept for every column. Re-fitting one
# shared encoder would discard the classes learned for all but the last
# column, making it impossible to map the integer codes back to categories.
label_encoders = {}
encoded_columns = []
for columnname in encode_columns:
    l_enc = LabelEncoder()
    # Cast to str so every column has one uniform, sortable type; CHARGE_CLASS
    # in particular mixes floats with the 'NA' placeholder.
    encoded_columns.append(l_enc.fit_transform(merged_cat_filled[columnname].astype(str)))
    label_encoders[columnname] = l_enc

# Keep the individual code arrays available under their previous names.
# After the loop, l_enc is the encoder fitted on ARREST_CREDIT_CODE.
col1, col2, col3, col4, col5, col6, col7 = encoded_columns

X = np.column_stack(encoded_columns)

# Step 2: one hot encoding of the integer codes.
enc = OneHotEncoder()
X_enc = enc.fit_transform(X)