In [1]:
from copy import deepcopy
import pandas as pd
import numpy as np

In [2]:
# Relax pandas' display limits for this notebook: allow up to 1000 characters
# per cell and remove the cap on how many columns are rendered.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)

In [3]:
# Load the DataFrame stored in df_clean.pkl (path is relative to the notebook's
# working directory) -- presumably written by an earlier cleaning step; verify.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# produced by this project.
df = pd.read_pickle('../Pickles/df_clean.pkl')

In [4]:
# Show the column labels of the loaded frame before encoding.
df.columns

Index(['highest_charge', 'offense_title', 'sentence_phase', 'sentence_type',
       'age_at_incident', 'gender', 'race', 'current_sentence', 'num_charges',
       'trial_outcome', 'class_adj', 'sentence_years', 'years_until_charged',
       'case_years'],
      dtype='object')

In [5]:
# Build dummy (indicator) columns for highest_charge, sentence_phase, sentence_type,
# gender, race, current_sentence, trial_outcome and class_adj, then replace the
# original categorical columns in `df` with those indicators.
#
# NOTE(review): the positional `.columns = [...]` assignments below depend on the
# order in which get_dummies emits its columns. pandas raises if the number of
# categories changes, but a renamed/reordered category would be mislabelled
# silently -- confirm the labels still line up whenever the input data changes.

# Boolean feature: keep only the True indicator. It carries a temporary name so it
# cannot be confused with the source column until that column is dropped below.
highest_charge = (
    pd.get_dummies(df['highest_charge'])
    .drop(columns=[False])
    .rename(columns={True: 'highest_charge_true'})
)

sentence_phase = pd.get_dummies(df['sentence_phase'])
sentence_phase.columns = [
    'amended/corrected_sentence_phase',
    'original_sentence_phase',
    'probation_violation_sentence_phase',
    'remanded_sentence_phase',
    'resentenced_sentence_phase',
]

sentence_type = pd.get_dummies(df['sentence_type'])
sentence_type.columns = [
    'conditional_discharge_sentence',
    'conversion_sentence',
    'boot_camp_sentence',
    'death_sentence',
    'jail_sentence',
    'prison_sentence',
    'probation_sentence',
    'supervision_sentence',
]

# Binary feature: only the `male` indicator is kept.
gender = (
    pd.get_dummies(df['gender'])
    .rename(columns={'Male': 'male', 'Female': 'female'})
    .drop(columns=['female'])
)

# Every race level is kept (no level is dropped here).
race = pd.get_dummies(df['race'])
race.columns = ['asian', 'biracial', 'black', 'hispanic', 'unknown_race', 'white']

# Boolean feature: keep only the True indicator (temporary name, see highest_charge).
current_sentence = (
    pd.get_dummies(df['current_sentence'])
    .rename(columns={True: 'current_sentence_true', False: 'current_sentence_false'})
    .drop(columns=['current_sentence_false'])
)

# Binary feature: only the `found_guilty` indicator is kept.
trial_outcome = (
    pd.get_dummies(df['trial_outcome'])
    .rename(columns={'Plead Guilty': 'plead_guilty', 'Found Guilty': 'found_guilty'})
    .drop(columns=['plead_guilty'])
)

# Every class level is kept (no level is dropped here).
class_adj = pd.get_dummies(df['class_adj'])
class_adj.columns = ['m_class', 'other_class', 'x_class']

# Source columns that are superseded by the indicator columns built above.
encoded_source_columns = ['highest_charge', 'sentence_phase', 'sentence_type', 'gender',
                          'race', 'current_sentence', 'trial_outcome', 'class_adj']

# pd.concat already returns a new frame, so no extra copy is needed.
# reset_index(drop=True) discards the old index directly instead of materialising
# it as a column and dropping a hard-coded 'index' label, which would fail if the
# loaded frame's index had a name.
df = (
    pd.concat([df, highest_charge, sentence_phase, sentence_type,
               gender, race, current_sentence, trial_outcome, class_adj],
              axis=1)
    .drop(columns=encoded_source_columns)
    # With the source columns gone, the two boolean indicators take over their names.
    .rename(columns={'highest_charge_true': 'highest_charge',
                     'current_sentence_true': 'current_sentence'})
    .reset_index(drop=True)
)

In [6]:
# Report the number of occurrences of each dummy variable (sum of its indicator column).
dummy_columns = [
    'highest_charge',
    'amended/corrected_sentence_phase',
    'original_sentence_phase',
    'probation_violation_sentence_phase',
    'remanded_sentence_phase',
    'resentenced_sentence_phase',
    'conditional_discharge_sentence',
    'conversion_sentence',
    'boot_camp_sentence',
    'death_sentence',
    'jail_sentence',
    'prison_sentence',
    'probation_sentence',
    'supervision_sentence',
    'male',
    'asian',
    'biracial',
    'black',
    'hispanic',
    'unknown_race',
    'white',
    'current_sentence',
    'found_guilty',
    'm_class',
    'other_class',
    'x_class',
]

for dummy_name in dummy_columns:
    print(dummy_name, df[dummy_name].sum())

highest_charge 119347
amended/corrected_sentence_phase 1035
original_sentence_phase 142262
probation_violation_sentence_phase 4154
remanded_sentence_phase 88
resentenced_sentence_phase 1164
conditional_discharge_sentence 1627
conversion_sentence 25
boot_camp_sentence 1757
death_sentence 2
jail_sentence 4317
prison_sentence 79529
probation_sentence 60351
supervision_sentence 1095
male 130806
asian 878
biracial 24615
black 99427
hispanic 2575
unknown_race 285
white 20923
current_sentence 143486
found_guilty 9559
m_class 644
other_class 137874
x_class 10185


In [7]:
# Keep only the rows whose x_class indicator is set, as an independent copy.
is_x_class = df['x_class'] == 1
df = df.loc[is_x_class].copy()

In [8]:
# Write the resulting frame to df_modeling.pkl (path is relative to the
# notebook's working directory).
df.to_pickle('../Pickles/df_modeling.pkl')