### Data Analysis

Criminal Justice - Survey of Inmates

In [1]:
from __future__ import division

import pandas as pd
import numpy as np
import copy

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from patsy import dmatrices

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

# Show up to 100 columns when a DataFrame is displayed. The fully-qualified
# option name is used because abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_columns', 100)

In [2]:
# Read the survey extract into `data4`. The path is relative to the
# notebook's working directory; the handle is closed when the block exits.
# NOTE(review): presumably the cleaned Survey of Inmates file - confirm source.
with open('inmate_clean.csv') as datafile:
    data4 = pd.read_csv(datafile)

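> Consolidate HS degree data: an inmate is treated as having a degree if either the high-school diploma (`has_HSD`) or the GED (`has_GED`) flag is set.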

In [3]:
# Collapse the two credential columns into one 'degree' column holding the
# row-wise maximum (assuming 0/1 flags, this is 1 when either one is set).
credential_flags = data4[['has_HSD', 'has_GED']]
data4['degree'] = credential_flags.max(axis=1)

# Partition the rows: exactly 1 means "degree", anything else "no degree".
holds_degree = data4['degree'] == 1
degree = data4[holds_degree]
nondegree = data4[~holds_degree]

> Pre-process data

In [4]:
# Numeric columns that are rescaled before modelling.
numeric_cols = [
    'ntimes_arrested',
    'age_arrest_first',
    'ntimes_incarc_adult',
    'ntimes_incarc_juv',
    'highest_grade_attend',
    'monthly_income_prior',
    'age_crime_first',
    'age_drink_first',
    'drink_frequency',
]
data_num = data4[numeric_cols]

# Min-max scale every numeric column with the scaler's default settings.
min_max_scaler = MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(data_num)

# Wrap the scaled values in a DataFrame, re-attaching the original column labels.
preproc_data = pd.DataFrame(data_scaled, columns=data_num.columns)

In [5]:
# Label the rows of both frames with the 'unique_id' values so that later
# column copies between them align on the same labels.
for frame in (data4, preproc_data):
    frame.index = data4['unique_id']

> Combine pre-processed and categorical columns

In [6]:
# Columns copied across unscaled; includes the target ('violent') and
# 'crime_type', which are excluded again when the model formula is built.
cat_cols = [
    'violent', 'crime_type',
    'gender', 'race', 'marital_stat', 'is_military',
    'violent_first', 'probation_yn', 'degree', 'had_job',
    'public_assist_prior', 'family_makeup',
    'child_caretaker_welfare', 'child_caretaker_substabuse',
    'parents_servetime', 'family_servetime',
    'commit_crime_juv', 'forcible_sex', 'phys_abuse',
    'used_heroin', 'used_meth', 'used_barbiturates', 'used_crack',
    'used_cocaine', 'used_pcp', 'used_ecstacy', 'used_lsd', 'used_pot',
]

# Attach each one to the scaled frame (both frames share the same index).
for cat_col in cat_cols:
    preproc_data[cat_col] = data4[cat_col]

> Define functions for running various models

In [7]:
def get_scores(model_dict):
    """Fit every model on the training split and print its test-set metrics.

    model_dict maps a display name to an estimator exposing fit, predict and
    predict_proba. The function reads the notebook-level X_tr, y_tr, X_ts and
    y_ts, and stores each model's predictions and probabilities in the
    notebook-level dicts all_preds and all_proba under the model's name.
    Nothing is returned.
    """
    # (format string, metric function) pairs, in the order they are printed.
    report = (
        ('accuracy: %f', accuracy_score),
        ('precision: %f', precision_score),
        ('recall: %f', recall_score),
        ('f1 score: %f', f1_score),
    )
    for mname, m in model_dict.items():
        print("*** %s" % mname)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_ts)
        proba = m.predict_proba(X_ts)
        for line_fmt, metric in report:
            print(line_fmt % metric(y_ts, preds))
        print('\n')
        all_preds[mname] = preds
        all_proba[mname] = proba

In [8]:
def get_scores_coef(model_dict):
    """Fit each model, print its test-set metrics and its sorted coefficients.

    model_dict maps a display name to an estimator exposing fit, predict,
    predict_proba and a coef_ attribute whose first row holds one weight per
    column of X_tr. The function reads the notebook-level X_tr, y_tr, X_ts and
    y_ts, and stores each model's predictions and probabilities in the
    notebook-level dicts all_preds and all_proba under the model's name.

    Returns the list of (coefficient, column name) tuples, sorted ascending,
    for the last model processed, or None when model_dict is empty. For a
    single-entry dict this is that model's list.
    """
    coefs = None
    for mname, m in model_dict.items():
        print("*** %s" % mname)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_ts)
        proba = m.predict_proba(X_ts)
        print('accuracy: %f' % accuracy_score(y_ts, preds))
        print('precision: %f' % precision_score(y_ts, preds))
        print('recall: %f' % recall_score(y_ts, preds))
        print('f1 score: %f' % f1_score(y_ts, preds))
        print('\n')
        # Pair each weight with its column name; sorting puts the most
        # negative weights first.
        coefs = sorted(zip(m.coef_[0], X_tr.columns))
        for coef in coefs:
            print('%.05f \t%s' % (coef))
        all_preds[mname] = preds
        all_proba[mname] = proba
    # Return only after every model has been processed; returning inside the
    # loop would stop after the first entry and skip the rest.
    return coefs

In [9]:
def get_crossval_scores(X, y, model_dict):
    """Print mean cross-validation scores for every model in model_dict.

    X and y are passed straight to cross_val_score. For each model the
    accuracy, precision, recall and f1 scorers are run with the default fold
    settings and averaged, followed by one more run with the estimator's
    default scorer, reported as "cv score". Nothing is returned.
    """
    # (scoring name, output format) pairs, in evaluation and print order.
    scorers = (
        ('accuracy', 'accuracy: %f'),
        ('precision', 'precision: %f'),
        ('recall', 'recall: %f'),
        ('f1', 'f1 score: %f'),
    )
    print('CROSS VALIDATION SCORES')
    for mname, m in model_dict.items():
        print('\n*** %s' % mname)
        lines = []
        for scoring, line_fmt in scorers:
            mean_score = np.mean(cross_val_score(m, X, y, scoring=scoring))
            lines.append(line_fmt % mean_score)
        # NOTE(review): for classifiers the default scorer appears to be
        # accuracy, so this run likely duplicates the first one - confirm.
        print('cv score: %f' % np.mean(cross_val_score(m, X, y)))
        for line in lines:
            print(line)

> Create matrices, split into test and train, define models

In [10]:
# Predictors: every column except the target ('violent') and 'crime_type'.
columns = preproc_data.columns.difference(['violent', 'crime_type'])

# Patsy formula of the form "violent ~ col_a + col_b + ...".
formula = 'violent ~ %s' % ' + '.join(columns)

y, X = dmatrices(formula, data=preproc_data, return_type='dataframe')

# Drop the first design-matrix column (patsy's intercept term by default)
# and reduce y to its first column as a Series.
X, y = X.iloc[:, 1:], y.iloc[:, 0]

In [11]:
# Seed shared by the train/test split and every estimator that accepts one.
rs = 11

# Hold out 25% of the rows as the test split.
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size = 0.25, random_state=rs)

# Candidate classifiers keyed by display name. SVC is built with
# probability=True because the scoring helpers call predict_proba.
models = {
          'logistic': LogisticRegression(),
          'gauss naive bayes': GaussianNB(),
          'SVM': SVC(random_state=rs, probability=True),
          'decision tree': DecisionTreeClassifier(random_state=rs),
          'random forest': RandomForestClassifier(random_state=rs)
         }

> Run models with all features

In [12]:
# Fresh stores for per-model predictions and probabilities; get_scores
# fills them in, keyed by model name.
all_preds, all_proba = {}, {}

get_scores(models)

*** gauss naive bayes
accuracy: 0.615802
precision: 0.648533
recall: 0.759430
f1 score: 0.699614


*** SVM
accuracy: 0.657778
precision: 0.663399
recall: 0.850796
f1 score: 0.745501


*** random forest
accuracy: 0.626667
precision: 0.694222
recall: 0.654652
f1 score: 0.673857


*** logistic
accuracy: 0.662222
precision: 0.684287
recall: 0.792121
f1 score: 0.734266


*** decision tree
accuracy: 0.573333
precision: 0.639289
recall: 0.632858
f1 score: 0.636057




> Run SVM linear model and get coefficients

In [13]:
# Linear-kernel SVM, fitted through get_scores_coef so its per-feature
# weights are printed. gamma is not passed: it only parameterises the
# rbf/poly/sigmoid kernels and has no effect when kernel='linear'.
svc1 = get_scores_coef({
    'SVC linear': SVC(kernel='linear', random_state=rs, probability=True),
})

*** SVC linear
accuracy: 0.666667
precision: 0.682910
recall: 0.810562
f1 score: 0.741280


-3.63631 	age_arrest_first
-2.82375 	ntimes_arrested
-2.54133 	ntimes_incarc_adult
-2.08546 	age_drink_first
-1.17099 	age_crime_first
-0.88073 	race[T.race_unknown]
-0.73471 	marital_stat[T.separated]
-0.72184 	highest_grade_attend
-0.60721 	used_meth
-0.55426 	race[T.other]
-0.54149 	monthly_income_prior
-0.52216 	used_crack
-0.50295 	marital_stat[T.married]
-0.48208 	probation_yn
-0.45462 	family_makeup[T.friends]
-0.43165 	race[T.white]
-0.40341 	used_ecstacy
-0.36819 	race[T.black]
-0.35742 	marital_stat[T.never_marr]
-0.24266 	used_cocaine
-0.23259 	family_makeup[T.institution]
-0.21713 	race[T.native_am]
-0.20026 	family_makeup[T.family_unknown]
-0.07614 	family_makeup[T.mother]
-0.07356 	used_heroin
-0.06740 	ntimes_incarc_juv
-0.06496 	marital_stat[T.mar_unknown]
-0.06465 	used_pot
-0.00163 	commit_crime_juv
0.00963 	child_caretaker_substabuse
0.01053 	family_servetime
0.03443 	family_m

> Run cross-validation on all models

In [14]:
# Cross-validate on the full X / y design matrices (not the train split),
# using the same estimator objects held in `models`.
get_crossval_scores(X, y, models)

CROSS VALIDATION SCORES

*** gauss naive bayes
cv score: 0.621807
accuracy: 0.621807
precision: 0.646979
recall: 0.763810
f1 score: 0.699159

*** SVM
cv score: 0.643909
accuracy: 0.643909
precision: 0.646432
recall: 0.851039
f1 score: 0.733616

*** random forest
cv score: 0.611928
accuracy: 0.611928
precision: 0.668705
recall: 0.647445
f1 score: 0.657571

*** logistic
cv score: 0.657615
accuracy: 0.657615
precision: 0.673745
recall: 0.793605
f1 score: 0.727629

*** decision tree
cv score: 0.578960
accuracy: 0.578960
precision: 0.635171
recall: 0.633943
f1 score: 0.634228


> Remove features and rerun models

In [15]:
# Columns left out of the reduced model: the target, 'crime_type', and the
# features being removed for this rerun.
dropped = [
    'violent', 'crime_type',
    'race', 'family_makeup', 'ntimes_incarc_juv', 'commit_crime_juv',
    'family_servetime', 'marital_stat', 'child_caretaker_substabuse',
    'used_pot', 'used_heroin', 'used_barbiturates',
]
columns = preproc_data.columns.difference(dropped)

# Patsy formula of the form "violent ~ col_a + col_b + ...".
formula = 'violent ~ %s' % ' + '.join(columns)

y, X = dmatrices(formula, data=preproc_data, return_type='dataframe')

# Drop the first design-matrix column (patsy's intercept term by default)
# and reduce y to its first column as a Series.
X, y = X.iloc[:, 1:], y.iloc[:, 0]

# Same seed and test fraction as the full-feature run.
rs = 11

X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y, test_size=0.25, random_state=rs)

In [16]:
# Refit the same estimator objects in `models` on the reduced-feature split.
# all_preds / all_proba are not reset here, so entries stored under the same
# model names by the earlier run are overwritten.
get_scores(models)

*** gauss naive bayes
accuracy: 0.636543
precision: 0.664744
recall: 0.772842
f1 score: 0.714729


*** SVM
accuracy: 0.653333
precision: 0.658900
recall: 0.853311
f1 score: 0.743608


*** random forest
accuracy: 0.621728
precision: 0.692864
recall: 0.642917
f1 score: 0.666957


*** logistic
accuracy: 0.669630
precision: 0.689855
recall: 0.797988
f1 score: 0.739992


*** decision tree
accuracy: 0.594074
precision: 0.656540
recall: 0.652137
f1 score: 0.654331




In [17]:
# Linear-kernel SVM on the reduced feature set, fitted through
# get_scores_coef so its per-feature weights are printed. gamma is not
# passed: it only parameterises the rbf/poly/sigmoid kernels and has no
# effect when kernel='linear'.
coef = get_scores_coef({
    'SVC linear': SVC(kernel='linear', random_state=rs, probability=True),
})

*** SVC linear
accuracy: 0.663210
precision: 0.677062
recall: 0.818944
f1 score: 0.741275


-3.57321 	age_arrest_first
-3.01791 	ntimes_incarc_adult
-2.87206 	ntimes_arrested
-1.69921 	age_drink_first
-0.85247 	age_crime_first
-0.61916 	highest_grade_attend
-0.57000 	used_meth
-0.55675 	used_crack
-0.51508 	monthly_income_prior
-0.47091 	probation_yn
-0.44895 	used_ecstacy
-0.24125 	used_cocaine
0.04495 	parents_servetime
0.04612 	child_caretaker_welfare
0.10883 	public_assist_prior
0.12037 	had_job
0.20599 	used_lsd
0.21434 	used_pcp
0.22713 	degree
0.26300 	drink_frequency
0.35507 	violent_first
0.40362 	phys_abuse
0.41518 	forcible_sex
0.45979 	is_military
0.80428 	gender[T.male]


In [18]:
# Logistic regression on the current (reduced-feature) split; coef2 receives
# the sorted (coefficient, column name) list returned by get_scores_coef.
logit_only = {'logistic': LogisticRegression()}
coef2 = get_scores_coef(logit_only)

*** logistic
accuracy: 0.669630
precision: 0.689855
recall: 0.797988
f1 score: 0.739992


-2.74764 	age_arrest_first
-2.26130 	ntimes_arrested
-1.68176 	age_drink_first
-1.62440 	ntimes_incarc_adult
-0.86762 	age_crime_first
-0.61393 	highest_grade_attend
-0.52084 	monthly_income_prior
-0.51422 	probation_yn
-0.50337 	used_meth
-0.43266 	used_crack
-0.39191 	used_ecstacy
-0.28496 	used_cocaine
0.08997 	child_caretaker_welfare
0.09810 	had_job
0.10165 	parents_servetime
0.12077 	public_assist_prior
0.17942 	used_pcp
0.22626 	used_lsd
0.25575 	degree
0.32188 	violent_first
0.33671 	drink_frequency
0.41144 	phys_abuse
0.48504 	forcible_sex
0.53612 	is_military
0.75184 	gender[T.male]


> Examine inmates with / without HS degree

In [19]:
# Mean of the consolidated 'degree' column; assuming 0/1 flags, this is the
# share of inmates holding a diploma or GED.
data4['degree'].mean()

0.5792073095443881

In [20]:
# Rebuild the 'degree' column as the row-wise maximum of the two credential
# columns (assuming 0/1 flags, this is 1 when either one is set).
credential_flags = data4[['has_HSD', 'has_GED']]
data4['degree'] = credential_flags.max(axis=1)

# Partition the rows: exactly 1 means "degree", anything else "no degree".
holds_degree = data4['degree'] == 1
degree = data4[holds_degree]
nondegree = data4[~holds_degree]

In [21]:
# Crime-type frequencies: first row for degree holders, second for the rest.
group_counts = [grp.crime_type.value_counts() for grp in (degree, nondegree)]
countsd = pd.DataFrame(group_counts)

# NOTE(review): these labels are assigned purely by position and replace
# whatever labels value_counts produced - verify that the column order
# really is violent, drug, property, public_order, other.
countsd.columns = ['violent', 'drug', 'property', 'public_order', 'other']

# Share of all inmates that falls under each crime type.
for col in countsd.columns:
    print('%s: %f' % (col, countsd[col].sum() / len(data4)))

violent: 0.576121
drug: 0.193851
property: 0.153599
public_order: 0.060131
other: 0.016298


In [22]:
# Per crime type: its share of all inmates (share_crime) and the fraction of
# those inmates who hold a degree (share_degree).
share_crime = []
share_degree = []

for col in countsd.columns:
    # A crime type absent from one group appears as NaN in countsd; count it
    # as zero so the degree share stays defined.
    counts = countsd[col].fillna(0)
    # Rows are addressed by position (0 = degree, 1 = no degree) via .iloc,
    # because the row labels of countsd are not integers.
    n_degree = counts.iloc[0]
    n_nondegree = counts.iloc[1]
    c = counts.sum() / len(data4)
    d = n_degree / (n_degree + n_nondegree)
    share_crime.append(c)
    share_degree.append(d)

In [23]:
# One row per crime type, with 'h' = share of all inmates and
# 'w' = share of that crime type's inmates who hold a degree.
hw_df = pd.DataFrame(
    {'h': share_crime, 'w': share_degree},
    index=countsd.columns,
    columns=['h', 'w'],
)

In [24]:
# Write the h / w table to disk, relative to the working directory.
# NOTE(review): index=False drops the row labels (the crime types), so the
# file contains only the 'h' and 'w' columns - confirm this is intended.
hw_df.to_csv('tree_mappy.csv', index=False)