# Recidivism Project - Data Cleanup and  Machine Learning 
# Using Logistic Regression

In [1]:
import pandas as pd
import csv

In [2]:
filepath = "3-Year_Recidivism_for_Offenders_Released_from_Prison_in_Iowa.csv"
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Main Supervising District,Release Type,Race - Ethnicity,Age At Release,Sex,Offense Classification,Offense Type,Offense Subtype,Return to Prison,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population
0,2010,2013,7JD,Parole,Black - Non-Hispanic,25-34,Male,C Felony,Violent,Robbery,Yes,433.0,New,C Felony,Drug,Trafficking,Yes
1,2010,2013,,Discharged – End of Sentence,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,Yes,453.0,Tech,,,,No
2,2010,2013,5JD,Parole,White - Non-Hispanic,35-44,Male,B Felony,Drug,Trafficking,Yes,832.0,Tech,,,,Yes
3,2010,2013,6JD,Parole,White - Non-Hispanic,25-34,Male,B Felony,Other,Other Criminal,No,,No Recidivism,,,,Yes
4,2010,2013,,Discharged – End of Sentence,Black - Non-Hispanic,35-44,Male,D Felony,Violent,Assault,Yes,116.0,Tech,,,,No


In [3]:
df.shape

(26020, 17)

In [4]:
df.dtypes

Fiscal Year Released            int64
Recidivism Reporting Year       int64
Main Supervising District      object
Release Type                   object
Race - Ethnicity               object
Age At Release                 object
Sex                            object
Offense Classification         object
Offense Type                   object
Offense Subtype                object
Return to Prison               object
Days to Return                float64
Recidivism Type                object
New Offense Classification     object
New Offense Type               object
New Offense Sub Type           object
Target Population              object
dtype: object

In [5]:
# Split race and ethnicity into 2 separate columns

df[['Race', 'Ethnicity']] = df['Race - Ethnicity'].str.split("-",n=1,expand=True)
df.head()

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Main Supervising District,Release Type,Race - Ethnicity,Age At Release,Sex,Offense Classification,Offense Type,Offense Subtype,Return to Prison,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population,Race,Ethnicity
0,2010,2013,7JD,Parole,Black - Non-Hispanic,25-34,Male,C Felony,Violent,Robbery,Yes,433.0,New,C Felony,Drug,Trafficking,Yes,Black,Non-Hispanic
1,2010,2013,,Discharged – End of Sentence,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,Yes,453.0,Tech,,,,No,White,Non-Hispanic
2,2010,2013,5JD,Parole,White - Non-Hispanic,35-44,Male,B Felony,Drug,Trafficking,Yes,832.0,Tech,,,,Yes,White,Non-Hispanic
3,2010,2013,6JD,Parole,White - Non-Hispanic,25-34,Male,B Felony,Other,Other Criminal,No,,No Recidivism,,,,Yes,White,Non-Hispanic
4,2010,2013,,Discharged – End of Sentence,Black - Non-Hispanic,35-44,Male,D Felony,Violent,Assault,Yes,116.0,Tech,,,,No,Black,Non-Hispanic


In [6]:
# Remove spaces from column names

df.columns = df.columns.str.strip()

In [7]:
# drop null values from specific columns

df.dropna(subset=['Sex', 'Race', 'Ethnicity'])

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Main Supervising District,Release Type,Race - Ethnicity,Age At Release,Sex,Offense Classification,Offense Type,Offense Subtype,Return to Prison,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population,Race,Ethnicity
0,2010,2013,7JD,Parole,Black - Non-Hispanic,25-34,Male,C Felony,Violent,Robbery,Yes,433.0,New,C Felony,Drug,Trafficking,Yes,Black,Non-Hispanic
1,2010,2013,,Discharged – End of Sentence,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,Yes,453.0,Tech,,,,No,White,Non-Hispanic
2,2010,2013,5JD,Parole,White - Non-Hispanic,35-44,Male,B Felony,Drug,Trafficking,Yes,832.0,Tech,,,,Yes,White,Non-Hispanic
3,2010,2013,6JD,Parole,White - Non-Hispanic,25-34,Male,B Felony,Other,Other Criminal,No,,No Recidivism,,,,Yes,White,Non-Hispanic
4,2010,2013,,Discharged – End of Sentence,Black - Non-Hispanic,35-44,Male,D Felony,Violent,Assault,Yes,116.0,Tech,,,,No,Black,Non-Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26015,2015,2018,,Paroled to Detainer - INS,White - Hispanic,Under 25,Male,C Felony,Violent,Assault,No,,No Recidivism,,,,Yes,White,Hispanic
26016,2015,2018,6JD,Released to Special Sentence,White - Non-Hispanic,35-44,Male,C Felony,Violent,Sex,No,,No Recidivism,,,,No,White,Non-Hispanic
26017,2015,2018,5JD,Parole Granted,White - Non-Hispanic,25-34,Female,Aggravated Misdemeanor,Public Order,Traffic,No,,No Recidivism,,,,No,White,Non-Hispanic
26018,2015,2018,5JD,Paroled w/Immediate Discharge,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,No,,No Recidivism,,,,Yes,White,Non-Hispanic


In [8]:
# Replace null values 

df['Main Supervising District'].fillna("NSD", inplace=True)
df

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Main Supervising District,Release Type,Race - Ethnicity,Age At Release,Sex,Offense Classification,Offense Type,Offense Subtype,Return to Prison,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population,Race,Ethnicity
0,2010,2013,7JD,Parole,Black - Non-Hispanic,25-34,Male,C Felony,Violent,Robbery,Yes,433.0,New,C Felony,Drug,Trafficking,Yes,Black,Non-Hispanic
1,2010,2013,NSD,Discharged – End of Sentence,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,Yes,453.0,Tech,,,,No,White,Non-Hispanic
2,2010,2013,5JD,Parole,White - Non-Hispanic,35-44,Male,B Felony,Drug,Trafficking,Yes,832.0,Tech,,,,Yes,White,Non-Hispanic
3,2010,2013,6JD,Parole,White - Non-Hispanic,25-34,Male,B Felony,Other,Other Criminal,No,,No Recidivism,,,,Yes,White,Non-Hispanic
4,2010,2013,NSD,Discharged – End of Sentence,Black - Non-Hispanic,35-44,Male,D Felony,Violent,Assault,Yes,116.0,Tech,,,,No,Black,Non-Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26015,2015,2018,NSD,Paroled to Detainer - INS,White - Hispanic,Under 25,Male,C Felony,Violent,Assault,No,,No Recidivism,,,,Yes,White,Hispanic
26016,2015,2018,6JD,Released to Special Sentence,White - Non-Hispanic,35-44,Male,C Felony,Violent,Sex,No,,No Recidivism,,,,No,White,Non-Hispanic
26017,2015,2018,5JD,Parole Granted,White - Non-Hispanic,25-34,Female,Aggravated Misdemeanor,Public Order,Traffic,No,,No Recidivism,,,,No,White,Non-Hispanic
26018,2015,2018,5JD,Paroled w/Immediate Discharge,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,No,,No Recidivism,,,,Yes,White,Non-Hispanic


In [None]:
#save as CSV file to be used by Tableau for visualizations

# df.to_csv('iowa_recidivism_cleaned.csv', index_label= "id") 

In [9]:
# Change 'No' to 0 and 'Yes' to '1' for machine learning model

df['Return to Prison Coded'] = df['Return to Prison'].map(lambda x: '1' if x == 'Yes' else '0')

In [10]:
df.head()

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Main Supervising District,Release Type,Race - Ethnicity,Age At Release,Sex,Offense Classification,Offense Type,Offense Subtype,Return to Prison,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population,Race,Ethnicity,Return to Prison Coded
0,2010,2013,7JD,Parole,Black - Non-Hispanic,25-34,Male,C Felony,Violent,Robbery,Yes,433.0,New,C Felony,Drug,Trafficking,Yes,Black,Non-Hispanic,1
1,2010,2013,NSD,Discharged – End of Sentence,White - Non-Hispanic,25-34,Male,D Felony,Property,Theft,Yes,453.0,Tech,,,,No,White,Non-Hispanic,1
2,2010,2013,5JD,Parole,White - Non-Hispanic,35-44,Male,B Felony,Drug,Trafficking,Yes,832.0,Tech,,,,Yes,White,Non-Hispanic,1
3,2010,2013,6JD,Parole,White - Non-Hispanic,25-34,Male,B Felony,Other,Other Criminal,No,,No Recidivism,,,,Yes,White,Non-Hispanic,0
4,2010,2013,NSD,Discharged – End of Sentence,Black - Non-Hispanic,35-44,Male,D Felony,Violent,Assault,Yes,116.0,Tech,,,,No,Black,Non-Hispanic,1


# Machine Learning using Logistic Regression

In [None]:
# Update sklearn to prevent version mismatches
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 

!pip install sklearn --upgrade
!pip install joblib

In [11]:
# Set features for X and y values and encode categorical data

X = pd.get_dummies(df[['Main Supervising District','Age At Release',
                       'Sex', 'Offense Classification','Offense Type', 
                       'Race', 'Ethnicity']])

y = df["Return to Prison Coded"]

# Create a Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
X_train.head()

Unnamed: 0,Main Supervising District_1JD,Main Supervising District_2JD,Main Supervising District_3JD,Main Supervising District_4JD,Main Supervising District_5JD,Main Supervising District_6JD,Main Supervising District_7JD,Main Supervising District_8JD,Main Supervising District_ISC,Main Supervising District_Interstate Compact,...,Offense Type_Public Order,Offense Type_Violent,Race_American Indian or Alaska Native,Race_Asian or Pacific Islander,Race_Black,Race_N/A,Race_White,Ethnicity_,Ethnicity_ Hispanic,Ethnicity_ Non-Hispanic
10469,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
16080,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5216,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
19263,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
24390,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [14]:
X_train.columns

Index(['Main Supervising District_1JD', 'Main Supervising District_2JD',
       'Main Supervising District_3JD', 'Main Supervising District_4JD',
       'Main Supervising District_5JD', 'Main Supervising District_6JD',
       'Main Supervising District_7JD', 'Main Supervising District_8JD',
       'Main Supervising District_ISC',
       'Main Supervising District_Interstate Compact',
       'Main Supervising District_NSD', 'Age At Release_25-34',
       'Age At Release_35-44', 'Age At Release_45-54',
       'Age At Release_55 and Older', 'Age At Release_Under 25', 'Sex_Female',
       'Sex_Male', 'Offense Classification_A Felony',
       'Offense Classification_Aggravated Misdemeanor',
       'Offense Classification_B Felony', 'Offense Classification_C Felony',
       'Offense Classification_D Felony',
       'Offense Classification_Felony - Enhanced',
       'Offense Classification_Felony - Enhancement to Original Penalty',
       'Offense Classification_Felony - Mandatory Minimum',
 

# Pre-processing
Scale the data (standardize the features for normal distribution)

In [None]:
# Scale (and transform) data - using StandardScaler
#from sklearn.preprocessing import StandardScaler

# fit the training data only - not the test data
#X_scaler = StandardScaler().fit(X_train)

#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)


In [15]:
# Scale (and transform) data - using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [16]:
# Feature Selection using RFE (recursive feature elimination - to remove the weakest features)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

selector = RFE(estimator=LogisticRegression(), n_features_to_select=50, step=1)
selector = selector.fit(X_train_scaled,y_train)
best_features = selector.support_



In [17]:
selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [18]:
best_features

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [19]:
# Create the model

model = LogisticRegression(solver='lbfgs', multi_class='auto', verbose=0, max_iter=500)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Train the Model

In [20]:
model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.6743018191135024
Testing Data Score: 0.6687163720215219


# Hyperparameter Tuning

In [21]:
# Create the GridSearch CV (cross-validation) classifier/estimator 
# along with a parameter object containing the values to adjust

from sklearn.model_selection import GridSearchCV
import numpy as np

solver = ['lbfgs']
penalty = ['l2']
C = np.logspace(0, 4, 50)

hyperparameters = dict(C=C, penalty=penalty, solver=solver)

grid = GridSearchCV(model, hyperparameters, cv=5, verbose=3)

# Train the model with GridSearch
# Fit the model using the grid search estimator. 
# This will take the model and try each combination of parameters

grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=1.0, penalty=l2, solver=lbfgs .................................

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



[CV] ..... C=1.0, penalty=l2, solver=lbfgs, score=0.669, total=   1.5s
[CV] C=1.0, penalty=l2, solver=lbfgs .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ..... C=1.0, penalty=l2, solver=lbfgs, score=0.666, total=   1.1s
[CV] C=1.0, penalty=l2, solver=lbfgs .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV] ..... C=1.0, penalty=l2, solver=lbfgs, score=0.664, total=   1.0s
[CV] C=1.0, penalty=l2, solver=lbfgs .................................
[CV] ..... C=1.0, penalty=l2, solver=lbfgs, score=0.680, total=   0.7s
[CV] C=1.0, penalty=l2, solver=lbfgs .................................
[CV] ..... C=1.0, penalty=l2, solver=lbfgs, score=0.675, total=   0.8s
[CV] C=1.2067926406393286, penalty=l2, solver=lbfgs ..................
[CV]  C=1.2067926406393286, penalty=l2, solver=lbfgs, score=0.669, total=   0.7s
[CV] C=1.2067926406393286, penalty=l2, solver=lbfgs ..................
[CV]  C=1.2067926406393286, penalty=l2, solver=lbfgs, score=0.666, total=   0.6s
[CV] C=1.2067926406393286, penalty=l2, solver=lbfgs ..................
[CV]  C=1.2067926406393286, penalty=l2, solver=lbfgs, score=0.664, total=   0.5s
[CV] C=1.2067926406393286, penalty=l2, solver=lbfgs ..................
[CV]  C=1.2067926406393286, penalty=l2, solver=lbfgs, score=0.680, total=   0.6s
[CV] C=1.2067926406393286, penalty=l2

[CV]  C=7.906043210907698, penalty=l2, solver=lbfgs, score=0.665, total=   1.0s
[CV] C=7.906043210907698, penalty=l2, solver=lbfgs ...................
[CV]  C=7.906043210907698, penalty=l2, solver=lbfgs, score=0.680, total=   0.9s
[CV] C=7.906043210907698, penalty=l2, solver=lbfgs ...................
[CV]  C=7.906043210907698, penalty=l2, solver=lbfgs, score=0.674, total=   1.0s
[CV] C=9.54095476349994, penalty=l2, solver=lbfgs ....................
[CV]  C=9.54095476349994, penalty=l2, solver=lbfgs, score=0.669, total=   1.1s
[CV] C=9.54095476349994, penalty=l2, solver=lbfgs ....................
[CV]  C=9.54095476349994, penalty=l2, solver=lbfgs, score=0.666, total=   1.2s
[CV] C=9.54095476349994, penalty=l2, solver=lbfgs ....................
[CV]  C=9.54095476349994, penalty=l2, solver=lbfgs, score=0.665, total=   1.0s
[CV] C=9.54095476349994, penalty=l2, solver=lbfgs ....................
[CV]  C=9.54095476349994, penalty=l2, solver=lbfgs, score=0.680, total=   0.9s
[CV] C=9.540954763



[CV]  C=35.564803062231285, penalty=l2, solver=lbfgs, score=0.680, total=   1.5s
[CV] C=35.564803062231285, penalty=l2, solver=lbfgs ..................
[CV]  C=35.564803062231285, penalty=l2, solver=lbfgs, score=0.674, total=   1.1s
[CV] C=42.91934260128776, penalty=l2, solver=lbfgs ...................
[CV]  C=42.91934260128776, penalty=l2, solver=lbfgs, score=0.669, total=   0.9s
[CV] C=42.91934260128776, penalty=l2, solver=lbfgs ...................
[CV]  C=42.91934260128776, penalty=l2, solver=lbfgs, score=0.666, total=   0.9s
[CV] C=42.91934260128776, penalty=l2, solver=lbfgs ...................
[CV]  C=42.91934260128776, penalty=l2, solver=lbfgs, score=0.665, total=   0.9s
[CV] C=42.91934260128776, penalty=l2, solver=lbfgs ...................
[CV]  C=42.91934260128776, penalty=l2, solver=lbfgs, score=0.680, total=   1.0s
[CV] C=42.91934260128776, penalty=l2, solver=lbfgs ...................
[CV]  C=42.91934260128776, penalty=l2, solver=lbfgs, score=0.674, total=   1.0s
[CV] C=51.79

[CV]  C=281.1768697974228, penalty=l2, solver=lbfgs, score=0.680, total=   0.9s
[CV] C=281.1768697974228, penalty=l2, solver=lbfgs ...................
[CV]  C=281.1768697974228, penalty=l2, solver=lbfgs, score=0.674, total=   1.2s
[CV] C=339.3221771895326, penalty=l2, solver=lbfgs ...................
[CV]  C=339.3221771895326, penalty=l2, solver=lbfgs, score=0.669, total=   0.9s
[CV] C=339.3221771895326, penalty=l2, solver=lbfgs ...................
[CV]  C=339.3221771895326, penalty=l2, solver=lbfgs, score=0.666, total=   0.8s
[CV] C=339.3221771895326, penalty=l2, solver=lbfgs ...................
[CV]  C=339.3221771895326, penalty=l2, solver=lbfgs, score=0.665, total=   1.3s
[CV] C=339.3221771895326, penalty=l2, solver=lbfgs ...................
[CV]  C=339.3221771895326, penalty=l2, solver=lbfgs, score=0.680, total=   0.7s
[CV] C=339.3221771895326, penalty=l2, solver=lbfgs ...................
[CV]  C=339.3221771895326, penalty=l2, solver=lbfgs, score=0.674, total=   1.0s
[CV] C=409.491



[CV]  C=596.3623316594643, penalty=l2, solver=lbfgs, score=0.674, total=   1.6s
[CV] C=719.6856730011514, penalty=l2, solver=lbfgs ...................
[CV]  C=719.6856730011514, penalty=l2, solver=lbfgs, score=0.669, total=   1.3s
[CV] C=719.6856730011514, penalty=l2, solver=lbfgs ...................
[CV]  C=719.6856730011514, penalty=l2, solver=lbfgs, score=0.666, total=   0.7s
[CV] C=719.6856730011514, penalty=l2, solver=lbfgs ...................
[CV]  C=719.6856730011514, penalty=l2, solver=lbfgs, score=0.665, total=   0.9s
[CV] C=719.6856730011514, penalty=l2, solver=lbfgs ...................
[CV]  C=719.6856730011514, penalty=l2, solver=lbfgs, score=0.680, total=   0.8s
[CV] C=719.6856730011514, penalty=l2, solver=lbfgs ...................
[CV]  C=719.6856730011514, penalty=l2, solver=lbfgs, score=0.674, total=   0.6s
[CV] C=868.511373751352, penalty=l2, solver=lbfgs ....................
[CV]  C=868.511373751352, penalty=l2, solver=lbfgs, score=0.669, total=   1.1s
[CV] C=868.5113



[CV]  C=2222.996482526193, penalty=l2, solver=lbfgs, score=0.669, total=   1.5s
[CV] C=2222.996482526193, penalty=l2, solver=lbfgs ...................
[CV]  C=2222.996482526193, penalty=l2, solver=lbfgs, score=0.666, total=   0.9s
[CV] C=2222.996482526193, penalty=l2, solver=lbfgs ...................
[CV]  C=2222.996482526193, penalty=l2, solver=lbfgs, score=0.665, total=   1.2s
[CV] C=2222.996482526193, penalty=l2, solver=lbfgs ...................
[CV]  C=2222.996482526193, penalty=l2, solver=lbfgs, score=0.680, total=   1.0s
[CV] C=2222.996482526193, penalty=l2, solver=lbfgs ...................
[CV]  C=2222.996482526193, penalty=l2, solver=lbfgs, score=0.674, total=   0.8s
[CV] C=2682.6957952797247, penalty=l2, solver=lbfgs ..................
[CV]  C=2682.6957952797247, penalty=l2, solver=lbfgs, score=0.669, total=   1.1s
[CV] C=2682.6957952797247, penalty=l2, solver=lbfgs ..................
[CV]  C=2682.6957952797247, penalty=l2, solver=lbfgs, score=0.666, total=   0.8s
[CV] C=2682.



[CV] . C=10000.0, penalty=l2, solver=lbfgs, score=0.665, total=   1.5s
[CV] C=10000.0, penalty=l2, solver=lbfgs .............................
[CV] . C=10000.0, penalty=l2, solver=lbfgs, score=0.680, total=   1.0s
[CV] C=10000.0, penalty=l2, solver=lbfgs .............................
[CV] . C=10000.0, penalty=l2, solver=lbfgs, score=0.674, total=   0.6s


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  4.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.000000...
       4.09491506e+02, 4.94171336e+02, 5.96362332e+02, 7.19685673e+02,
       8.68511374e+02, 1.04811313e+03, 1.26485522e+03, 1.52641797e+03,
       1.84206997e+03, 2.22299648e+03, 2.68269580e+03, 3.23745754e+03,
       3.90693994e+03, 4.71486636e+03, 5.68986603e+03, 6.86648845e+03,
       8.

In [22]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 2.120950887920191, 'penalty': 'l2', 'solver': 'lbfgs'}
0.6709710479118627


In [None]:
# the weigth coefficients is the slope
# the y-intercept is the 'b' in (y = mx +b) - where the value crosses the y-axis

print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

In [None]:
# convert the column names to a list for creating the dictionary below

column_list = X_train.columns.tolist()
column_list

In [None]:
# loop through the column names and the intercepts to create a list for using later in Tableau

model_list = []

for i in range(len(column_list)):
    model_dict = {"Model" : "LR",
                  "Feature" : column_list[i],
                  "Coeff": model.coef_[0][i]}
    model_list.append(model_dict)

model_list

In [None]:
# Create a list of dictionaries of the 3 models and their accuracy score

accuracy_list = []

accuracy_dict = {"Model" : "LR", "Score": model.score(X_test_scaled, y_test)}
accuracy_list.append(accuracy_dict)      
                 
accuracy_dict = {"Model" : "SVM","Score": model.score(X_test_scaled, y_test)}
accuracy_list.append(accuracy_dict)    
                 
accuracy_dict = {"Model" : "NB", "Score": model.score(X_test_scaled, y_test)}
accuracy_list.append(accuracy_dict)

accuracy_list

In [None]:
# turn the model list and accuracy lists into dataframes

model_df = pd.DataFrame(model_list)
model_df


In [None]:
accuracy_df = pd.DataFrame(accuracy_list)
accuracy_df

In [None]:
# write the dataframes to csv files to be brought in to Tableau

model_df.to_csv('modelcoeff.csv', index=True)
accuracy_df.to_csv('modelaccuracy.csv', index=True)

#   <font color= green >  Machine Learning using Support Vector Machine (SVM) </font>


In [28]:
import numpy as np
import matplotlib.pyplot as plt

In [29]:
df.columns

Index(['Fiscal Year Released', 'Recidivism Reporting Year',
       'Main Supervising District', 'Release Type', 'Race - Ethnicity',
       'Age At Release', 'Sex', 'Offense Classification', 'Offense Type',
       'Offense Subtype', 'Return to Prison', 'Days to Return',
       'Recidivism Type', 'New Offense Classification', 'New Offense Type',
       'New Offense Sub Type', 'Target Population', 'Race', 'Ethnicity',
       'Return to Prison Coded'],
      dtype='object')

In [35]:
df.shape
df.size
df['Return to Prison Coded'].value_counts()

0    17339
1     8681
Name: Return to Prison Coded, dtype: int64

In [38]:
df.dtypes

Fiscal Year Released            int64
Recidivism Reporting Year       int64
Main Supervising District      object
Release Type                   object
Race - Ethnicity               object
Age At Release                 object
Sex                            object
Offense Classification         object
Offense Type                   object
Offense Subtype                object
Return to Prison               object
Days to Return                float64
Recidivism Type                object
New Offense Classification     object
New Offense Type               object
New Offense Sub Type           object
Target Population              object
Race                           object
Ethnicity                      object
Return to Prison Coded         object
dtype: object

In [37]:
#distribution of the class
return_df=df[df['Return to Prison Coded']==1][0:200]
not_returned_df=df[df['Return to Prison Coded']==0][0:200]

return_df.plot(kind = 'scatter', x =)

In [45]:
# Set features for X and y values and encode categorical data

feature_df = pd.get_dummies(df[['Main Supervising District','Age At Release',
                       'Sex', 'Offense Classification','Offense Type', 
                       'Offense Subtype','Race', 'Ethnicity']])

#independent variable
X = np.asarray(feature_df)

#dependent variable
y = np.array(df["Return to Prison Coded"])


In [47]:
X[0:5]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1],
    

In [48]:
y[0:5]

array(['1', '1', '1', '0', '1'], dtype=object)

In [54]:
#divide data into Train/Test Data
# train_test_split(X,y,test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#19515 x 72
X_train.shape

y_train.shape

y_test.shape

(6505,)

In [52]:
# Model SVM
from sklearn import svm

classifier = svm.SVC(kernel = 'linear', gamma='auto', C = 2)
classifier.fit(X_train,y_train)

y_predict = classifier.predict(X_test)

In [53]:
from sklearn.metrics import classification_report

#Evaluate results
print(classification_report(y_test,y_predict))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.66      1.00      0.80      4307
           1       0.00      0.00      0.00      2198

    accuracy                           0.66      6505
   macro avg       0.33      0.50      0.40      6505
weighted avg       0.44      0.66      0.53      6505

