In [45]:
! pip install scikit-optimize

Collecting scikit-optimize
  Using cached scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
Collecting pyaml>=16.9
  Using cached pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [49]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.inspection import permutation_importance

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

from xgboost import XGBClassifier


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import Precision

### Import
All data besides PUMA and ID are scaled. Remove ID for modeling

In [68]:
X_train = pd.read_csv('cleaned_data/X_train_final.csv',index_col = 'Unnamed: 0')
X_test = pd.read_csv('cleaned_data/X_test_final.csv',index_col = 'Unnamed: 0')

y_train = pd.read_csv('cleaned_data/y_train.csv',index_col = 'Unnamed: 0')
y_test = pd.read_csv('cleaned_data/y_test.csv',index_col = 'Unnamed: 0')

In [69]:
X_train = X_train.drop(columns = 'id')
X_test = X_test.drop(columns = 'id')

y_train = y_train['recidivism_within_3years']
y_test = y_test['recidivism_within_3years']

# Data Transformation
Polynomial Features, ???

In [26]:
poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

X_train_poly = pd.DataFrame(data = X_train_poly, columns = poly.get_feature_names_out(), index = X_train.index)
X_test_poly = pd.DataFrame(data = X_test_poly, columns = poly.get_feature_names_out(), index = X_test.index)


(15429, 2145)

# Modeling
- What metrics do we want to be watching? Precision to reduce false negatives?
    - Judging Criteria:
        - (1) the accuracy of their recidivism forecasts for males and females, and the average of these two accuracy scores, 
        - (2) the fairness of their recidivism forecast accuracy when accounting for racial bias between Black and white individuals on parole, for both males and females.To measure model accuracy, NIJ calculated the mean squared error of entries using the following Brier score
 
        
        
        FPR = FP/(FP+TN)
    
- Figure out a way to store different model metrics to compare between each and find the best
- PCA, ~~VIF~~, Permuation Importance 
- Run models on:
    1. The overall population
    2. White and Black individuals
    3. Females and males
    4. Each five-year age group between ages 18 and 48+
    5. A mutually exclusive combination of models 2 and 3
    6. A mutually exclusive combination of models 2 and 4
    7. A mutually exclusive combination of models 3 and 4
    8. A mutually exclusive combination of models 2, 3, and 4


### Baseline
- About the same distributionn of True and False values between the train and test target values. May need to account for the unbalanced data in our modeling

In [100]:
print('train:\n',y_train.value_counts(normalize = True))
print('test:\n',y_test.value_counts(normalize = True))

train:
 1    0.588697
0    0.411303
Name: recidivism_within_3years, dtype: float64
test:
 1    0.591322
0    0.408678
Name: recidivism_within_3years, dtype: float64


### Logistic Regression
- Try out GridSearch/BayesSearch, Optimize Model, L1 (lasso) Penalty with Polynomail Features

In [27]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train,y_train)
print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))

0.702119385572623
0.6947964812882064


In [None]:
params = {}

In [None]:
bayes = BayesSearchCV(lr,
                      search_spaces = {'penalty': Categorical(['l1', 'l2']),
                                        'solver': Categorical(['saga', 'liblinear']),
                                        "C": Real(1e-6, 10, prior='log-uniform')},
                     cv= 5,
                     n_jobs = -1)
bayes.fit(X_train,y_train)

In [90]:
bayes.score(X_train,y_train)
bayes.predict_proba(X_test)[:]

array([[0.70921997, 0.29078003],
       [0.50881424, 0.49118576],
       [0.52281508, 0.47718492],
       ...,
       [0.51360427, 0.48639573],
       [0.3650297 , 0.6349703 ],
       [0.58180874, 0.41819126]])

### Logistic Regression
- L1 (lasso) Penalty with Polynomail Features

In [28]:
lr = LogisticRegression(penalty='l1',solver = 'liblinear', max_iter=10000, class_weight='balanced')
lr.fit(X_train_poly,y_train)
print(lr.score(X_train_poly,y_train))
print(lr.score(X_test_poly,y_test))

0.7607751636528615
0.6839123304010735


### Random Forest Classifier 
- optimize parameters

In [133]:
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train,y_train)
print(rf.score(X_train,y_train))
print(rf.score(X_test,y_test))

0.7016008814569966
0.6937527955867004


### XGBoost
Optimizer parameters

In [143]:
xgb = XGBClassifier(max_depth=2)
xgb.fit(X_train,y_train)
print(xgb.score(X_train,y_train))
print(xgb.score(X_test,y_test))

0.7390628038110052
0.7250633666318771


### NN

Try to optimize Layers, Add Early Stopping

In [None]:
model = Sequential()
model.add(Dense(32,input_shape = (X_train.shape[1],), activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'bce',optimizer = 'adam', metrics=['accuracy',Precision()] )

history = model.fit(x = X_train, y= y_train['history_df.filter(like='precision').plot(xlabel = 'epoch')'],validation_data=(X_test,y_test['recidivism_within_3years']),epochs=40)

In [37]:
history_df = pd.DataFrame(history.history,index = history.epoch)

NameError: name 'history' is not defined