# Machine Learning Models
## By Chaitanya Vallabhaneni

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv('data/propublica_data_for_fairml.csv')
X = data.iloc[:, [1,2,3,4,5,6,7,8,9,10,11]].values
y = data.iloc[:, 0].values

In [3]:
data = [(X,y)]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [7]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),LogisticRegression()]

In [8]:
for ds_cnt, ds in enumerate(data):

    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=.3, random_state=42)

In [9]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA","LogisticRegression"]

In [10]:
for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(name,score)

Nearest Neighbors 0.6123110151187905
Linear SVM 0.652267818574514
RBF SVM 0.6641468682505399
Decision Tree 0.6754859611231101
Random Forest 0.6673866090712743
Neural Net 0.6749460043196545
AdaBoost 0.66792656587473
Naive Bayes 0.6241900647948164
QDA 0.6452483801295896
LogisticRegression 0.6652267818574514


In [11]:
from scipy.stats import randint as sp_randint
search_parameters_space_random = {"max_depth": [3, None],
              "max_features": sp_randint(1, 9),
              "min_samples_leaf": sp_randint(1, 9),
              "criterion": ["gini","entropy"]}

In [12]:
random_search = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(), 
    param_distributions=search_parameters_space_random,
   scoring="roc_auc", n_jobs=-1, 
    n_iter=50)

In [13]:
%%time
random_search.fit(X_train, y_train)

Wall time: 1.68 s


RandomizedSearchCV(estimator=DecisionTreeClassifier(), n_iter=50, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019A49FF37C0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019A49FDC1C0>},
                   scoring='roc_auc')

In [14]:
random_search.best_score_

0.718221593603151

In [15]:
random_search.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_features=6, min_samples_leaf=8)

In [16]:
random_search.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 6,
 'min_samples_leaf': 8}

In [17]:
y_pred = random_search.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.75      0.71      1020
           1       0.65      0.57      0.61       832

    accuracy                           0.67      1852
   macro avg       0.67      0.66      0.66      1852
weighted avg       0.67      0.67      0.67      1852

