# Testing out algorithms in python

In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('train.csv')
print(train.shape)

(3000, 17)


In [None]:
train.SubjectRace.value_counts(normalize=True)

# Importing the cleaned up dataset from R

## Setting up the Dataset

In [4]:
clean_merged = pd.read_csv('/Users/Kenny/Dropbox/Stats-101C-Kaggle/Datasets/cleaned_merged1.csv') 
print(clean_merged.shape)
#print(clean_merged1.columns)
#clean_merged1.head()

(3000, 37)


In [None]:
clean_merged1 = clean_merged[clean_merged.Fatal != 'U']
features_list = ['Fatal','SubjectRace', 'SubjectGender', 'SubjectArmed',
                               'ShotsClean', 'AgeGroup', 'NumberOfOfficers',
                                   'Month', 'Day', 'estimate2016', 'mainOfficerRace']
clean_merged1 = clean_merged1[features_list]
clean_merged1.shape
#clean_merged1.loc[clean_merged1.AgeGroup==0, 'AgeGroup'] = 5
#clean_merged1.dtypes

In [None]:
from sklearn.model_selection import train_test_split

features = pd.get_dummies(clean_merged1.iloc[:,1:])
labels = clean_merged1.Fatal

train_features, test_features, train_labels, test_labels = train_test_split(features, 
                                                                            labels, 
                                                                            test_size=.2,
                                                                            random_state=1255)
print('train_features dimensions:', train_features.shape)
print('test_features dimensions:', test_features.shape)
print('train_labels dimensions:', train_labels.shape)
print('test_labels dimensions:', test_labels.shape)

### This will be the value to beat (the just say no method): $66.9\%$

In [None]:
clean_merged1.Fatal.value_counts(normalize=True)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# xgboost causing warnings
import warnings
warnings.filterwarnings('ignore')

In [34]:
models = []
models.append(('LogisticRegression', LogisticRegression()))
models.append(("SVC", SVC(C=100, gamma=0.1)))
models.append(("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=5)))
models.append(("RandomForestClassifier_gini", RandomForestClassifier(n_estimators=300,
                                                                criterion='gini',
                                                                max_depth=7)))
models.append(("RandomForestClassifier_entropy", RandomForestClassifier(n_estimators=300,
                                                                criterion = 'entropy',
                                                                max_depth=7)))
#models.append(('ExtremeRandomForest_gini', ExtraTreesClassifier(n_estimators = 200,
#                                                          max_depth = 7,
#                                                               criterion='gini')))
#models.append(('ExtremeRandomForest_entropy', ExtraTreesClassifier(n_estimators = 200,
#                                                          max_depth = 7,
#                                                               criterion='entropy')))
models.append(('XGBoost', XGBClassifier(learning_rate=0.1, n_estimators=100,
                                       max_depth= 5)))
models.append(('Adaboost', AdaBoostClassifier(learning_rate=0.1, n_estimators=100)))

results = []
names = []
for name,model in models:
    result = cross_val_score(model, train_features, train_labels, cv=5)
    names.append(name)
    results.append(result)

mean_results = []
sd_results = []
for model in results:
    mean_results.append(model.mean())
    sd_results.append(model.std())

results_df = pd.DataFrame({'Model': names,
              'Mean Classification Rate': mean_results,
              'Standard Deviation': sd_results
             })
results_df = results_df.iloc[:,[1,0,2]]

In [35]:
results_df.sort_values(by = 'Mean Classification Rate', ascending=False)

Unnamed: 0,Model,Mean Classification Rate,Standard Deviation
3,RandomForestClassifier_gini,0.691286,0.008531
4,RandomForestClassifier_entropy,0.690845,0.008333
5,XGBoost,0.68995,0.01535
6,Adaboost,0.687276,0.007304
0,LogisticRegression,0.660143,0.000286
2,KNeighborsClassifier,0.646321,0.023089
1,SVC,0.623655,0.02986


## SVC Hyperparameter Tuning (performs poorly)

In [11]:
from sklearn.model_selection import GridSearchCV
svc = SVC(random_state=12345)
param_grid_svc = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma' : [0.001, 0.01, 0.1, 1],
}

CV_svc = GridSearchCV(n_jobs=-1, estimator=svc, param_grid = param_grid_svc, cv=5,
                     verbose=1, return_train_score=True)
CV_svc.fit(train_features,train_labels)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=12345, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [12]:
mean_test = pd.DataFrame({'Accuracy': CV_svc.cv_results_['mean_test_score']})
params = pd.DataFrame(CV_svc.cv_results_['params'])
accuracy_result = params.join(mean_test).sort_values('Accuracy', ascending=False)
accuracy_result.head()

Unnamed: 0,C,gamma,Accuracy
16,10.0,0.001,0.677046
12,1.0,0.001,0.674822
13,1.0,0.01,0.672598
17,10.0,0.01,0.66548
14,1.0,0.1,0.661477


In [30]:
svc2= LinearSVC()
svc2.fit(train_features,train_labels)
svc2_predictions = svc2.predict(test_features)
print('SVC with Optimized Hyperparameters Accuracy:', accuracy_score(svc2_predictions, test_labels))

SVC with Optimized Hyperparameters Accuracy: 0.7087033747779752


## Random Forest Hyperparameter Tuning

In [None]:
rfc = RandomForestClassifier(random_state=12345)

param_grid_rf = {
    'n_estimators': [100,200,300,400,500],
    'max_features': list(range(1, 15))
    ,'max_depth': list(range(1, 11))  
}

CV_rfc = GridSearchCV(n_jobs = -1, estimator=rfc, param_grid = param_grid_rf, cv=5,
                      verbose=1, return_train_score=True)
CV_rfc.fit(train_features,train_labels)

In [None]:
mean_test = pd.DataFrame({'Accuracy': CV_rfc.cv_results_['mean_test_score']})
params = pd.DataFrame(CV_rfc.cv_results_['params']) 
accuracy_result = params.join(mean_test).sort_values('Accuracy', ascending=False) #display top 10 results
accuracy_result.head(10)

In [None]:
rfc2 = RandomForestClassifier(n_estimators=400, max_depth=7, max_features=11)
rfc2.fit(train_features, train_labels)
rfc2_predictions = rfc2.predict(test_features)
print('Random Forest with Optimized Parameters Accuracy:', accuracy_score(rfc2_predictions, test_labels))

## XGBoost Hyperparameter Tuning

In [None]:
xgb = XGBClassifier(random_state=12345)
#xgb.fit(train_features,train_labels, verbose=True)

param_grid_xgb = {
    'learning_rate': [0.1],
    'n_estimators': list(range(1,102,5)),
    'max_depth': list(range(2,15))
}
CV_xgb = GridSearchCV(n_jobs=-1, estimator=xgb, param_grid = param_grid_xgb, cv=5,
                     verbose=1, return_train_score=True)
CV_xgb.fit(train_features,train_labels)

In [784]:
mean_test = pd.DataFrame({'Accuracy': CV_xgb.cv_results_['mean_test_score']})
params = pd.DataFrame(CV_xgb.cv_results_['params']) 
accuracy_result = params.join(mean_test).sort_values('Accuracy', ascending=False) #display top 10 results
accuracy_result.head(10)

Unnamed: 0,learning_rate,max_depth,n_estimators,Accuracy
76,0.1,5,66,0.6975
62,0.1,4,101,0.696667
61,0.1,4,96,0.69625
73,0.1,5,51,0.695833
77,0.1,5,71,0.695417
75,0.1,5,61,0.695417
74,0.1,5,56,0.695417
59,0.1,4,86,0.695
58,0.1,4,81,0.694167
60,0.1,4,91,0.694167


In [None]:
xgb2 = XGBClassifier(n_estimators=50, max_depth=3, 
                     learning_rate=0.1, random_state=12345)
xgb2.fit(train_features, train_labels)
xgb2_predictions = xgb2.predict(test_features)
print('XGBoost with Optimized Parameters Accuracy:', accuracy_score(xgb2_predictions, test_labels))

# Imputations

In [None]:
clean_merged1 = pd.read_csv('/Users/Kenny/Dropbox/Stats-101C-Kaggle/Datasets/cleaned_merged1.csv') 
print(clean_merged1.shape)
print(clean_merged1.columns)
clean_merged1.head()

# Ensemble Model

In [None]:
from sklearn.model_selection import StratifiedKFold

X = train_features
y = train_labels.copy()
y[y=='N'] = 0
y[y=='F'] = 1
y = y.astype(int)


In [847]:
from mlxtend.classifier import StackingClassifier

clf1 = RandomForestClassifier(n_estimators=200, criterion='gini',max_depth=7, max_features='auto')
clf2 = RandomForestClassifier(n_estimators=200, criterion='entropy',max_depth=7,
                             max_features='auto')
clf3 = XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=56)

# clf4 = SVC(C=1, gamma=0.1)
lr = LogisticRegression()
classifiers = [clf1,clf2,clf3]

sclf = StackingClassifier(classifiers=classifiers, 
                          average_probas=False,
                          meta_classifier=lr)
print('3-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['Random Forest Gini', 
                       'Random Forest Entropy', 
                       'XGBoost',
                       'StackingClassifier']):
    scores = cross_val_score(clf, X, y, cv=3)
    print("Accuracy: %0.5f (+/- %0.5f) [%s]" 
          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.70330 (+/- 0.00828) [Random Forest Gini]
Accuracy: 0.70241 (+/- 0.00931) [Random Forest Entropy]
Accuracy: 0.71353 (+/- 0.00911) [XGBoost]
Accuracy: 0.70018 (+/- 0.00423) [StackingClassifier]


In [960]:
classifiers = [clf1, clf3]
def most_frequent_class_train(features, labels, classifiers):
    trained_models = []
    for clf in classifiers:
        clf.fit(features, labels)
        trained_models.append(clf)
    return(trained_models)

def most_frequent_class_results(trained_model_list, features):
    results = []
    for model in trained_model_list:
        prediction = model.predict(features)
        results.append(prediction)
    results_df = pd.DataFrame({
        'rf_gini': results[0],
        'rf_entropy': results[1],
        'xgb': results[2]
    })
    most_frequent = results_df.mode(axis=1)
    most_frequent.columns = ['most_frequent']
    results_df = results_df.join(most_frequent)
    results_df.most_frequent.value_counts(normalize=True)
    return(results_df)

def intersection_results(trained_model_list, features):
    results = []
    for model in trained_models:
        prediction = model.predict(features)
        results.append(prediction)
    results_df = pd.DataFrame({
    'rf_gini': results[0],
    'xgb': results[1]
    })
    return(results_df)


trained_models = most_frequent_class_train(features,labels, classifiers)

results_df = intersection_results(trained_models, final_test_features)
    
results_df.iloc[:,[1,0]].to_csv('PythonResults/python3.csv', index=False)

#intersection = intersection_results(trained_models, test_features)
    
#trained_models = most_frequent_class_train(train_features, train_labels, classifiers)
#results = most_frequent_class_results(trained_models, test_features)
    
#for i, model in enumerate(results.columns):
#    acc = accuracy_score(results_df.iloc[:,i], test_labels)
#    print(model, acc)



In [887]:
final_test = pd.read_csv('Datasets/test_clean_merged.csv')
final_test2 = final_test[features_list[1:]]
final_test_features = pd.get_dummies(final_test2)

# first training the this ensemble model of most frequent class
full_trained = most_frequent_class_train(features, labels, classifiers)
results_full_trained = most_frequent_class_results(full_trained ,final_test_features)
results_full_trained.rf_entropy.value_counts(normalize=True)

results_df = pd.DataFrame({'id': final_test.id,
                          'Fatal': results_full_trained.most_frequent})
results_df[results_df.Fatal == 'F'].Fatal = 'Yes'
results_df[results_df.Fatal == 'N'].Fatal = 'No'

results_df.iloc[:,[1,0]].to_csv('PythonResults/python2.csv', index=False)

In [None]:
final_test = pd.read_csv('Datasets/test_clean_merged.csv')
final_test2 = final_test[features_list[1:]]
final_test_features = pd.get_dummies(final_test2)

labels_full = labels.copy()
labels_full[labels_full=='N'] = 0
labels_full[labels_full=='F'] = 1
labels_full_int = labels_full.astype(int)

sclf.fit(features,labels_full_int)

final_test_predict = sclf.predict(final_test_features).astype(str)
final_test_predict[final_test_predict == '1'] = 'Yes'
final_test_predict[final_test_predict == '0'] = 'No'

final_test_df = pd.DataFrame({'id': final_test.id,
                              'Fatal': final_test_predict})

final_test_output = final_test_df.iloc[:,[1,0]]
final_test_output.to_csv('PythonResults/python1.csv', index=False)