In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, make_scorer,\
                            recall_score, roc_curve, auc, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle

# def my_scoring(*args):
#     score = roc_auc_score(*args)
#     print("Score:{}".format(score))
#     return score


def train_model(X_train, X_test, y_train, y_test):
    param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#     param_range = [1, 5, 9]#[1, 3, 5, 7, 9]
    param_range_fl = [1.0, 0.5, 0.1]
    
    model_list = {
                  'rfc': RandomForestClassifier(random_state=0, verbose=20),
                  'svc': SVC(random_state=0, verbose=25),
                  'lr': LogisticRegression(random_state=0, verbose=True),
            #     MLPClassifier(random_state=0)                    
                 }
    grid_rfc_params = {
#                         'clf__criterion': ['gini', 'entropy'],
                        'min_samples_leaf': param_range,
                        'max_depth': param_range,
                        'min_samples_split': param_range[1:]
                      }
    grid_svc_params = {
                        'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                        'C': param_range
                      }
    
    grid_lr_params = [{
                        'penalty': ['l1', 'l2'],
                        'C': param_range_fl,
#                         'solver': ['liblinear']
                     }]

#     rfc = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, y_train)
    gc_rfc = GridSearchCV(model_list['rfc'], param_grid=grid_rfc_params, scoring=make_scorer(roc_auc_score), cv=10, n_jobs=-1)
    gc_svc = GridSearchCV(model_list['svc'], param_grid=grid_svc_params, scoring=make_scorer(roc_auc_score), cv=10, n_jobs=-1)
    gc_lr = GridSearchCV(model_list['lr'], param_grid=grid_lr_params, scoring=make_scorer(roc_auc_score), cv=10, n_jobs=-1)
    
    grid_search_models = {
                          'gc_rfc': gc_rfc,
#                           'gc_svc': gc_svc,
                          'gc_lr': gc_lr
                       }
    
    model_names = {'gc_rfc': 'Random Forest', 'gc_svc': 'Support Vector Machine', 'gc_lr': 'Logistic Regression'}
    
    best_score = 0.0
    best_model = gc_rfc
    best_model_index = 0
    print("Training Scores:")
    for key, estimator in grid_search_models.items():
        start_time = time.clock()
        clf = estimator
        clf.fit(X_train, y_train)
        
        fpr, tpr, _ = roc_curve(y_train, clf.predict_proba(X_train)[:,1])
        cur_train_score = auc(fpr, tpr)
        
        fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
        cur_test_score = auc(fpr, tpr)
        
        print(model_names[key])
        print("TrainingData Score:{0}".format(cur_train_score))
        print("Test Data score {0}".format(cur_test_score))
        print("and best params {0}".format(clf.best_params_))
        if cur_test_score > best_score:
            best_score = cur_test_score
            best_model = clf
            best_model_index = key
        print("Time taken: {}\n".format(time.clock() - start_time))
    
    print("The best model found is: {0}".format(best_model.best_estimator_))
    print("Parameters are: {0}".format(best_model.best_params_))
        
#     with open('blight_model.p', 'wb') as f:
#         pickle.dump(best_model, f)


def print_scores(X, y):
    with open('blight_model.p', 'rb') as f:
        rfc = pickle.load(f)
    y_predict = rfc.predict(X)
    print("Accuracy Score: {0}".format(accuracy_score(y, y_predict)))
    print("Precision Score: {0}".format(precision_score(y, y_predict)))
    print("Recall Score: {0}".format(recall_score(y, y_predict)))
    probs = rfc.predict_proba(X)
    fpr_lr, tpr_lr, _ = roc_curve(y, probs[:,1])
    print("Area under curve {0}".format(auc(fpr_lr, tpr_lr)))


# if __name__ == '__main__':
#     X_train = pd.read_csv('X_train.csv', index_col='ticket_id')
#     X_test = pd.read_csv('X_test.csv', index_col='ticket_id')
#     y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
#     y_test = pd.read_csv('y_test.csv', index_col=0, header=None)
#     train_model(X_train, y_train)
#     print("TrainingData:")
#     print_scores(X_train, y_train)
#     print("TestingData:")
#     print_scores(X_test, y_test)

In [5]:
    X_train = pd.read_csv('X_train.csv', index_col='ticket_id')
    X_test = pd.read_csv('X_test.csv', index_col='ticket_id')
    y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
    y_test = pd.read_csv('y_test.csv', index_col=0, header=None)

In [8]:
X_train.head()

Unnamed: 0_level_0,violation_code_1,violation_code_2,violation_code_3,violation_code_4,violation_code_5,violation_code_6,violation_code_7,violation_code_8,disposition_0,disposition_1,disposition_2,gap_ticket_hear,late_fee,discount_amount,judgment_amount,lat,lon
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
129049,0,0,0,0,0,0,0,1,0,0,1,-0.023518,-0.12808,-0.046551,-0.155625,-0.299528,-0.033055
223337,0,0,0,0,0,0,0,1,0,0,1,0.057514,-0.12808,-0.046551,-0.155625,-0.484728,0.792727
165904,0,0,0,0,0,0,1,0,0,1,0,0.117275,-0.495593,-0.046551,-0.390228,-0.483011,0.011175
90794,0,0,0,0,0,0,1,1,0,0,1,-0.019446,-0.422091,-0.046551,-0.450554,-2.111818,0.099856
112188,0,0,0,0,0,1,0,0,0,0,1,0.070479,-0.12808,-0.046551,-0.155625,0.179421,1.682125


In [6]:
X_train.shape

(114113, 17)

In [7]:
import time

start_time = time.clock()
train_model(X_train, X_test, y_train.values.ravel(), y_test.values.ravel())
print("Total Time taken: {}".format(time.clock() - start_time))

Training Scores:
building tree 1 of 10
building tree 2 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


building tree 3 of 10
building tree 4 of 10
building tree 5 of 10


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s


building tree 6 of 10
building tree 7 of 10
building tree 8 of 10


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapse

Random Forest
TrainingData Score:0.8196618767299867
Test Data score 0.8035211619227143
and best params {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Time taken: 1889.1859747991668

[LibLinear]Logistic Regression
TrainingData Score:0.7762060349068189
Test Data score 0.7928472458009159
and best params {'C': 1.0, 'penalty': 'l1'}
Time taken: 27.349129988032246

The best model found is: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=20, warm_start=False)
Parameters are: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Total Time taken: 1916.5367850072887
