# Predicting the Success of a Kickstarter Campaign
Utilized GridSearchCV to optimize model parameters.

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# %matplotlib inline
# import seaborn as sns

from sqlalchemy import create_engine

from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)

from sklearn.cross_validation import train_test_split 
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler 

from xgboost import XGBClassifier

import datetime
import warnings
import os

warnings.filterwarnings("ignore")



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
# df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
df['idx'] = df['id']
df = df.set_index('idx')
df = df.drop(['index'],1)
print(df.shape)
pd.read_sql_query('''SELECT state, category_main, category_name, backers_count, pct_goal_achieved, usd_pledged, usd_goal, country, currency, campaign_length, staff_pick, blurb_length, launched FROM kickstarter_data_ds2 LIMIT 5''',engine)

(163425, 19)


Unnamed: 0,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,usd_goal,country,currency,campaign_length,staff_pick,blurb_length,launched
0,1,fashion,Fashion,30,1.0,1555.0,1500.0,US,USD,31,0,25,2011-12-01
1,0,art,Painting,1,0.0,120.0,3456.0,US,USD,46,0,24,2011-11-16
2,1,film_and_video,Shorts,43,1.5,1831.0,1200.0,US,USD,25,0,18,2011-12-07
3,1,film_and_video,Shorts,6,1.0,1010.0,1000.0,US,USD,30,0,25,2011-12-02
4,1,film_and_video,Shorts,20,1.1,1580.0,1500.0,US,USD,27,0,24,2011-12-05


In [6]:
start_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d").date()

df = df[df['launched'] >= start_date] # filter from start date to current
df = df[['state','category_main','category_name','backers_count','usd_goal','country','currency','campaign_length',
        'staff_pick','blurb_length']]
df = df.dropna()

df.shape

(38401, 10)

In [7]:
df.columns

Index(['state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object')

In [8]:
df_dummies = pd.get_dummies(df[['category_main','category_name', 'country','currency']],drop_first=True)
df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
print(df.shape)
print(df.columns)
df.info()

(38401, 197)
Index(['category_main_comics', 'category_main_crafts', 'category_main_dance',
       'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       'category_main_music',
       ...
       'state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object', length=197)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 38401 entries, 1396766240 to 1094069811
Columns: 197 entries, category_main_comics to blurb_length
dtypes: float64(1), int64(5), object(4), uint8(187)
memory usage: 11.3+ MB


In [9]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['state','category_main','category_name', 'backers_count', 'country', 'currency'], 1)
y = df['state']

print(X.shape)
X.columns
# X.head()

(38401, 191)


Index(['category_main_comics', 'category_main_crafts', 'category_main_dance',
       'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       'category_main_music',
       ...
       'currency_MXN', 'currency_NOK', 'currency_NZD', 'currency_SEK',
       'currency_SGD', 'currency_USD', 'usd_goal', 'campaign_length',
       'staff_pick', 'blurb_length'],
      dtype='object', length=191)

## Assumptions

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(26880, 191) (26880,)
(11521, 191) (11521,)


In [11]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Phase 3: GridSearchCV

## GridSearchCV: Logistic Regression

In [14]:
model = LogisticRegression()
param_grid = [{'penalty': ['l1','l2'],'C': [0.001,0.01,0.1,1,10,100,1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   1.3s
[CV] .............................. C=0.001, penalty=l1, total=   1.4s
[CV] C=0.001, penalty=l2 .............................................
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   1.5s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   2.2s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   1.4s
[CV] ...........

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.1min


[CV] ................................ C=100, penalty=l2, total=   9.6s
[CV] C=1000, penalty=l1 ..............................................
[CV] ................................ C=100, penalty=l2, total=  19.0s
[CV] C=1000, penalty=l1 ..............................................
[CV] ............................... C=1000, penalty=l1, total=  14.0s
[CV] C=1000, penalty=l2 ..............................................
[CV] ................................ C=100, penalty=l1, total=  49.9s
[CV] C=1000, penalty=l2 ..............................................
[CV] ............................... C=1000, penalty=l1, total=  21.6s
[CV] C=1000, penalty=l2 ..............................................
[CV] ............................... C=1000, penalty=l2, total=   9.7s
[CV] ............................... C=1000, penalty=l1, total=  30.2s
[CV] ............................... C=1000, penalty=l2, total=   9.7s
[CV] ............................... C=1000, penalty=l2, total=  14.2s


[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:  2.6min finished


{'C': 1000, 'penalty': 'l1'} LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) [[3925 1276]
 [1583 4737]]              precision    recall  f1-score   support

          0       0.71      0.75      0.73      5201
          1       0.79      0.75      0.77      6320

avg / total       0.75      0.75      0.75     11521



## GridSearchCV: Naive Bayes

In [15]:
model = BernoulliNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.4s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.6s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.8s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.7s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.5s
[CV] alpha=0.001 

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    5.4s finished


{'alpha': 0.1} BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True) [[3750 1451]
 [1632 4688]]              precision    recall  f1-score   support

          0       0.70      0.72      0.71      5201
          1       0.76      0.74      0.75      6320

avg / total       0.73      0.73      0.73     11521



In [16]:
model = MultinomialNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.2s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.2s
[CV] alpha=0.0001 ....................................................
[CV] ..................................... alpha=0.0001, total=   0.2s
[CV] ........................................ alpha=0.0, total=   0.2s
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.2s
[CV] alpha=0.001 

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.5s finished


## GridSearchCV: Random Forest

In [17]:
model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   1.9s
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   1.9s
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   1.9s
[CV] n_estimators=500, max_features=auto .............................
[CV] .............. n_estimators=100, max_features=auto, total=  15.2s
[CV] n_estimators=500, max_features=auto .............................
[CV] .............. n_estimators=100, max_features=auto, total=  14.9s
[CV] ...........

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  9.8min finished


{'n_estimators': 1000, 'max_features': 'auto'} RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [[3816 1385]
 [1439 4881]]              precision    recall  f1-score   support

          0       0.73      0.73      0.73      5201
          1       0.78      0.77      0.78      6320

avg / total       0.76      0.75      0.75     11521



## GridSearchCV: Gradient Boost

In [18]:
model = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   7.1s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   7.3s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   7.3s
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.1min
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.1min
[CV] n_estimators

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 12.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 12.8min finished


{'n_estimators': 500} GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False) [[4036 1165]
 [1456 4864]]              precision    recall  f1-score   support

          0       0.73      0.78      0.75      5201
          1       0.81      0.77      0.79      6320

avg / total       0.77      0.77      0.77     11521



## GridSearchCV: AdaBoost

In [19]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
param_grid = [{"base_estimator__criterion" : ["gini", "entropy"],
               "base_estimator__splitter" :   ["best", "random"],
               'n_estimators': [1, 2],
               'base_estimator__max_depth': [1, 2],
               'algorithm': ['SAMME', 'SAMME.R']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best 
[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best 
[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best 
[CV] n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best 
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best, total=   0.2s
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best, total=   0.2s
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=gini, base_estimator__splitter=best

[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random 
[CV]  n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=best, total=   0.4s
[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random 
[CV]  n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=best, total=   0.5s
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random, total=   0.4s
[CV] n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random 
[CV] n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random 
[CV]  n_estima

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.2s


[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best 
[CV] n_estimators=1, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best 
[CV]  n_estimators=2, algorithm=SAMME, base_estimator__max_depth=1, base_estimator__criterion=entropy, base_estimator__splitter=random, total=   0.3s
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best, total=   0.2s
[CV] n_estimators=2, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best 
[CV] n_estimators=2, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best 
[CV]  n_estimators=1, algorithm=SAMME, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=best, total=   0.3s
[CV] n_estimators=2, a

[CV] n_estimators=1, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=random 
[CV]  n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=best, total=   0.3s
[CV] n_estimators=1, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=random 
[CV]  n_estimators=1, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=random, total=   0.2s
[CV] n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=random 
[CV]  n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=best, total=   0.3s
[CV] n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=gini, base_estimator__splitter=random 
[CV]  n_estimators=1,

[CV]  n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=random, total=   0.3s
[CV]  n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=random, total=   0.3s
[CV]  n_estimators=2, algorithm=SAMME.R, base_estimator__max_depth=2, base_estimator__criterion=entropy, base_estimator__splitter=random, total=   0.2s


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   10.4s finished


{'n_estimators': 2, 'algorithm': 'SAMME.R', 'base_estimator__max_depth': 2, 'base_estimator__criterion': 'gini', 'base_estimator__splitter': 'best'} AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=2, random_state=None) [[3276 1925]
 [1657 4663]]              precision    recall  f1-score   support

          0       0.66      0.63      0.65      5201
          1       0.71      0.74      0.72      6320

avg / total       0.69      0.69      0.69     11521



## GridSearchCV: KNN

In [20]:
# https://stackoverflow.com/questions/37678471/i-am-trying-to-implement-gridsearchcv-to-tune-the-parameters-of-k-nearest-neighb
model = KNeighborsClassifier()
param_grid = [{'n_neighbors': list(range(1,101,2))}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] .................................... n_neighbors=1, total= 1.0min
[CV] n_neighbors=3 ...................................................
[CV] .................................... n_neighbors=1, total= 1.0min
[CV] n_neighbors=3 ...................................................
[CV] .................................... n_neighbors=1, total= 1.0min
[CV] n_neighbors=5 ...................................................
[CV] .................................... n_neighbors=3, total= 1.1min
[CV] n_neighbors=5 ...................................................
[CV] .................................... n_neighbors=3, total=  55.0s
[CV] n_neighbor

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 27.7min


[CV] ................................... n_neighbors=23, total= 1.3min
[CV] n_neighbors=25 ..................................................
[CV] ................................... n_neighbors=23, total= 1.3min
[CV] n_neighbors=25 ..................................................
[CV] ................................... n_neighbors=23, total= 1.2min
[CV] n_neighbors=27 ..................................................
[CV] ................................... n_neighbors=25, total= 1.3min
[CV] n_neighbors=27 ..................................................
[CV] ................................... n_neighbors=25, total= 1.4min
[CV] n_neighbors=27 ..................................................
[CV] ................................... n_neighbors=25, total= 1.6min
[CV] n_neighbors=29 ..................................................
[CV] ................................... n_neighbors=27, total= 1.8min
[CV] n_neighbors=29 ..................................................
[CV] .

[CV] ................................... n_neighbors=61, total= 1.3min
[CV] n_neighbors=63 ..................................................
[CV] ................................... n_neighbors=61, total= 1.4min
[CV] n_neighbors=65 ..................................................
[CV] ................................... n_neighbors=63, total= 1.4min
[CV] n_neighbors=65 ..................................................
[CV] ................................... n_neighbors=63, total= 1.3min
[CV] n_neighbors=65 ..................................................
[CV] ................................... n_neighbors=63, total= 1.3min
[CV] n_neighbors=67 ..................................................
[CV] ................................... n_neighbors=65, total= 1.5min
[CV] n_neighbors=67 ..................................................
[CV] ................................... n_neighbors=65, total= 1.5min
[CV] n_neighbors=67 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 168.7min finished


{'n_neighbors': 25} KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=25, p=2,
           weights='uniform') [[3800 1401]
 [1807 4513]]              precision    recall  f1-score   support

          0       0.68      0.73      0.70      5201
          1       0.76      0.71      0.74      6320

avg / total       0.72      0.72      0.72     11521



## XG Boost

In [12]:
model = XGBClassifier()
param_grid = [{'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic','reg:linear'],
              'learning_rate': [0.05,0.1], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7,0.8],
              'n_estimators': [5,500], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [0,1337]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6 
[CV] learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6 
[CV] learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6 
[CV] learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6, total=   1.9s
[CV]  learn

[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=0, subsample=0.8, max_depth=6, total= 1.5min
[CV] learning_rate=0.1, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=1337, subsample=0.8, max_depth=6, total= 1.5min
[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=1337, subsample=0.8, max_depth=6, total= 1.5min
[CV] learning_rate=0.1, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6 
[CV] learning_rate=0.1, objective=binary:logistic,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.4min


[CV]  learning_rate=0.1, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6, total=   1.1s
[CV] learning_rate=0.1, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=0, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.1, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6, total=   1.2s
[CV] learning_rate=0.1, objective=binary:logistic, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=0, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.1, objective=reg:linear, colsample_bytree=0.7, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6, total=   1.2s
[CV] learning_rate=0.1, objective=binary:logistic, c

[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6, total=   1.4s
[CV] learning_rate=0.05, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6, total=   1.4s
[CV] learning_rate=0.05, objective=binary:logistic, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=500, seed=0, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.05, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6, total=   1.3s
[CV] learning_rate=0.05, objective=binary:logistic, colsa

[CV]  learning_rate=0.1, objective=binary:logistic, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6, total=   1.5s
[CV] learning_rate=0.1, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.1, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6, total=   1.5s
[CV] learning_rate=0.1, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=1337, subsample=0.8, max_depth=6 
[CV]  learning_rate=0.1, objective=reg:linear, colsample_bytree=0.8, min_child_weight=11, missing=-999, nthread=4, silent=1, n_estimators=5, seed=0, subsample=0.8, max_depth=6, total=   1.6s
[CV] learning_rate=0.1, objective=reg:linear, colsample_bytre

[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 19.4min finished


{'learning_rate': 0.05, 'objective': 'reg:linear', 'colsample_bytree': 0.7, 'seed': 1337, 'nthread': 4, 'missing': -999, 'silent': 1, 'n_estimators': 500, 'min_child_weight': 11, 'subsample': 0.8, 'max_depth': 6} XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1337, silent=1, subsample=0.8) [[4037 1164]
 [1445 4875]]              precision    recall  f1-score   support

          0       0.74      0.78      0.76      5201
          1       0.81      0.77      0.79      6320

avg / total       0.78      0.77      0.77     11521



## SVM

In [13]:
model = SVC()
param_grid = [{'C': [0.001, 0.01, 0.1, 1, 10],'gamma':[0.001, 0.01, 0.1, 1]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=0.001, gamma=0.001 ............................................
[CV] C=0.001, gamma=0.001 ............................................
[CV] C=0.001, gamma=0.001 ............................................
[CV] C=0.001, gamma=0.01 .............................................
[CV] .............................. C=0.001, gamma=0.01, total= 4.1min
[CV] C=0.001, gamma=0.01 .............................................
[CV] ............................. C=0.001, gamma=0.001, total= 4.4min
[CV] C=0.001, gamma=0.01 .............................................
[CV] ............................. C=0.001, gamma=0.001, total= 4.4min
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ............................. C=0.001, gamma=0.001, total= 4.4min
[CV] C=0.001, gamma=0.1 ..............................................
[CV] .............................. C=0.001, gamma=0.01, total= 5.7min
[CV] C=0.001, ga

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 58.0min


[CV] ................................... C=0.1, gamma=1, total= 3.8min
[CV] C=1, gamma=0.001 ................................................
[CV] ................................... C=0.1, gamma=1, total= 3.8min
[CV] C=1, gamma=0.001 ................................................
[CV] ................................... C=0.1, gamma=1, total= 3.8min
[CV] C=1, gamma=0.01 .................................................
[CV] ................................. C=1, gamma=0.001, total= 2.8min
[CV] C=1, gamma=0.01 .................................................
[CV] ................................. C=1, gamma=0.001, total= 2.8min
[CV] C=1, gamma=0.01 .................................................
[CV] ................................. C=1, gamma=0.001, total= 2.9min
[CV] C=1, gamma=0.1 ..................................................
[CV] .................................. C=1, gamma=0.01, total= 3.0min
[CV] C=1, gamma=0.1 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 100.5min finished


{'C': 10, 'gamma': 0.001} SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) [[4034 1167]
 [1866 4454]]              precision    recall  f1-score   support

          0       0.68      0.78      0.73      5201
          1       0.79      0.70      0.75      6320

avg / total       0.74      0.74      0.74     11521

