# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier

import datetime
import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df['idx'] = df['id']
df = df.set_index('idx')
# df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
# df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
print(df.shape)
# pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data_ds2 LIMIT 5''',engine)
df.head()

(147803, 19)


Unnamed: 0_level_0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,usd_goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1725323227,1725323227,Inspire young girls,failed,fashion,Childrenswear,1,0.0,30.0,1300.0,US,USD,30,2018-01-12,2017-12-13,2017-12-08,0,0,Rayna,6
2065169465,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,1,Holly Weist,5
1516902916,1516902916,Dreaming Creek Brewery,failed,food,Drinks,64,0.3,6139.0,20000.0,US,USD,30,2018-01-12,2017-12-13,2017-08-11,0,0,Mike Bradley,19
1396766240,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,0,B.Y. Eidelman,16
549309727,549309727,Higher Grounds Coffee Company Mobile Coffee Tr...,successful,food,Food Trucks,61,1.0,10060.0,10000.0,US,USD,36,2018-01-12,2017-12-07,2017-08-16,1,0,Katreena Powell,21


In [6]:
start_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d").date()

df = df[df['launched'] >= start_date] # filter from start date to current
df.shape

(38402, 19)

In [7]:
df.columns

Index(['id', 'name', 'state', 'category_main', 'category_name',
       'backers_count', 'pct_goal_achieved', 'usd_pledged', 'usd_goal',
       'country', 'currency', 'campaign_length', 'deadline', 'launched',
       'created', 'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')

In [9]:
df_dummies = pd.get_dummies(df[['state','category_name', 'country','currency']],drop_first=True)
df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
print(df.shape)
print(df.columns)
df.info()

(38416, 367)
Index(['state_successful_x', 'category_name_Academic_x',
       'category_name_Accessories_x', 'category_name_Action_x',
       'category_name_Animals_x', 'category_name_Animation_x',
       'category_name_Anthologies_x', 'category_name_Apparel_x',
       'category_name_Apps_x', 'category_name_Architecture_x',
       ...
       'country', 'currency', 'campaign_length', 'deadline', 'launched',
       'created', 'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object', length=367)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 38416 entries, 18520 to 2147422173
Columns: 367 entries, state_successful_x to blurb_length
dtypes: float64(3), int64(6), object(10), uint8(348)
memory usage: 18.6+ MB


In [11]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['state_successful_x','id', 'name', 'state', 'category_main','category_name', 'backers_count','pct_goal_achieved', 'usd_pledged', 'country', 'currency',
       'deadline', 'launched', 'created','creator_name','spotlight'], 1)
y = df['state_successful_x']

print(X.shape)
X.columns
# X.head()

(38416, 351)


Index(['category_name_Academic_x', 'category_name_Accessories_x',
       'category_name_Action_x', 'category_name_Animals_x',
       'category_name_Animation_x', 'category_name_Anthologies_x',
       'category_name_Apparel_x', 'category_name_Apps_x',
       'category_name_Architecture_x', 'category_name_Art Books_x',
       ...
       'currency_MXN_y', 'currency_NOK_y', 'currency_NZD_y', 'currency_SEK_y',
       'currency_SGD_y', 'currency_USD_y', 'usd_goal', 'campaign_length',
       'staff_pick', 'blurb_length'],
      dtype='object', length=351)

## Assumptions

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(26891, 351) (26891,)
(11525, 351) (11525,)


In [13]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# print(X_train_s.mean(axis=0))
# print(X_train_s.std(axis=0))

# X_combined_s = np.vstack((X_train_s, X_test_s))
# y_combined = np.hstack((y_train, y_test))

# Phase 3: GridSearchCV

## GridSearchCV: Logistic Regression

In [15]:
model = LogisticRegression()
param_grid = [{'penalty': ['l1','l2'],'C': [0.001,0.01,0.1,1,10,100,1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   2.2s
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   2.3s
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   2.2s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   2.7s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   2.7s
[CV] C=0.01, pen

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min


[CV] ................................ C=100, penalty=l1, total=  22.3s
[CV] C=1000, penalty=l1 ..............................................
[CV] ................................ C=100, penalty=l1, total=  25.1s
[CV] C=1000, penalty=l1 ..............................................
[CV] ................................ C=100, penalty=l2, total=   7.2s
[CV] C=1000, penalty=l2 ..............................................
[CV] ............................... C=1000, penalty=l2, total=   7.6s
[CV] C=1000, penalty=l2 ..............................................
[CV] ............................... C=1000, penalty=l2, total=   9.6s
[CV] C=1000, penalty=l2 ..............................................
[CV] ............................... C=1000, penalty=l1, total=  21.6s
[CV] ............................... C=1000, penalty=l1, total=  24.7s
[CV] ............................... C=1000, penalty=l1, total=  28.6s
[CV] ............................... C=1000, penalty=l2, total=   5.6s


[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:  1.9min finished


{'C': 0.1, 'penalty': 'l1'} LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) [[5292  956]
 [2167 4154]]              precision    recall  f1-score   support

          0       0.71      0.85      0.77      6248
          1       0.81      0.66      0.73      6321

avg / total       0.76      0.75      0.75     12569



## GridSearchCV: Naive Bayes

In [16]:
model = BernoulliNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.7s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.7s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.8s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.8s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.9s
[CV] ............

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    7.6s finished


{'alpha': 2.0} BernoulliNB(alpha=2.0, binarize=0.0, class_prior=None, fit_prior=True) [[5252  996]
 [2206 4115]]              precision    recall  f1-score   support

          0       0.70      0.84      0.77      6248
          1       0.81      0.65      0.72      6321

avg / total       0.75      0.75      0.74     12569



In [17]:
model = MultinomialNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.4s
[CV] ........................................ alpha=0.0, total=   0.3s
[CV] ........................................ alpha=0.0, total=   0.4s
[CV] ..................................... alpha=0.0001, total=   0.3s
[CV] alpha=0.0001 ....................................................
[CV] alpha=0.0001 ....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ...................................... alpha=0.001, total=   0.3s
[CV] ............

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    4.0s finished


{'alpha': 0.5} MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True) [[5004 1244]
 [2029 4292]]              precision    recall  f1-score   support

          0       0.71      0.80      0.75      6248
          1       0.78      0.68      0.72      6321

avg / total       0.74      0.74      0.74     12569



## GridSearchCV: Random Forest

In [18]:
model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   3.8s
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   4.3s
[CV] n_estimators=100, max_features=auto .............................
[CV] ............... n_estimators=10, max_features=auto, total=   4.3s
[CV] n_estimators=500, max_features=auto .............................
[CV] .............. n_estimators=100, max_features=auto, total=  36.8s
[CV] n_estimators=500, max_features=auto .............................
[CV] .............. n_estimators=100, max_features=auto, total=  37.0s
[CV] n_estimator

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 24.4min finished


{'n_estimators': 1000, 'max_features': 'sqrt'} RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [[4814 1434]
 [1996 4325]]              precision    recall  f1-score   support

          0       0.71      0.77      0.74      6248
          1       0.75      0.68      0.72      6321

avg / total       0.73      0.73      0.73     12569



## GridSearchCV: Gradient Boost

In [19]:
model = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   8.9s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   9.0s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=   9.1s
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.1min
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.1min
[CV] n_estimators

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.7min finished


{'n_estimators': 500} GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False) [[5338  910]
 [2217 4104]]              precision    recall  f1-score   support

          0       0.71      0.85      0.77      6248
          1       0.82      0.65      0.72      6321

avg / total       0.76      0.75      0.75     12569



## GridSearchCV: AdaBoost

In [24]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
param_grid = [{"base_estimator__criterion" : ["gini", "entropy"],
               "base_estimator__splitter" :   ["best", "random"],
               'n_estimators': [1, 2],
               'base_estimator__max_depth': [1, 2],
               'algorithm': ['SAMME', 'SAMME.R']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=1 
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=1 
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=1 
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=2 
[CV]  algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=1, total=   0.4s
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=2 
[CV]  algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=gini, n_estimators=1, total=   0.4s

[CV]  algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2, total=   0.5s
[CV] algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=1 
[CV] algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=1 
[CV]  algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=1, total=   0.3s
[CV]  algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV] algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2 
[CV] algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2 
[CV]  algorith

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.3s


[CV]  algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=1 
[CV]  algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=1 
[CV]  algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=1, total=   0.3s
[CV]  algorithm=SAMME, base_estimator__splitter=random, base_estimator__max_depth=1, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV] algorithm=SAMME, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=2 
[CV

[CV] algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=1 
[CV]  algorithm=SAMME.R, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=2, total=   0.6s
[CV]  algorithm=SAMME.R, base_estimator__splitter=best, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=2, total=   0.6s
[CV] algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=1 
[CV] algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=1 
[CV]  algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=1, total=   0.4s
[CV] algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=gini, n_estimators=2 
[CV]  algorithm=SAMME

[CV] algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=2 
[CV]  algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=1, total=   0.6s
[CV]  algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=2, total=   0.7s
[CV]  algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=2, total=   0.7s
[CV]  algorithm=SAMME.R, base_estimator__splitter=random, base_estimator__max_depth=2, base_estimator__criterion=entropy, n_estimators=2, total=   0.5s


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   15.4s finished


{'algorithm': 'SAMME.R', 'base_estimator__splitter': 'best', 'base_estimator__max_depth': 2, 'base_estimator__criterion': 'gini', 'n_estimators': 2} AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=2, random_state=None) [[5225 1023]
 [3429 2892]]              precision    recall  f1-score   support

          0       0.60      0.84      0.70      6248
          1       0.74      0.46      0.57      6321

avg / total       0.67      0.65      0.63     12569



## GridSearchCV: KNN

In [14]:
# https://stackoverflow.com/questions/37678471/i-am-trying-to-implement-gridsearchcv-to-tune-the-parameters-of-k-nearest-neighb
model = KNeighborsClassifier()
param_grid = [{'n_neighbors': list(range(1,101,3))}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 34 candidates, totalling 102 fits
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total= 1.8min
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total= 1.8min
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total= 2.1min
[CV] n_neighbors=7 ...................................................
[CV] .................................... n_neighbors=4, total= 2.4min
[CV] n_neighbors=7 ...................................................
[CV] .................................... n_neighbors=4, total= 1.9min
[CV] n_neighbor

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 50.0min


[CV] ................................... n_neighbors=34, total= 2.8min
[CV] n_neighbors=37 ..................................................
[CV] ................................... n_neighbors=34, total= 2.7min
[CV] n_neighbors=37 ..................................................
[CV] ................................... n_neighbors=34, total= 2.7min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 2.7min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 2.5min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 2.9min
[CV] n_neighbors=43 ..................................................
[CV] ................................... n_neighbors=40, total= 3.0min
[CV] n_neighbors=43 ..................................................
[CV] .

[CV] ................................... n_neighbors=91, total= 2.8min
[CV] n_neighbors=94 ..................................................
[CV] ................................... n_neighbors=91, total= 3.1min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 3.1min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 2.6min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 2.6min
[CV] n_neighbors=100 .................................................
[CV] ................................... n_neighbors=97, total= 2.4min
[CV] n_neighbors=100 .................................................
[CV] ................................... n_neighbors=97, total= 2.0min
[CV] n_neighbors=100 .................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed: 187.6min finished


{'n_neighbors': 7} KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform') [[4944  256]
 [ 296 6029]]              precision    recall  f1-score   support

          0       0.94      0.95      0.95      5200
          1       0.96      0.95      0.96      6325

avg / total       0.95      0.95      0.95     11525



## XG Boost

In [15]:
model = XGBClassifier()
param_grid = [{'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic','reg:linear'],
              'learning_rate': [0.05,0.1], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7,0.8],
              'n_estimators': [5,500], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [0,1337]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total=   1.5s
[CV] missin

[CV]  missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total= 1.7min
[CV] missing=-999, learning_rate=0.1, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total= 1.7min
[CV] missing=-999, learning_rate=0.1, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total= 1.7min
[CV] missing=-999, learning_rate=0.1, objective=bi

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min


[CV]  missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total=   1.4s
[CV] missing=-999, learning_rate=0.1, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.7, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total=   1.4s
[CV] missing=-999, learning_rate=0.1, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.1, objective=binary:logistic, max_depth=6, colsample_bytree=0.7, n_estimators=500, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total= 1.4min
[CV] missing=-999, learning_rate=0.1, objecti

[CV] missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1 
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.8, n_estimators=500, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total=   1.5s
[CV] missing=-999, learning_rate=0.05, objective=binary:logistic, max_depth=6, colsample_bytree=0.8, n_estimators=500, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.05, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1, total=   1.6s
[CV] missing=-999, learning_rate=0.05, objective=binary:logist

[CV]  missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total=   1.7s
[CV] missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total=   1.8s
[CV] missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=1337, subsample=0.8, silent=1 
[CV]  missing=-999, learning_rate=0.1, objective=reg:linear, max_depth=6, colsample_bytree=0.8, n_estimators=5, nthread=4, min_child_weight=11, seed=0, subsample=0.8, silent=1, total=   1.7s
[CV] missing=-999, learning_rate=0.1, objective=binary:logistic, max_

[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 20.8min finished


{'missing': -999, 'learning_rate': 0.05, 'objective': 'reg:linear', 'colsample_bytree': 0.7, 'n_estimators': 500, 'seed': 0, 'min_child_weight': 11, 'max_depth': 6, 'nthread': 4, 'subsample': 0.8, 'silent': 1} XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.8) [[5348  900]
 [2201 4120]]              precision    recall  f1-score   support

          0       0.71      0.86      0.78      6248
          1       0.82      0.65      0.73      6321

avg / total       0.76      0.75      0.75     12569

