# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier

import datetime
import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)

In [29]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [30]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [31]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [32]:
df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df['idx'] = df['id']
df = df.set_index('idx')
# df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
# df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
print(df.shape)
# pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data_ds2 LIMIT 5''',engine)
df.head()

(163425, 18)


Unnamed: 0_level_0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,usd_goal,country,currency,campaign_length,deadline,launched,created,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1396766240,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,B.Y. Eidelman,16
2065169465,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,Holly Weist,5
1647325451,1647325451,Code Switch,successful,film_and_video,Horror,34,1.5,4611.0,3000.0,US,USD,32,2018-01-12,2017-12-11,2017-11-10,0,Alba Roland,23
727157486,727157486,Rain Dog Farm,failed,food,Farms,49,0.3,4741.0,18000.0,US,USD,38,2018-01-12,2017-12-05,2017-11-28,1,Charlie Wainger,9
1756145145,1756145145,WANGTA: a novel,successful,publishing,Fiction,13,1.0,427.185132,427.185132,CA,CAD,21,2018-01-12,2017-12-22,2017-12-18,0,D. H. de Bruin,22


In [33]:
start_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d").date()

df = df[df['launched'] >= start_date] # filter from start date to current
df = df[['state','category_main','category_name','backers_count','usd_goal','country','currency','campaign_length',
        'staff_pick','blurb_length']]
df = df.dropna()
df.shape

(38401, 10)

In [34]:
df.columns

Index(['state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object')

In [35]:
df_dummies = pd.get_dummies(df[['state','category_main','category_name', 'country','currency']],drop_first=True)
df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
print(df.shape)
print(df.columns)
df.info()

(38401, 198)
Index(['state_successful', 'category_main_comics', 'category_main_crafts',
       'category_main_dance', 'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       ...
       'state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object', length=198)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 38401 entries, 1396766240 to 1094069811
Columns: 198 entries, state_successful to blurb_length
dtypes: float64(1), int64(4), object(5), uint8(188)
memory usage: 11.4+ MB


In [37]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['state_successful','state','category_main','category_name', 'backers_count', 'country', 'currency'], 1)
y = df['state_successful']

print(X.shape)
X.columns
# X.head()

(38401, 191)


Index(['category_main_comics', 'category_main_crafts', 'category_main_dance',
       'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       'category_main_music',
       ...
       'currency_MXN', 'currency_NOK', 'currency_NZD', 'currency_SEK',
       'currency_SGD', 'currency_USD', 'usd_goal', 'campaign_length',
       'staff_pick', 'blurb_length'],
      dtype='object', length=191)

## Assumptions

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(26880, 191) (26880,)
(11521, 191) (11521,)


In [39]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# print(X_train_s.mean(axis=0))
# print(X_train_s.std(axis=0))

# X_combined_s = np.vstack((X_train_s, X_test_s))
# y_combined = np.hstack((y_train, y_test))

# Phase 3: GridSearchCV

## GridSearchCV: Logistic Regression

In [40]:
model = LogisticRegression()
param_grid = [{'penalty': ['l1','l2'],'C': [0.001,0.01,0.1,1,10,100,1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] penalty=l1, C=0.001 .............................................
[CV] penalty=l1, C=0.001 .............................................
[CV] penalty=l1, C=0.001 .............................................
[CV] penalty=l2, C=0.001 .............................................
[CV] .............................. penalty=l1, C=0.001, total=   1.5s
[CV] .............................. penalty=l1, C=0.001, total=   1.4s
[CV] penalty=l2, C=0.001 .............................................
[CV] .............................. penalty=l1, C=0.001, total=   1.8s
[CV] penalty=l2, C=0.001 .............................................
[CV] penalty=l1, C=0.01 ..............................................
[CV] .............................. penalty=l2, C=0.001, total=   2.2s
[CV] penalty=l1, C=0.01 ..............................................
[CV] ............................... penalty=l1, C=0.01, total=   1.1s
[CV] penalty=l1,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min


[CV] ................................ penalty=l1, C=100, total=  57.6s
[CV] penalty=l1, C=1000 ..............................................
[CV] ................................ penalty=l2, C=100, total=  13.5s
[CV] penalty=l1, C=1000 ..............................................
[CV] ................................ penalty=l1, C=100, total= 1.1min
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l1, C=1000, total=  22.5s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l1, C=1000, total=  18.4s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=  15.2s
[CV] ............................... penalty=l2, C=1000, total=  13.4s
[CV] ............................... penalty=l2, C=1000, total=  22.4s
[CV] ............................... penalty=l1, C=1000, total=  39.2s


[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:  3.2min finished


{'penalty': 'l1', 'C': 1000} LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) [[3925 1276]
 [1565 4755]]              precision    recall  f1-score   support

          0       0.71      0.75      0.73      5201
          1       0.79      0.75      0.77      6320

avg / total       0.76      0.75      0.75     11521



## GridSearchCV: Naive Bayes

In [41]:
model = BernoulliNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.5s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.5s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.6s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.6s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.5s
[CV] alpha=0.001 

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    5.4s finished


{'alpha': 0.5} BernoulliNB(alpha=0.5, binarize=0.0, class_prior=None, fit_prior=True) [[3755 1446]
 [1638 4682]]              precision    recall  f1-score   support

          0       0.70      0.72      0.71      5201
          1       0.76      0.74      0.75      6320

avg / total       0.73      0.73      0.73     11521



In [42]:
model = MultinomialNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0 .......................................................
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.1s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.2s
[CV] alpha=0.0001 ....................................................
[CV] ........................................ alpha=0.0, total=   0.2s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.2s
[CV] alpha=0.001 .....................................................
[CV] ..................................... alpha=0.0001, total=   0.2s
[CV] alpha=0.001 

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.4s finished


{'alpha': 0.0} MultinomialNB(alpha=0.0, class_prior=None, fit_prior=True) [[1340 3861]
 [ 559 5761]]              precision    recall  f1-score   support

          0       0.71      0.26      0.38      5201
          1       0.60      0.91      0.72      6320

avg / total       0.65      0.62      0.57     11521



## GridSearchCV: Random Forest

In [43]:
model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   2.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   2.4s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   2.4s
[CV] max_features=auto, n_estimators=500 .............................
[CV] .............. max_features=auto, n_estimators=100, total=  18.3s
[CV] max_features=auto, n_estimators=500 .............................
[CV] .............. max_features=auto, n_estimators=100, total=  18.2s
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 15.5min finished


{'max_features': 'auto', 'n_estimators': 1000} RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [[3855 1346]
 [1453 4867]]              precision    recall  f1-score   support

          0       0.73      0.74      0.73      5201
          1       0.78      0.77      0.78      6320

avg / total       0.76      0.76      0.76     11521



## GridSearchCV: Gradient Boost

In [44]:
model = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=  12.0s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=  12.1s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=  12.0s
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.5min
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=100, total= 1.5min
[CV] n_estimators

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.3min finished


{'n_estimators': 500} GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False) [[4065 1136]
 [1432 4888]]              precision    recall  f1-score   support

          0       0.74      0.78      0.76      5201
          1       0.81      0.77      0.79      6320

avg / total       0.78      0.78      0.78     11521



## GridSearchCV: AdaBoost

In [45]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
param_grid = [{"base_estimator__criterion" : ["gini", "entropy"],
               "base_estimator__splitter" :   ["best", "random"],
               'n_estimators': [1, 2],
               'base_estimator__max_depth': [1, 2],
               'algorithm': ['SAMME', 'SAMME.R']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=1 
[CV] base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=1 
[CV] base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=1 
[CV] base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=2 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=1, total=   0.2s
[CV] base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=2 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=gini, n_estimators=1, total=   0.2s

[CV] base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2, total=   0.2s
[CV] base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1 
[CV]  base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1, total=   0.2s
[CV] base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2, total=   0.2s
[CV]  base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1, total=   0.2s

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.3s


[CV] base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1 
[CV]  base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2, total=   0.2s
[CV]  base_estimator__splitter=random, base_estimator__max_depth=1, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2, total=   0.2s
[CV] base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1 
[CV] base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=1, total=   0.3s
[CV] base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME, base_estimator__criterion=entropy, n_estimators=2 
[CV]  base_estimator

[CV]  base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=2, total=   0.4s
[CV]  base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=2, total=   0.4s
[CV] base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=1 
[CV] base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=1 
[CV]  base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=1, total=   0.2s
[CV] base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=2 
[CV]  base_estimator__splitter=best, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=gini, n_estimators=2, total=   0.4s
[CV] bas

[CV]  base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=entropy, n_estimators=1, total=   0.3s
[CV]  base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV]  base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=entropy, n_estimators=2, total=   0.4s
[CV]  base_estimator__splitter=random, base_estimator__max_depth=2, algorithm=SAMME.R, base_estimator__criterion=entropy, n_estimators=2, total=   0.3s


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    9.7s finished


{'n_estimators': 2, 'base_estimator__splitter': 'best', 'algorithm': 'SAMME.R', 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 2} AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=2, random_state=None) [[3294 1907]
 [1642 4678]]              precision    recall  f1-score   support

          0       0.67      0.63      0.65      5201
          1       0.71      0.74      0.72      6320

avg / total       0.69      0.69      0.69     11521



## GridSearchCV: KNN

In [46]:
# https://stackoverflow.com/questions/37678471/i-am-trying-to-implement-gridsearchcv-to-tune-the-parameters-of-k-nearest-neighb
model = KNeighborsClassifier()
param_grid = [{'n_neighbors': list(range(1,101,3))}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 34 candidates, totalling 102 fits
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total=  55.2s
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total=  55.6s
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total=  56.0s
[CV] n_neighbors=7 ...................................................
[CV] .................................... n_neighbors=4, total= 1.0min
[CV] n_neighbors=7 ...................................................
[CV] .................................... n_neighbors=4, total= 1.0min
[CV] n_neighbor

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 28.9min


[CV] ................................... n_neighbors=34, total= 1.5min
[CV] n_neighbors=37 ..................................................
[CV] ................................... n_neighbors=34, total= 1.5min
[CV] n_neighbors=37 ..................................................
[CV] ................................... n_neighbors=34, total= 1.5min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 1.5min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 1.5min
[CV] n_neighbors=40 ..................................................
[CV] ................................... n_neighbors=37, total= 1.4min
[CV] n_neighbors=43 ..................................................
[CV] ................................... n_neighbors=40, total= 1.4min
[CV] n_neighbors=43 ..................................................
[CV] .

[CV] ................................... n_neighbors=91, total= 1.8min
[CV] n_neighbors=94 ..................................................
[CV] ................................... n_neighbors=91, total= 1.4min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 1.5min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 1.5min
[CV] n_neighbors=97 ..................................................
[CV] ................................... n_neighbors=94, total= 1.5min
[CV] n_neighbors=100 .................................................
[CV] ................................... n_neighbors=97, total= 1.5min
[CV] n_neighbors=100 .................................................
[CV] ................................... n_neighbors=97, total= 1.4min
[CV] n_neighbors=100 .................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed: 99.6min finished


{'n_neighbors': 25} KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=25, p=2,
           weights='uniform') [[3799 1402]
 [1851 4469]]              precision    recall  f1-score   support

          0       0.67      0.73      0.70      5201
          1       0.76      0.71      0.73      6320

avg / total       0.72      0.72      0.72     11521



## XG Boost

In [47]:
model = XGBClassifier()
param_grid = [{'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic','reg:linear'],
              'learning_rate': [0.05,0.1], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7,0.8],
              'n_estimators': [5,500], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [0,1337]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8 
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8 
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8 
[CV] seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8, total=   1.6s
[CV] seed=1

[CV]  seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total= 1.6min
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total= 1.6min
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total= 1.6min
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nt

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.7min


[CV]  seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8, total=   1.7s
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.1, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8, total=   1.9s
[CV] seed=0, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.1, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=1337, colsample_bytree=0.7, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8, total=   1.6s
[CV] seed=1337, colsample_bytree=0.7, max_depth=6, n

[CV]  seed=0, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total=   1.3s
[CV] seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8 
[CV]  seed=0, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total=   1.3s
[CV] seed=0, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=500, learning_rate=0.05, silent=1, objective=binary:logistic, subsample=0.8 
[CV]  seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.05, silent=1, objective=reg:linear, subsample=0.8, total=   1.2s
[CV] seed=0, colsample_bytree=0.8, max_depth=6, nthread=4

[CV]  seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=binary:logistic, subsample=0.8, total=   1.4s
[CV] seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8 
[CV]  seed=0, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8, total=   1.3s
[CV] seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8 
[CV]  seed=0, colsample_bytree=0.8, max_depth=6, nthread=4, min_child_weight=11, missing=-999, n_estimators=5, learning_rate=0.1, silent=1, objective=reg:linear, subsample=0.8, total=   1.4s
[CV] seed=1337, colsample_bytree=0.8, max_depth=6, nthread=4,

[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 18.4min finished


{'seed': 0, 'colsample_bytree': 0.7, 'max_depth': 6, 'nthread': 4, 'min_child_weight': 11, 'missing': -999, 'n_estimators': 500, 'learning_rate': 0.05, 'subsample': 0.8, 'objective': 'reg:linear', 'silent': 1} XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.8) [[4062 1139]
 [1411 4909]]              precision    recall  f1-score   support

          0       0.74      0.78      0.76      5201
          1       0.81      0.78      0.79      6320

avg / total       0.78      0.78      0.78     11521

