# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

import datetime
import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df['idx'] = df['id']
df = df.set_index('idx')
# df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
# df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
print(df.shape)
# pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data_ds2 LIMIT 5''',engine)
df.head()

(163426, 19)


Unnamed: 0_level_0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1725323227,1725323227,Inspire young girls,failed,fashion,Childrenswear,1,0.0,30.0,1300.0,US,USD,30,2018-01-12,2017-12-13,2017-12-08,0,0,Rayna,6
2065169465,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,1,Holly Weist,5
1516902916,1516902916,Dreaming Creek Brewery,failed,food,Drinks,64,0.3,6139.0,20000.0,US,USD,30,2018-01-12,2017-12-13,2017-08-11,0,0,Mike Bradley,19
1396766240,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,0,B.Y. Eidelman,16
1361347175,1361347175,New Boutique Funding for the San Antonio Stock...,failed,fashion,Ready-to-wear,0,0.0,0.0,5000.0,US,USD,15,2018-01-12,2017-12-28,2017-12-22,0,0,Darrian Fosty,26


In [6]:
start_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d").date()

df = df[df['launched'] >= start_date] # filter from start date to current
df.shape

(41894, 19)

In [7]:
df.columns

Index(['id', 'name', 'state', 'category_main', 'category_name',
       'backers_count', 'pct_goal_achieved', 'usd_pledged', 'goal', 'country',
       'currency', 'campaign_length', 'deadline', 'launched', 'created',
       'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')

In [8]:
df_dummies = pd.get_dummies(df[['state','category_name', 'country']],drop_first=True)
df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
print(df.shape)
print(df.columns)
df.info()

(41896, 180)
Index(['state_successful', 'category_name_Academic',
       'category_name_Accessories', 'category_name_Action',
       'category_name_Animals', 'category_name_Animation',
       'category_name_Anthologies', 'category_name_Apparel',
       'category_name_Apps', 'category_name_Architecture',
       ...
       'country', 'currency', 'campaign_length', 'deadline', 'launched',
       'created', 'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object', length=180)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41896 entries, 18520 to 2147422173
Columns: 180 entries, state_successful to blurb_length
dtypes: float64(3), int64(6), object(10), uint8(161)
memory usage: 12.8+ MB


In [9]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['state_successful','id', 'name', 'state', 'category_main','category_name', 'backers_count','pct_goal_achieved', 'usd_pledged', 'goal', 'country', 'currency',
       'deadline', 'launched', 'created','creator_name','spotlight'], 1)
y = df['state_successful']

print(X.shape)
X.columns
# X.head()

(41896, 163)


Index(['category_name_Academic', 'category_name_Accessories',
       'category_name_Action', 'category_name_Animals',
       'category_name_Animation', 'category_name_Anthologies',
       'category_name_Apparel', 'category_name_Apps',
       'category_name_Architecture', 'category_name_Art Books',
       ...
       'country_MX', 'country_NL', 'country_NO', 'country_NZ', 'country_SE',
       'country_SG', 'country_US', 'campaign_length', 'staff_pick',
       'blurb_length'],
      dtype='object', length=163)

## Assumptions

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(29327, 163) (29327,)
(12569, 163) (12569,)


In [11]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# print(X_train_s.mean(axis=0))
# print(X_train_s.std(axis=0))

# X_combined_s = np.vstack((X_train_s, X_test_s))
# y_combined = np.hstack((y_train, y_test))

[ 3.56155700e-17  1.01758771e-17 -9.69131157e-19  1.84134920e-17
 -4.09457914e-17  1.73232194e-17 -1.69597952e-18  9.44902878e-18
  1.18718567e-17  6.29935252e-18 -3.31927421e-17 -1.57483813e-17
 -1.06604427e-17  2.95585003e-17 -1.12661497e-17 -3.87652463e-18
  4.07035086e-17  8.23761483e-18 -1.16295739e-17  4.84565578e-18
 -9.20674599e-18  4.94256890e-17 -7.26848367e-18  1.16295739e-17
  5.69364555e-18  1.93826231e-17 -1.04181599e-17 -1.04181599e-17
  8.72218041e-18  3.05276314e-17  2.90739347e-18 -3.87652463e-17
 -1.58695227e-17 -1.67175125e-17 -2.03517543e-17  5.18485169e-17
 -5.81478694e-18  1.93826231e-18 -1.01758771e-17 -2.27745822e-17
 -9.20674599e-18  3.29504593e-17 -2.22900166e-17  1.04181599e-17
  8.72218041e-18  4.79719923e-17 -1.67175125e-17  1.67175125e-17
 -3.48887216e-17 -2.42282789e-18  2.83470863e-17  9.69131157e-19
 -1.67175125e-17  2.01094715e-17  1.59906641e-17  1.25987050e-17
 -2.90739347e-18 -1.45369673e-18  4.28840537e-17  6.12975457e-17
  1.93826231e-17 -4.31263

## Phase 2: Deeper Dive: Logistic Regression, Naive Bayes, Random Forests?

In [None]:
# Define models to test
model_list = [  ['BernoulliNB', BernoulliNB(alpha=2.0, binarize=0.0, class_prior=None, fit_prior=True)], # F1 0.60
                ['MultinomialNB', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)], 
#                 ['DecisionTree', DecisionTreeClassifier()], 
                ['RandomForest', RandomForestClassifier(max_features='sqrt',n_estimators=1000)], # F1 0.61
                ['GradientBoost', GradientBoostingClassifier(n_estimators=500)],
                ['AdaBoost', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())],
#                 ['KNN', KNeighborsClassifier(38)], # best k from KNN model below; scale data
                ['LogisticRegression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                      penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
                      verbose=0, warm_start=False)]]
#                 ['LogisticRegressionCV', LogisticRegressionCV()]] # scale data; F1 0.62
#                 ['SVM', SVC(probability=True)]] # scale data; F1 0.57

model_list_s = [['LogisticRegression', LogisticRegression()]] # scale data
#                 ['LogisticRegressionCV', LogisticRegressionCV()]] # scale data
#                 ['SVM', SVC(probability=True)]] # scale data

# Calculate metrics for each model
roc = {}
for model in model_list:
    if model in model_list_s:
        X_train = X_train_s
        X_test = X_test_s
    
    model_name = model[0]
    model = model[1]
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
    error_rate = []
        
    # Perform K-Fold CV and calculate metrics for each fold
    kf = KFold(5, random_state=42, shuffle=True) 
    for train_idx, test_idx in kf.split(X, y=y):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred))
        error_rate.append(np.mean(y_pred != y_test))
        
    # Calculate mean metric across K-folds
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    mean_auc = np.mean(auc)
    
    # Capture TPR and FPR from last fold for plotting
    y_score = model.predict_proba(X_test)[:,1]
    roc[model_name] = roc_curve(y_test, y_score)
    
    # Print formatted results
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred)) 

In [None]:
# roc

In [None]:
# Plot the ROC curve from the last K-Fold split
fig, ax = plt.subplots(figsize=(10, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y','k','orange','darkorchid','bisque']

# Plot 50-50 Line
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')
# Plot Classifier ROC Curves
for key, c in zip(roc.keys(), colors):
    ax.plot(roc[key][0], roc[key][1], color=c, label=key)
    
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Classifier Comparison')
ax.legend(loc='best')
plt.savefig('charts/roc_ds2_p2.png')

## GridSearchCV: Logistic Regression

In [None]:
model = LogisticRegression()
param_grid = [{'penalty': ['l1','l2'],'C': [0.001,0.01,0.1,1,10,100,1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: Naive Bayes

In [None]:
model = BernoulliNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

In [None]:
model = MultinomialNB()
param_grid = [{'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: Random Forest

In [None]:
model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: Gradient Boost

In [None]:
model = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000]}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: AdaBoost

In [None]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
param_grid = [{'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: KNN

In [None]:
# https://stackoverflow.com/questions/37678471/i-am-trying-to-implement-gridsearchcv-to-tune-the-parameters-of-k-nearest-neighb
model = KNeighborsClassifier()
param_grid = [{'n_neighbors': list(range(100))}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)