# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder

import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df['idx'] = df['id']
df = df.set_index('idx')
# df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
print(df.shape)
# pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data_ds2 LIMIT 5''',engine)
df.head()

(163426, 18)


Unnamed: 0_level_0,id,name,state,category_name,backers_count,pct_goal_achieved,usd_pledged,goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1725323227,1725323227,Inspire young girls,failed,Childrenswear,1,2.3,30.0,1300.0,US,USD,30,2018-01-12,2017-12-13,2017-12-08,0,0,Rayna,6
2065169465,2065169465,Cotton-Top Pastries,successful,Small Batch,99,131.4,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,1,Holly Weist,5
1516902916,1516902916,Dreaming Creek Brewery,failed,Drinks,64,30.7,6139.0,20000.0,US,USD,30,2018-01-12,2017-12-13,2017-08-11,0,0,Mike Bradley,19
1396766240,1396766240,Ripple Playing Cards - Printed by USPCC,failed,Playing Cards,131,33.9,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,0,B.Y. Eidelman,16
1361347175,1361347175,New Boutique Funding for the San Antonio Stock...,failed,Ready-to-wear,0,0.0,0.0,5000.0,US,USD,15,2018-01-12,2017-12-28,2017-12-22,0,0,Darrian Fosty,26


In [6]:
df.columns

Index(['id', 'name', 'state', 'category_name', 'backers_count',
       'pct_goal_achieved', 'usd_pledged', 'goal', 'country', 'currency',
       'campaign_length', 'deadline', 'launched', 'created', 'spotlight',
       'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')

In [7]:
df_state = df.groupby(["state"]).count().sort_values(['name'], ascending=[False])
df_state = pd.DataFrame(df_state, columns={'name'})
print(df_state.shape)
df_state

(2, 1)


Unnamed: 0_level_0,name
state,Unnamed: 1_level_1
successful,91697
failed,71729


In [8]:
df_state = df.groupby(["state"]).sum().sort_values(['usd_pledged'], ascending=[False])
df_state = pd.DataFrame(df_state, columns={'usd_pledged','goal'})
print(df_state.shape)
df_state

(2, 2)


Unnamed: 0_level_0,goal,usd_pledged
state,Unnamed: 1_level_1,Unnamed: 2_level_1
successful,918112700.0,1795628000.0
failed,6299258000.0,83344060.0


In [9]:
df_country = df.groupby(["country","state"]).count().sort_values(['name'], ascending=[False])
df_country = pd.DataFrame(df_country, columns={'name'})
print(df_country.shape)
df_country

(44, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,name
country,state,Unnamed: 2_level_1
US,successful,77075
US,failed,53506
GB,successful,7164
GB,failed,6602
CA,failed,3341
CA,successful,2683
AU,failed,1938
AU,successful,1208
DE,failed,984
NL,failed,836


In [10]:
df_country = df.groupby(["country","state"]).sum().sort_values(['usd_pledged'], ascending=[False])
df_country = pd.DataFrame(df_country, columns={'usd_pledged','goal'})
print(df_country.shape)
df_country

(44, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,goal,usd_pledged
country,state,Unnamed: 2_level_1,Unnamed: 3_level_1
US,successful,725966400.0,1548815000.0
GB,successful,41101440.0,109512000.0
US,failed,4136627000.0,62211060.0
CA,successful,24175360.0,40878840.0
AU,successful,11505290.0,20757650.0
DE,successful,6840740.0,18933980.0
FR,successful,5569647.0,10674290.0
NL,successful,4043202.0,9331891.0
GB,failed,318072900.0,8969440.0
ES,successful,2481130.0,4895142.0


In [11]:
df_main_category = df.groupby(["category_name","state"]).count().sort_values(['name'], ascending=[False])
df_main_category = pd.DataFrame(df_main_category, columns={'name'})
print(df_main_category.shape)
df_main_category

(296, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,name
category_name,state,Unnamed: 2_level_1
Web,failed,2987
Indie Rock,successful,2389
Country & Folk,successful,2381
Rock,successful,2368
Shorts,successful,2358
Children's Books,successful,2354
Documentary,successful,2340
Theater,successful,2313
Video Games,successful,2310
Tabletop Games,successful,2261


In [12]:
df_main_category = df.groupby(["category_name","state"]).sum().sort_values(['usd_pledged'], ascending=[False])
df_main_category = pd.DataFrame(df_main_category, columns={'usd_pledged', 'goal'})
print(df_main_category.shape)
df_main_category

(296, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,goal,usd_pledged
category_name,state,Unnamed: 2_level_1,Unnamed: 3_level_1
Video Games,successful,88558915.00,1.823571e+08
Hardware,successful,43489196.79,1.372346e+08
Product Design,successful,38914451.50,1.270897e+08
Tabletop Games,successful,21391073.00,1.047053e+08
Gadgets,successful,43482709.00,8.520704e+07
Wearables,successful,15674322.00,6.167573e+07
Narrative Film,successful,35415348.30,4.457362e+07
Technology,successful,12915105.00,4.249732e+07
3D Printing,successful,9556290.00,4.226102e+07
Sound,successful,10724128.00,3.970289e+07


In [13]:
df_spotlight = df.groupby(["spotlight","state"]).count().sort_values(['name'], ascending=[False])
df_spotlight = pd.DataFrame(df_spotlight, columns={'name'})
print(df_spotlight.shape)
df_spotlight

(2, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,name
spotlight,state,Unnamed: 2_level_1
1,successful,91697
0,failed,71729


In [14]:
df_spotlight = df.groupby(["spotlight","state"]).sum().sort_values(['usd_pledged'], ascending=[False])
df_spotlight = pd.DataFrame(df_spotlight, columns={'usd_pledged', 'goal'})
print(df_spotlight.shape)
df_spotlight

(2, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,goal,usd_pledged
spotlight,state,Unnamed: 2_level_1,Unnamed: 3_level_1
1,successful,918112700.0,1795628000.0
0,failed,6299258000.0,83344060.0


In [15]:
df_dummies = pd.get_dummies(df[['state', 'category_name', 'country']],drop_first=True)
df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
print(df.shape)
print(df.columns)
df.head()

(163430, 198)
Index(['state_successful', 'category_name_Academic',
       'category_name_Accessories', 'category_name_Action',
       'category_name_Animals', 'category_name_Animation',
       'category_name_Anthologies', 'category_name_Apparel',
       'category_name_Apps', 'category_name_Architecture',
       ...
       'country', 'currency', 'campaign_length', 'deadline', 'launched',
       'created', 'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object', length=198)


Unnamed: 0_level_0,state_successful,category_name_Academic,category_name_Accessories,category_name_Action,category_name_Animals,category_name_Animation,category_name_Anthologies,category_name_Apparel,category_name_Apps,category_name_Architecture,...,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18520,0,0,0,0,0,0,0,0,0,0,...,US,USD,30,2016-11-18,2016-10-19,2016-10-19,0,0,Devan Porter,24
21109,1,0,0,0,0,0,0,0,0,0,...,GB,GBP,29,2015-05-06,2015-04-07,2015-04-07,1,0,Fiona Powell,24
24380,0,0,0,0,0,0,0,0,0,0,...,US,USD,30,2015-11-26,2015-10-27,2015-10-25,0,0,Tori,24
33867,0,0,0,0,0,0,0,0,0,0,...,CA,CAD,45,2015-07-30,2015-06-15,2015-03-15,0,1,Taste Makers By Trish P,23
39036,1,0,0,0,0,0,0,0,0,0,...,US,USD,45,2016-06-30,2016-05-16,2016-05-15,1,0,Mark Bechard,13


In [16]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['state_successful','id', 'name', 'state', 'category_name', 'backers_count','pct_goal_achieved', 'usd_pledged', 'goal', 'country', 'currency',
       'deadline', 'launched', 'created','creator_name','spotlight'], 1)
y = df['state_successful']

print(X.shape)
print(X.columns)
X.head()

(163430, 182)
Index(['category_name_Academic', 'category_name_Accessories',
       'category_name_Action', 'category_name_Animals',
       'category_name_Animation', 'category_name_Anthologies',
       'category_name_Apparel', 'category_name_Apps',
       'category_name_Architecture', 'category_name_Art',
       ...
       'country_MX', 'country_NL', 'country_NO', 'country_NZ', 'country_SE',
       'country_SG', 'country_US', 'campaign_length', 'staff_pick',
       'blurb_length'],
      dtype='object', length=182)


Unnamed: 0_level_0,category_name_Academic,category_name_Accessories,category_name_Action,category_name_Animals,category_name_Animation,category_name_Anthologies,category_name_Apparel,category_name_Apps,category_name_Architecture,category_name_Art,...,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US,campaign_length,staff_pick,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,30,0,24
21109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,29,0,24
24380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,30,0,24
33867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,45,1,23
39036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,45,0,13


## Assumptions

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(114401, 182) (114401,)
(49029, 182) (49029,)


In [None]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
print(X_train_s.mean(axis=0))
print(X_train_s.std(axis=0))

# X_combined_s = np.vstack((X_train_s, X_test_s))
# y_combined = np.hstack((y_train, y_test))

[-2.18005525e-17 -1.05586721e-17  5.34144590e-18 -1.09313312e-17
  8.10533361e-18  9.93757377e-19 -1.96577631e-17  2.40365066e-17
  4.96878689e-19 -1.24219672e-19 -3.47504533e-17  9.02145369e-18
  3.35393115e-18 -2.62724607e-17  1.31672852e-17  1.83845115e-17
  0.00000000e+00 -4.09924918e-18  3.64584738e-17 -2.73283279e-18
 -2.67072295e-18 -2.01235869e-17 -2.52787033e-17 -2.63345705e-17
  1.00617934e-17  4.59612787e-18 -2.27943098e-17 -2.39743967e-17
 -5.51535344e-17 -1.49684705e-17 -2.48439344e-18 -6.73891721e-18
  2.34154082e-17 -7.45318033e-19 -9.99968361e-18  1.16766492e-17
  7.45318033e-18  4.72034754e-18  3.79491098e-17  2.75767672e-17
  2.14900033e-17 -1.25461869e-17  9.31647541e-19  4.61476082e-17
  4.34768853e-18  4.47190820e-18 -1.73286443e-17  1.33536148e-17
  7.70161967e-18 -1.11797705e-17  1.86329508e-17  3.65205836e-17
 -5.83832459e-18 -1.87571705e-17  3.37256410e-17 -3.01853803e-17
  8.22955328e-18  7.21716295e-17  1.36641639e-17 -4.09924918e-17
  3.88807574e-17  3.35393

## Phase 1: Initial Check on All Models

In [None]:
# Define models to test
model_list = [['GaussianNB', GaussianNB()], 
                ['BernoulliNB', BernoulliNB()], # F1 0.60
                ['MultinomialNB', MultinomialNB()], 
                ['DecisionTree', DecisionTreeClassifier()], 
                ['RandomForest', RandomForestClassifier()], # F1 0.61
                ['KNN', KNeighborsClassifier(38)], # best k from KNN model below; scale data
                ['LogisticRegression', LogisticRegression()], 
                ['LogisticRegressionCV', LogisticRegressionCV()], # scale data; F1 0.62
                ['SVM', SVC(probability=True)]] # scale data; F1 0.57

model_list_s = [['KNN', KNeighborsClassifier(38)], # best k from KNN model below; scale data
                ['LogisticRegression', LogisticRegression()], # scale data
                ['LogisticRegressionCV', LogisticRegressionCV()], # scale data
                ['SVM', SVC(probability=True)]] # scale data

# Calculate metrics for each model
roc = {}
for model in model_list:
    if model in model_list_s:
        X_train = X_train_s
        X_test = X_test_s
    
    model_name = model[0]
    model = model[1]
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
    error_rate = []
        
    # Perform K-Fold CV and calculate metrics for each fold
    kf = KFold(5, random_state=42, shuffle=True) 
    for train_idx, test_idx in kf.split(X, y=y):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred))
        error_rate.append(np.mean(y_pred != y_test))
        
    # Calculate mean metric across K-folds
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    mean_auc = np.mean(auc)
    
    # Capture TPR and FPR from last fold for plotting
    y_score = model.predict_proba(X_test)[:,1]
    roc[model_name] = roc_curve(y_test, y_score)
    
    # Print formatted results
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))  

GaussianNB(priors=None)
	Accuracy: 0.7008301209488262
	AUC: 0.7313718854525506


[[21117   402]
 [14266 13244]]
             precision    recall  f1-score   support

          0       0.60      0.98      0.74     21519
          1       0.97      0.48      0.64     27510

avg / total       0.81      0.70      0.69     49029

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
	Accuracy: 0.7664851414468987
	AUC: 0.772375096178098


[[17658  3861]
 [ 7588 19922]]
             precision    recall  f1-score   support

          0       0.70      0.82      0.76     21519
          1       0.84      0.72      0.78     27510

avg / total       0.78      0.77      0.77     49029

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
	Accuracy: 0.7739093189744845
	AUC: 0.7766075786136701


[[17187  4332]
 [ 6753 20757]]
             precision    recall  f1-score   support

          0       0.72      0.80      0.76     21519
          1       0.83      0.75      0.79    

In [None]:
# Plot the ROC curve from the last K-Fold split
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['b', 'g', 'r', 'c', 'm', 'y','k','orange','darkorchid']

# Plot 50-50 Line
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')
# Plot Classifier ROC Curves
for key, c in zip(roc.keys(), colors):
    ax.plot(roc[key][0], roc[key][1], color=c, label=key)
    
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Classifier Comparison')
ax.legend(loc='best')
plt.savefig('charts/roc.png')

## GridSearchCV

In [None]:
model_list = [[SVC(), LogisticRegression(), RandomForestClassifier()]]

for model in model_list:
    
    model = model
    
    param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear','poly','rbf'], 'gamma': [0.001, 0.0001]}]
    grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train_s,y_train)
    grid_predictions = grid.predict(X_test_s)
    print(grid.best)
    print(confusion_matrix(y_test,grid_predictions))
    print(classification_report(y_test,grid_predictions))

In [None]:
grid.best

In [None]:
grid_predictions = grid.predict(X_test_s)

In [None]:
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))