# Predicting the Success of a Kickstarter Campaign
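This notebook uses scikit-learn's `GridSearchCV` to tune the hyperparameters of several classifiers that predict whether a Kickstarter campaign succeeds.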

In [13]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# %matplotlib inline
# import seaborn as sns

from sqlalchemy import create_engine

from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)

# train_test_split lives in sklearn.model_selection; the legacy sklearn.cross_validation
# module is only tried as a fallback for very old scikit-learn installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split 
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler 

from xgboost import XGBClassifier

import datetime
import warnings
import os

# Suppress every Python warning for the rest of the session (no category or module filter is given),
# which also hides library deprecation and convergence warnings.
warnings.filterwarnings("ignore")

In [2]:
# Database credentials are read from environment variables (saved in the `tensorflow1.4` conda environment),
# following https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    """Look up a required environment variable.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The value stored under ``name`` in ``os.environ``.

    Raises:
        Exception: If ``name`` is not present in the environment; the message
            names the missing variable.
    """
    try:
        value = os.environ[name]
    except KeyError:
        # Replace the bare KeyError with a message that says what was expected.
        raise Exception("Expected environment variable '{}' not set.".format(name))
    else:
        return value

# Postgres connection settings are pulled from the environment; their values depend on your setup.
# The lookups run in the order URL, USER, PW, DB and stop at the first missing variable.
(POSTGRES_URL,
 POSTGRES_USER,
 POSTGRES_PW,
 POSTGRES_DB) = map(get_env_variable,
                    ("POSTGRES_URL", "POSTGRES_USER", "POSTGRES_PW", "POSTGRES_DB"))

In [3]:
# SQLAlchemy connection URL for Postgres via the psycopg2 driver, assembled from the
# POSTGRES_* settings.
# NOTE(review): user and password are inserted verbatim; presumably they contain no
# characters that need URL-escaping -- confirm.
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(
    user=POSTGRES_USER,
    pw=POSTGRES_PW,
    url=POSTGRES_URL,
    db=POSTGRES_DB,
)

In [4]:
# Create the SQLAlchemy engine used by the queries below; `engine_var` is kept as an
# alias of the connection URL.
engine = create_engine(DB_URL)
engine_var = DB_URL

In [5]:
# Load the whole kickstarter_data_ds2 table from Postgres into a DataFrame.
# df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)

# Index the rows by a copy of the 'id' column (named 'idx'), so 'id' also stays available as a column.
df['idx'] = df['id']
df = df.set_index('idx')

# Drop the column labelled 'index'. `axis` is passed by keyword: the positional form
# drop(labels, 1) raises a TypeError on pandas >= 2.0, where `axis` is keyword-only.
df = df.drop(['index'], axis=1)
print(df.shape)

# Preview five rows of the columns of interest (rendered as the cell output).
pd.read_sql_query('''SELECT state, category_main, category_name, backers_count, pct_goal_achieved, usd_pledged, usd_goal, country, currency, campaign_length, staff_pick, blurb_length, launched FROM kickstarter_data_ds2 LIMIT 5''',engine)

(163425, 19)


Unnamed: 0,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,usd_goal,country,currency,campaign_length,staff_pick,blurb_length,launched
0,1,fashion,Fashion,30,1.0,1555.0,1500.0,US,USD,31,0,25,2011-12-01
1,0,art,Painting,1,0.0,120.0,3456.0,US,USD,46,0,24,2011-11-16
2,1,film_and_video,Shorts,43,1.5,1831.0,1200.0,US,USD,25,0,18,2011-12-07
3,1,film_and_video,Shorts,6,1.0,1010.0,1000.0,US,USD,30,0,25,2011-12-02
4,1,film_and_video,Shorts,20,1.1,1580.0,1500.0,US,USD,27,0,24,2011-12-05


In [6]:
# Cutoff date: only campaigns launched on or after this day are kept (up to the most recent record).
start_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d").date()

# In one pass: filter rows by launch date, keep the target ('state') plus the candidate
# feature columns, then discard rows with any missing value.
# NOTE(review): this reassigns `df` and drops 'launched', so the cell cannot be re-run
# without reloading the data first.
df = (
    df.loc[df['launched'] >= start_date,
           ['state','category_main','category_name','backers_count','usd_goal',
            'country','currency','campaign_length','staff_pick','blurb_length']]
    .dropna()
)

df.shape

(38401, 10)

In [7]:
# Show the column labels currently held by df.
df.columns

Index(['state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object')

In [8]:
# One-hot encode the four categorical columns; drop_first=True omits the first level of
# each so that level serves as the baseline.
df_dummies = pd.get_dummies(
    df[['category_main','category_name', 'country','currency']],
    drop_first=True,
)

# Attach the dummy columns in front of the original columns by joining on the shared row index.
# NOTE(review): assumes the index values are unique; duplicated ids would multiply rows in this join -- confirm.
df = pd.merge(df_dummies, df, how='inner', left_index=True, right_index=True)

# Sanity-check the widened frame.
print(df.shape)
print(df.columns)
df.info()

(38401, 197)
Index(['category_main_comics', 'category_main_crafts', 'category_main_dance',
       'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       'category_main_music',
       ...
       'state', 'category_main', 'category_name', 'backers_count', 'usd_goal',
       'country', 'currency', 'campaign_length', 'staff_pick', 'blurb_length'],
      dtype='object', length=197)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 38401 entries, 1396766240 to 1094069811
Columns: 197 entries, category_main_comics to blurb_length
dtypes: float64(1), int64(5), object(4), uint8(187)
memory usage: 11.3+ MB


In [9]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
# The raw categorical columns are dropped as well, since their one-hot dummies are already in df.
# `axis` is passed by keyword: the positional form drop(labels, 1) raises a TypeError on
# pandas >= 2.0, where `axis` is keyword-only.
X = df.drop(['state','category_main','category_name', 'backers_count', 'country', 'currency'], axis=1)

# Target: the 'state' column.
y = df['state']

print(X.shape)
X.columns
# X.head()

(38401, 191)


Index(['category_main_comics', 'category_main_crafts', 'category_main_dance',
       'category_main_design', 'category_main_fashion',
       'category_main_film_and_video', 'category_main_food',
       'category_main_games', 'category_main_journalism',
       'category_main_music',
       ...
       'currency_MXN', 'currency_NOK', 'currency_NZD', 'currency_SEK',
       'currency_SGD', 'currency_USD', 'usd_goal', 'campaign_length',
       'staff_pick', 'blurb_length'],
      dtype='object', length=191)

## Assumptions

In [10]:
# Hold out 30% of the rows for testing. The split is stratified on y and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Candidate neighbour counts 1..100.
k_range = list(range(1, 101))

# Report the shapes of the training and test partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(26880, 191) (26880,)
(11521, 191) (11521,)


In [11]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
# Fit the scaler on the training split only, then apply the same transform to both
# splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

# Phase 3: GridSearchCV

## GridSearchCV: Logistic Regression

In [None]:
# Tune logistic regression over the penalty type and the inverse regularization strength C.
# The solver is pinned to 'liblinear' because it accepts both 'l1' and 'l2'. Relying on
# the library default breaks every 'l1' candidate on scikit-learn releases whose default
# solver is 'lbfgs' (which supports 'l2' only), and with warnings silenced those failed
# fits would go unnoticed.
model = LogisticRegression(solver='liblinear')
param_grid = [{'penalty': ['l1','l2'],'C': [0.001,0.01,0.1,1,10,100,1000]}]

# Candidates are ranked by cross-validated ROC AUC; refit=True retrains the best one on
# the full (standardized) training split.
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)

# Evaluate the refit best model on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l1 .............................................
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   1.3s
[CV] .............................. C=0.001, penalty=l1, total=   1.4s
[CV] C=0.001, penalty=l2 .............................................
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   1.5s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   2.2s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .............................. C=0.001, penalty=l2, total=   1.4s
[CV] ...........

## GridSearchCV: Naive Bayes

In [None]:
# Tune the additive smoothing strength `alpha` of Bernoulli naive Bayes.
# NOTE(review): the model is fit on standardized features; BernoulliNB binarizes its
# inputs at its `binarize` threshold, so this presumably splits each feature around its
# training mean -- confirm that is intended.
model = BernoulliNB()

# The smallest candidate is 1e-10 rather than 0.0: a literal zero means "no smoothing",
# which older scikit-learn silently clipped to 1e-10 and newer releases may apply as-is,
# leading to log(0) in the fitted probabilities. Writing 1e-10 gives the same model on both.
param_grid = [{'alpha' : [1e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]

# Candidates are ranked by cross-validated ROC AUC; refit=True retrains the best one on
# the full training split.
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)

# Evaluate the refit best model on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

In [None]:
# Tune the additive smoothing strength `alpha` of multinomial naive Bayes.
# This model is fit on the unscaled features (X_train / X_test), not the standardized
# ones: MultinomialNB requires non-negative inputs.
model = MultinomialNB()

# The smallest candidate is 1e-10 rather than 0.0: a literal zero means "no smoothing",
# which older scikit-learn silently clipped to 1e-10 and newer releases may apply as-is,
# leading to log(0) in the fitted probabilities. Writing 1e-10 gives the same model on both.
param_grid = [{'alpha' : [1e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}]

# Candidates are ranked by cross-validated ROC AUC; refit=True retrains the best one on
# the full training split.
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train,y_train)

# Evaluate the refit best model on the held-out test split.
grid_predictions = grid.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: Random Forest

In [None]:
# Tune a random forest over the number of trees and the per-split feature subset size.
model = RandomForestClassifier()

# 'auto' is intentionally not searched: for classifiers it is just an alias of 'sqrt'
# (so it only duplicated fits), and recent scikit-learn releases reject it outright.
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['sqrt', 'log2']}]

# Candidates are ranked by cross-validated ROC AUC; refit=True retrains the best one on
# the full (standardized) training split.
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_s,y_train)

# Evaluate the refit best model on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: Gradient Boost

In [None]:
# Tune gradient boosting over the number of boosting stages only.
model = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000]}]

# Rank candidates by cross-validated ROC AUC, then refit the winner on the full
# (standardized) training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid.fit(X_train_s, y_train)

# Winning configuration and the corresponding refit model.
best_params, best_estimator = grid.best_params_, grid.best_estimator_

# Score the refit model's hard predictions on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: AdaBoost

In [None]:
# Tune AdaBoost over its boosting algorithm and ensemble size, together with the
# criterion, splitter and depth of the decision tree it boosts.
# NOTE(review): the `base_estimator` keyword and the 'SAMME.R' option are
# version-dependent in scikit-learn -- confirm against the installed release.
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
param_grid = [
    {
        "base_estimator__criterion": ["gini", "entropy"],
        "base_estimator__splitter": ["best", "random"],
        'n_estimators': [1, 2],
        'base_estimator__max_depth': [1, 2],
        'algorithm': ['SAMME', 'SAMME.R'],
    }
]

# Rank candidates by cross-validated ROC AUC, then refit the winner on the full
# (unscaled) training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid.fit(X_train, y_train)

# Winning configuration and the corresponding refit model.
best_params, best_estimator = grid.best_params_, grid.best_estimator_

# Score the refit model's hard predictions on the held-out test split.
grid_predictions = grid.predict(X_test)
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: KNN

In [None]:
# https://stackoverflow.com/questions/37678471/i-am-trying-to-implement-gridsearchcv-to-tune-the-parameters-of-k-nearest-neighb
# Tune k-nearest-neighbours over odd neighbour counts 1, 3, ..., 99.
model = KNeighborsClassifier()
param_grid = [{'n_neighbors': list(range(1,101,2))}]

# Rank candidates by cross-validated ROC AUC, then refit the winner on the full
# (standardized) training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid.fit(X_train_s, y_train)

# Winning configuration and the corresponding refit model.
best_params, best_estimator = grid.best_params_, grid.best_estimator_

# Score the refit model's hard predictions on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: XGBoost

In [None]:
# Tune XGBoost over objective, learning rate, column subsampling, tree count and seed;
# the remaining entries are single-valued and simply fix that setting.
# NOTE(review): 'reg:linear' is a regression objective -- confirm it is meant to be
# searched for this binary classifier.
# NOTE(review): 'seed' is searched like a hyperparameter, so the reported best score
# partly reflects the luckier seed -- confirm this is intended.
model = XGBClassifier()
param_grid = [
    {
        'nthread': [4],  # when use hyperthread, xgboost may become slower
        'objective': ['binary:logistic', 'reg:linear'],
        'learning_rate': [0.05, 0.1],  # so called `eta` value
        'max_depth': [6],
        'min_child_weight': [11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators': [5, 500],  # number of trees, change it to 1000 for better results
        'missing': [-999],
        'seed': [0, 1337],
    }
]

# Rank candidates by cross-validated ROC AUC, then refit the winner on the full
# (unscaled) training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid.fit(X_train, y_train)

# Winning configuration and the corresponding refit model.
best_params, best_estimator = grid.best_params_, grid.best_estimator_

# Score the refit model's hard predictions on the held-out test split.
grid_predictions = grid.predict(X_test)
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

## GridSearchCV: SVM

In [None]:
# Tune a support vector classifier over the regularization strength C and the kernel
# coefficient gamma.
model = SVC()
param_grid = [{'C': [0.001, 0.01, 0.1, 1, 10],'gamma':[0.001, 0.01, 0.1, 1]}]

# Rank candidates by cross-validated ROC AUC, then refit the winner on the full
# (standardized) training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    refit=True,
    verbose=2,
)
grid.fit(X_train_s, y_train)

# Winning configuration and the corresponding refit model.
best_params, best_estimator = grid.best_params_, grid.best_estimator_

# Score the refit model's hard predictions on the held-out test split.
grid_predictions = grid.predict(X_test_s)
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)