# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder

import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
# df = pd.read_pickle('data/kickstarter_data.pkl')
df = pd.read_sql_query('''SELECT * FROM kickstarter_data''',engine)
print(df.shape)
pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data LIMIT 5''',engine)

(19291, 67)


Unnamed: 0,state,main_category,main_category.1,currency,currency.1,deadline,launched,usd_goal_real,usd_pledged_real
0,successful,Publishing,Publishing,USD,USD,2018-01-02,2017-12-06,2000.0,6083.0
1,successful,Music,Music,USD,USD,2018-01-02,2017-11-30,10000.0,11169.56
2,successful,Music,Music,EUR,EUR,2018-01-02,2017-11-28,30112.5,30615.02
3,successful,Music,Music,USD,USD,2018-01-02,2017-12-09,1000.0,1743.0
4,failed,Food,Food,USD,USD,2018-01-02,2017-11-03,200000.0,1.0


In [6]:
df.columns

Index(['idx', 'state_successful', 'main_category_Comics',
       'main_category_Crafts', 'main_category_Dance', 'main_category_Design',
       'main_category_Fashion', 'main_category_Film & Video',
       'main_category_Food', 'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'country_AU', 'country_BE', 'country_CA',
       'country_CH', 'country_DE', 'country_DK', 'country_ES', 'country_FR',
       'country_GB', 'country_HK', 'country_IE', 'country_IT', 'country_JP',
       'country_LU', 'country_MX', 'country_NL', 'country_NO', 'country_NZ',
       'country_SE', 'country_SG', 'country_US', 'currency_CAD',
       'currency_CHF', 'currency_DKK', 'currency_EUR', 'currency_GBP',
       'currency_HKD', 'currency_JPY', 'currency_MXN', 'currency_NOK',
       'currency_NZD', 'currency_SEK', 'currency_SGD', 'currency_USD', 'ID',
       'nam

In [7]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
X = df.drop(['idx','ID','name', 'category', 'main_category', 'currency','pledged', 
             'state','country', 'usd pledged','deadline','launched','state_successful',
             'backers', 'usd_pledged_real','campaign_length','pct_goal_achieved','goal'], 1)
# X = df[['usd_goal_real','campaign_length']]
y = df['state_successful']

print(X.shape)
X.columns

(19291, 49)


Index(['main_category_Comics', 'main_category_Crafts', 'main_category_Dance',
       'main_category_Design', 'main_category_Fashion',
       'main_category_Film & Video', 'main_category_Food',
       'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'country_AU', 'country_BE', 'country_CA',
       'country_CH', 'country_DE', 'country_DK', 'country_ES', 'country_FR',
       'country_GB', 'country_HK', 'country_IE', 'country_IT', 'country_JP',
       'country_LU', 'country_MX', 'country_NL', 'country_NO', 'country_NZ',
       'country_SE', 'country_SG', 'country_US', 'currency_CAD',
       'currency_CHF', 'currency_DKK', 'currency_EUR', 'currency_GBP',
       'currency_HKD', 'currency_JPY', 'currency_MXN', 'currency_NOK',
       'currency_NZD', 'currency_SEK', 'currency_SGD', 'currency_USD',
       'usd_goal_real'],
      dtype='object

## Assumptions

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(13503, 49) (13503,)
(5788, 49) (5788,)


In [9]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
print(X_train_s.mean(axis=0))
print(X_train_s.std(axis=0))

X_combined_s = np.vstack((X_train_s, X_test_s))
y_combined = np.hstack((y_train, y_test))

[-7.02491707e-17  4.57803584e-17  1.05242203e-17  4.52541474e-17
  3.52561381e-17  3.57823491e-17 -2.10484407e-18 -1.63125415e-17
  3.57823491e-17  1.99960186e-17 -1.68387525e-17 -1.10504313e-17
  1.84173856e-17  4.26230923e-17  2.99940279e-17  3.20988720e-17
  3.63085601e-17 -3.15726610e-18  3.15726610e-18  1.47339085e-17
  1.26290644e-17 -5.52521567e-18  1.02611148e-17 -1.68387525e-17
 -1.99960186e-17  5.26211017e-19  1.68387525e-17  9.47179830e-18
  1.10504313e-17 -2.15746517e-17  2.59158926e-17  1.26290644e-17
 -1.26290644e-17 -1.15766424e-17 -1.14977107e-16  3.63085601e-17
 -3.15726610e-18  1.47339085e-17  3.42037161e-17  1.02611148e-17
 -1.68387525e-17  1.68387525e-17  1.10504313e-17  2.59158926e-17
  1.26290644e-17 -1.26290644e-17 -1.15766424e-17 -1.14977107e-16
 -2.10484407e-18]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1.]


## Phase 2: Deeper Dive on Select Models

## Logistic Regression

In [13]:
# pythonml p71
lr = LogisticRegressionCV(random_state=42)
lr.fit(X_train_s, y_train)
predictions = lr.predict(X_test_s)
print(classification_report(y_test,predictions))


             precision    recall  f1-score   support

          0       0.66      0.72      0.69      3277
          1       0.59      0.51      0.55      2511

avg / total       0.63      0.63      0.63      5788



## Random Forests

In [27]:
# rf_range = list(range(1,2000,100))
# model = ['RandomForest', RandomForestClassifier()]

accuracy = []
precision = []
recall = []
f1 = []
auc = []
error_rate = []

# for rf in rf_range:
# rfc = RandomForestClassifier(n_estimators=1000)
# rfc.fit(X_train,y_train)
# predictions = rfc.predict(X_test)
# print(classification_report(y_test,predictions))
# print(confusion_matrix(y_test,predictions))
# print(roc_auc_score(y_test,predictions))

kf = KFold(5, random_state=4444, shuffle=True) 
for train_idx, test_idx in kf.split(X, y=y):
    model = RandomForestClassifier(n_estimators=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    auc.append(roc_auc_score(y_test, y_pred))
    error_rate.append(np.mean(y_pred != y_test))

# Calculate mean metric across K-folds
mean_accuracy = np.mean(accuracy)
mean_precision = np.mean(precision)
mean_recall = np.mean(recall)
mean_f1 = np.mean(f1)
mean_auc = np.mean(auc)

# Capture TPR and FPR from last fold for plotting
y_score = model.predict_proba(X_test)[:,1]
roc[model_name] = roc_curve(y_test, y_score)

# Print formatted results
print(model)
print('\t==============================')
print('\tAccuracy:', mean_accuracy)
print('\tAUC:', mean_auc)
print('\n')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.6044920525224603
	AUC: 0.5905433851201191


[[2276 1001]
 [1296 1215]]
             precision    recall  f1-score   support

          0       0.64      0.69      0.66      3277
          1       0.55      0.48      0.51      2511

avg / total       0.60      0.60      0.60      5788



In [22]:
# Define models to test
model_list = [
#                 ['GaussianNB', GaussianNB()], 
#                 ['BernoulliNB', BernoulliNB()], # F1 0.60
#                 ['MultinomialNB', MultinomialNB()], 
#                 ['DecisionTree', DecisionTreeClassifier()], 
                ['RandomForest', RandomForestClassifier()]] # F1 0.61
#                 ['KNN', KNeighborsClassifier(38)], # best k from KNN model below; scale data
#                 ['LogisticRegression', LogisticRegression()], 
#                 ['LogisticRegressionCV', LogisticRegressionCV()], # scale data; F1 0.62
#                 ['SVM', SVC(probability=True)]] # scale data; F1 0.57

# model_list_s = [['KNN', KNeighborsClassifier(38)], # best k from KNN model below; scale data
# #                 ['LogisticRegression', LogisticRegression()], # scale data
#                 ['LogisticRegressionCV', LogisticRegressionCV()], # scale data
#                 ['SVM', SVC(probability=True)]] # scale data

# Calculate metrics for each model
roc = {}
for model in model_list:
#     if model in model_list_s:
#         X_train = X_train_s
#         X_test = X_test_s
    
    model_name = model[0]
    model = model[1]
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
    error_rate = []

    # Perform K-Fold CV and calculate metrics for each fold
    kf = KFold(5, random_state=4444, shuffle=True) 
    for train_idx, test_idx in kf.split(X, y=y):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred))
        error_rate.append(np.mean(y_pred != y_test))

    # Calculate mean metric across K-folds
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    mean_auc = np.mean(auc)

    # Capture TPR and FPR from last fold for plotting
    y_score = model.predict_proba(X_test)[:,1]
    roc[model_name] = roc_curve(y_test, y_score)

    # Print formatted results
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred)) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.6036627505183138
	AUC: 0.5893176280089304


[[2280  997]
 [1304 1207]]
             precision    recall  f1-score   support

          0       0.64      0.70      0.66      3277
          1       0.55      0.48      0.51      2511

avg / total       0.60      0.60      0.60      5788



## K-Nearest Neighbors KNN

In [None]:
k_acc = []
error_rate = []


for k in k_range:
    knn = KNeighborsClassifier(k)
    knn.fit(X_train_s, y_train)
    pred_k = knn.predict(X_test_s)
    k_acc.append(accuracy_score(y_test, pred_k))
    error_rate.append(np.mean(pred_k != y_test))
    
plt.figure(figsize=(16,8))
plt.plot(k_range, k_acc, '-', label = 'K Nearest Neighbor')
# plt.plot(k_range,error_rate, '-',label = 'Error Rate')
plt.title('Accuracy vs. Error')
plt.xlabel('# of neighbors in knn')
plt.ylabel('Accuracy / Error on Test Case')
plt.legend(loc = 0)

In [None]:
# TO DO get top 5 k values
best_k = k_range[np.argmax(k_acc)]
best_k

In [None]:
knn = KNeighborsClassifier(n_neighbors=best_k)

knn.fit(X_train_s,y_train)
pred = knn.predict(X_test_s)

print('WITH Best K')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
# plot_decision_regions(X=X_combined_s,
#                       y=y_combined,
#                       classifier=knn,
#                       test_idx=range(105, 150))
# plt.xlabel('petal length [standardized]')
# plt.ylabel('petal width [standardized]')
# plt.legend(loc='upper left')
# plt.show()  

## Logistic Regression

In [None]:
# vanilla case
# logmodel = LogisticRegression()
# logmodel.fit(X_train_s,y_train)

# predictions = logmodel.predict(X_test_s)
# print(classification_report(y_test,predictions))
# # logmodel.coef_
# logmodel.predict_proba(X_train_s)

In [None]:
# vanilla case
# logmodel = LogisticRegressionCV()
# logmodel.fit(X_train_s,y_train)

# predictions = logmodel.predict(X_test_s)
# print(classification_report(y_test,predictions))

In [None]:
# print(classification_report(y_test,predictions))

In [None]:
# Plot Training Examples vs. Accuracy
# fig, ax = plt.subplots(figsize=(10, 8))

# plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                  train_scores_mean + train_scores_std, alpha=0.1, color="r")
# plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                  test_scores_mean + test_scores_std, alpha=0.1, color="g")
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
# plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

# ax.set_title('Learning Curves (LogisticRegression)')
# ax.set_xlabel('Training Examples')
# ax.set_ylabel('Accuracy')
# ax.legend(loc='best');

In [None]:
logreg = LogisticRegression()
log_accuracy = cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()
log_accuracy

In [None]:
train_sizes, train_scores, test_scores = learning_curve(LogisticRegression(), 
                                                        X, y,
                                                        cv=10, 
                                                        scoring='accuracy')

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:
# predictions = logmodel.predict(X_test)

In [None]:
# print(classification_report(y_test,predictions))

## SVM

In [None]:
svc_model = SVC()

In [None]:
svc_model.fit(X_train,y_train)

In [None]:
predictions = svc_model.predict(X_test)

In [None]:
print(confusion_matrix(y_test,predictions))

In [None]:
print(classification_report(y_test,predictions))

## Decision Trees

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
dtree.fit(X_train,y_train)

In [None]:
predictions = dtree.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

## Random Forest

In [None]:
rfc = RandomForestClassifier(n_estimators=600)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

In [None]:
pipeline = make_pipeline(StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [None]:
pipeline

## Naive Bayes

In [None]:
model = naive_bayes.GaussianNB()
model.fit(X_train, y_train)

print("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, model.predict(X_test)))

In [None]:
model = naive_bayes.BernoulliNB()
model.fit(X_train, y_train)

print("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, model.predict(X_test)))

In [None]:
model = naive_bayes.MultinomialNB()
model.fit(X_train, y_train)

print("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, model.predict(X_test)))


In [None]:
X_train = scale(X_train)
X_test = scale(X_test)

model = LinearSVC()
model.fit(X_train, y_train)

print("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, model.predict(X_test)))

In [None]:
model = SVC()
model.fit(X_train, y_train)

print("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, model.predict(X_test)))

## Gradient Boosting

## Ensemble

## Classification Errors

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [None]:
# Find the weighted precision and recall
print('Weighted Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Weighted Recall:', recall_score(y_test, y_pred, average='weighted'))

In [None]:
print(classification_report(y_test, y_pred)) #, target_names=target_names))

In [None]:
logreg = OneVsRestClassifier(LogisticRegression())
logreg.fit(X_train, y_train)
y_score = logreg.decision_function(X_test)

In [None]:
print('Micro Average Precision:', average_precision_score(y_test, y_score, average='micro'))
print('Macro Average Precision:', average_precision_score(y_test, y_score, average='macro'))
print('Weighted Average Precision:', average_precision_score(y_test, y_score, average='weighted'))
print('Precision for each Class:', average_precision_score(y_test, y_score, average=None))

In [None]:
precision, recall, _ = precision_recall_curve(y_test.ravel(), y_score.ravel())

In [None]:
precision

In [None]:
recall

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# y_score = logreg.predict_proba(X_test)[:,1]
# # print(y_score)
# # print(y_test)
# fpr, tpr, _ = roc_curve(y_test, y_score)
# # print(fpr)
# # print(tpr)
# roc_auc = auc([fpr, tpr])

In [None]:
# fig, ax = plt.subplots(figsize=(12, 8))

# # Plot 50-50 Line
# ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')
# # Plot Logistic ROC curve
# ax.plot(fpr, tpr, color='b', label='Logistic: %.3f' % roc_auc)
    
# ax.set_xlabel('FPR')
# ax.set_ylabel('TPR')
# ax.set_title('Logistic ROC Curve')
# ax.legend(loc='best');