# Kickstarter: Classification
Tests Include:
-KNN
-Logistic Regression
-Decision Trees
-Random Forest
-SVM
-Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
%matplotlib inline

import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import label_binarize, scale, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier

import datetime
import warnings
import os

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

In [3]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [4]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [5]:
df = pd.read_pickle('data/kickstarter_data_ds2.pkl')
df['idx'] = df['id']
df = df.set_index('idx')
# df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
# df = pd.read_sql_query('''SELECT * FROM kickstarter_data_ds2''',engine)
print(df.shape)
# pd.read_sql_query('''SELECT state, main_category, main_category, currency, currency, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data_ds2 LIMIT 5''',engine)
print(df.columns)
df.head()

(163425, 18)
Index(['id', 'name', 'state', 'category_main', 'category_name',
       'backers_count', 'pct_goal_achieved', 'usd_pledged', 'usd_goal',
       'country', 'currency', 'campaign_length', 'deadline', 'launched',
       'created', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')


Unnamed: 0_level_0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,usd_goal,country,currency,campaign_length,deadline,launched,created,staff_pick,creator_name,blurb_length
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1396766240,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,B.Y. Eidelman,16
2065169465,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,Holly Weist,5
1647325451,1647325451,Code Switch,successful,film_and_video,Horror,34,1.5,4611.0,3000.0,US,USD,32,2018-01-12,2017-12-11,2017-11-10,0,Alba Roland,23
727157486,727157486,Rain Dog Farm,failed,food,Farms,49,0.3,4741.0,18000.0,US,USD,38,2018-01-12,2017-12-05,2017-11-28,1,Charlie Wainger,9
1756145145,1756145145,WANGTA: a novel,successful,publishing,Fiction,13,1.0,427.185132,427.185132,CA,CAD,21,2018-01-12,2017-12-22,2017-12-18,0,D. H. de Bruin,22


In [6]:
# start_date = datetime.datetime.strptime('2017-10-30', "%Y-%m-%d").date()

# df = df[df['launched'] >= start_date] # filter from start date to current
df = df[['state','backers_count','usd_goal','usd_pledged','campaign_length','staff_pick','blurb_length']]
df = df.dropna()

df.shape

(147802, 7)

In [7]:
df.columns

Index(['state', 'backers_count', 'usd_goal', 'usd_pledged', 'campaign_length',
       'staff_pick', 'blurb_length'],
      dtype='object')

In [8]:
# df_dummies = pd.get_dummies(df[['state','category_main','category_name', 'country','currency']],drop_first=True)
# df = df_dummies.merge(df,how='inner',left_index=True, right_index=True)
# print(df.shape)
# print(df.columns)
# df.info()

In [9]:
# removing all variables that cannot be known before a campaign is launched, such as # backers and $ pledged
df = df.dropna()
df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
df_a = df.reset_index()

# scaler = StandardScaler().fit(df_a)
# df_a = scaler.transform(df_a)
# df = df.iloc[:,1:]

## Scenario 1: Clairvoyance

In [10]:
df = df_a[['state','usd_goal','usd_pledged','backers_count']]

df.to_csv('data/kickstarter_data_ds2_app1.csv',header=False, index=False)
print(df.shape)
df.head()

(147802, 4)


Unnamed: 0,state,usd_goal,usd_pledged,backers_count
0,0,9999.0,3387.0,131
1,1,7500.0,9858.0,99
2,1,3000.0,4611.0,34
3,0,18000.0,4741.0,49
4,1,427.185132,427.185132,13


In [11]:
X = df[['usd_pledged','usd_goal','backers_count']]
y = df['state']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     18512
          1       1.00      1.00      1.00     30263

avg / total       1.00      1.00      1.00     48775



## Scenario 2: Numerical Features

In [12]:
df = df_a[['state','usd_goal','campaign_length','blurb_length']]
# df['state'] = df['state'].replace({'failed': 0, 'successful': 1})
# df = df.reset_index()
# df = df.iloc[:,1:]

df.to_csv('data/kickstarter_data_ds2_app2.csv',header=False, index=False)
print(df.shape)
df.head()

(147802, 4)


Unnamed: 0,state,usd_goal,campaign_length,blurb_length
0,0,9999.0,38,16
1,1,7500.0,30,5
2,1,3000.0,32,23
3,0,18000.0,38,9
4,1,427.185132,21,22


In [13]:
# X = df[['usd_pledged','usd_goal','backers_count']]
X = df[['usd_goal','campaign_length','blurb_length']]
y = df['state']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.71      0.14      0.24     18512
          1       0.65      0.96      0.77     30263

avg / total       0.67      0.65      0.57     48775



## Scenario 3: Mix 'n Match

In [14]:
df = df_a[['state','usd_goal','backers_count','campaign_length']]

df.to_csv('data/kickstarter_data_ds2_app3.csv',header=False, index=False)
print(df.shape)
df.head()

(147802, 4)


Unnamed: 0,state,usd_goal,backers_count,campaign_length
0,0,9999.0,131,38
1,1,7500.0,99,30
2,1,3000.0,34,32
3,0,18000.0,49,38
4,1,427.185132,13,21


In [15]:
X = df[['usd_goal','backers_count','campaign_length']]
y = df['state']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.91      0.89      0.90     18512
          1       0.93      0.95      0.94     30263

avg / total       0.93      0.93      0.93     48775



## Cancer Predictor

In [16]:
df = pd.read_csv('data/haberman.data')
df.head()

Unnamed: 0,30,64,1,1.1
0,30,62,3,1
1,30,65,0,1
2,31,59,2,1
3,31,65,4,1
4,33,58,10,1


In [17]:
df.columns = ['age','year','nodes','survived']
X = df[['age','year','nodes']]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))
# logmodel.predict_proba(X_test)[:,1]

             precision    recall  f1-score   support

          1       0.76      0.93      0.84        74
          2       0.50      0.19      0.27        27

avg / total       0.69      0.73      0.69       101

