# Load Data

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

In [2]:
# Connect to the local SQLite file holding the Kickstarter campaign data.
engine = create_engine(
    "sqlite:///db/kickstarter_campaigns.sqlite",
    echo=False,
)

# ORM session bound to that engine.
session = Session(bind=engine)

# Reflect the existing schema into automapped classes and keep a handle on the
# campaigns table.
# NOTE(review): `reflect=True` is flagged as deprecated by newer SQLAlchemy
# releases -- confirm against the installed version before upgrading.
Base = automap_base()
Base.prepare(engine, reflect=True)
Kickstarter = Base.classes.kickstarter_campaigns

In [3]:
# Read the whole kickstarter_campaigns table into a DataFrame in a single call.
# pd.read_sql_query replaces the manual row-by-row append loop, removes the
# throwaway empty DataFrame that was immediately overwritten, and avoids
# Engine.execute(), which newer SQLAlchemy versions no longer provide.
kickstarter_df = pd.read_sql_query('SELECT * FROM kickstarter_campaigns', engine)

# Labels are assigned positionally, so this list must stay in the same order as
# the table's columns; pandas raises if the number of labels does not match.
kickstarter_df.columns = ['index','backers_count','blurb','country','currency','deadline','fx_rate','goal','id',
                          'name','pledged','slug','state','category_id','category_name','parent_category_id',
                          'creator_profile','creator_name','start_date','campaign_length','month_started','state_or_province',
                          'parent_category','blurb_length','blurb_sentiment_compound','blurb_sentiment_positive',
                          'blurb_sentiment_negative','blurb_sentiment_neutral','title_length','title_sentiment_compound',
                          'title_sentiment_positive','title_sentiment_negative','title_sentiment_neutral','usd_goal',
                          'pct_complete']

# Use the stored 'index' column as the DataFrame index.
kickstarter_df = kickstarter_df.set_index('index')
print(f'{len(kickstarter_df)} rows of data loaded')
kickstarter_df.head()

188137 rows of data loaded


Unnamed: 0_level_0,backers_count,blurb,country,currency,deadline,fx_rate,goal,id,name,pledged,...,blurb_sentiment_positive,blurb_sentiment_negative,blurb_sentiment_neutral,title_length,title_sentiment_compound,title_sentiment_positive,title_sentiment_negative,title_sentiment_neutral,usd_goal,pct_complete
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,80,I will be an artist-in-residence at Elsewhere ...,US,USD,2012-04-19 15:16:00,1.0,2800.0,1562040083,Elsewhere Studios artist-in-residency program!,3596.0,...,0.0,0.0,1.0,46,0.0,0.0,0.0,1.0,2800.0,1.284286
2,47,We are looking to bring a Visiting Sculptor fr...,US,USD,2012-04-20 17:06:38,1.0,3900.0,1437561817,Martin Luther King Jr. Sculpture on Campus!,4117.0,...,0.084,0.0,0.916,43,0.0,0.0,0.0,1.0,3900.0,1.055641
3,80,Surrealistic oil paintings capturing the metam...,US,USD,2012-04-16 22:59:00,1.0,750.0,574125813,EMERGENCE: Surreal Oil Paintings by J.J. Long,3125.0,...,0.0,0.0,1.0,45,0.0,0.0,0.0,1.0,750.0,4.166667
4,82,1000 Artists is a public art-making installati...,US,USD,2012-05-07 20:22:25,1.0,4500.0,858990141,1000 Artists: Presidential Inauguration 2013,4586.0,...,0.0,0.0,1.0,44,0.0,0.0,0.0,1.0,4500.0,1.019111
5,31,P.M.A.F.T.W. my upcoming solo show June 2012 a...,US,USD,2012-04-02 21:57:23,1.0,1000.0,566704999,P.M.A.F.T.W.,1036.0,...,0.0,0.0,1.0,12,0.0,0.0,0.0,1.0,1000.0,1.036


# Create ML Models

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

Using TensorFlow backend.


In [15]:
# Feature matrix: start from the full frame and discard everything that is not
# fed to the model directly.
X = kickstarter_df.drop(
    [
        # categorical columns -- label-encoded and added back to X in a later cell
        'country', 'currency', 'category_name', 'parent_category', 'month_started',
        # the prediction target
        'state',
        # remaining columns that are not used as features
        'blurb', 'deadline', 'fx_rate', 'id', 'name', 'slug',
        'category_id', 'parent_category_id', 'creator_profile', 'creator_name',
        'start_date', 'state_or_province', 'backers_count', 'pledged',
        'pct_complete', 'goal',
    ],
    axis=1,
)

# Raw categorical series, in the positional order the encoding cell relies on:
# [0] country, [1] currency, [2] category_name, [3] parent_category, [4] month_started
X_categorical_data = [
    kickstarter_df[column]
    for column in ('country', 'currency', 'category_name', 'parent_category', 'month_started')
]

# Target labels.
y = kickstarter_df['state']

In [18]:
# NOTE: encoding the state column failed for an unknown reason, so it has been
# left out for now (presumably this refers to state_or_province -- confirm).

# Integer-encode each categorical series. Every column gets its own fitted
# encoder so the original labels stay recoverable from the codes.
# X_categorical_data order: country, currency, category_name, parent_category,
# month_started.
country_label_encoder = LabelEncoder()
country_encoded = country_label_encoder.fit_transform(X_categorical_data[0])

currency_label_encoder = LabelEncoder()
currency_encoded = currency_label_encoder.fit_transform(X_categorical_data[1])

category_label_encoder = LabelEncoder()
category_encoded = category_label_encoder.fit_transform(X_categorical_data[2])

parent_category_label_encoder = LabelEncoder()
parent_category_encoded = parent_category_label_encoder.fit_transform(X_categorical_data[3])

month_label_encoder = LabelEncoder()
month_encoded = month_label_encoder.fit_transform(X_categorical_data[4])

# One-hot expansion of these codes via to_categorical is left disabled; the
# integer codes are used as-is.

In [20]:
# Append the integer-encoded categorical columns to the feature matrix, in the
# same order as before (this mutates X in place).
for column_name, codes in [
    ('parent_category', parent_category_encoded),
    ('country', country_encoded),
    ('currency', currency_encoded),
    ('category', category_encoded),
    ('month', month_encoded),
]:
    X[column_name] = codes

X.tail()

Unnamed: 0_level_0,campaign_length,blurb_length,blurb_sentiment_compound,blurb_sentiment_positive,blurb_sentiment_negative,blurb_sentiment_neutral,title_length,title_sentiment_compound,title_sentiment_positive,title_sentiment_negative,title_sentiment_neutral,usd_goal,parent_category,country,currency,category,month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
188133,30.0,86,0.4019,0.172,0.0,0.828,49,0.0,0.0,0.0,1.0,1200.0,13,21,13,133,8
188134,30.0,117,0.4767,0.194,0.0,0.806,37,0.0,0.0,0.0,1.0,400000.0,13,21,13,133,11
188135,30.0,0,0.0,0.0,0.0,1.0,14,0.0,0.0,0.0,1.0,1500000.0,13,21,13,133,2
188136,29.96,127,0.3164,0.113,0.0,0.887,32,0.0,0.0,0.0,1.0,25000.0,13,21,13,133,3
188137,59.96,111,0.7506,0.33,0.0,0.67,27,0.0,0.0,0.0,1.0,80000.0,13,21,13,133,7


In [21]:
# Standardise the feature matrix and keep the fitted scaler for later reuse.
# NOTE(review): the scaler is fitted on the full data set, and the
# train/test split further down uses the unscaled X rather than X_scaled.
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

In [22]:
# Turn the target labels into integer codes, then into a one-hot matrix.
# NOTE(review): the model cell below is trained on the raw y labels, not on
# y_encoded / y_categorical.
y_label_encoder = LabelEncoder()
y_encoded = y_label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out a test set. The seed is fixed so the split is repeatable; no
# test_size is given, so the library's default proportion applies.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [24]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the training split and record accuracy on
# both splits.
# random_state is pinned (matching the seed used for the train/test split) so
# the scores and feature importances are reproducible across
# Restart & Run All; without it every run trains a different forest.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# A large gap between these two numbers indicates the forest is overfitting
# the training data.
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)

In [25]:
# Rank features by the forest's importance scores, highest first.
importances = rf.feature_importances_
sorted(
    zip(importances, X.columns),
    reverse=True,
)

[(0.14874010269435251, 'category'),
 (0.12492683604794318, 'usd_goal'),
 (0.093217568993944552, 'title_length'),
 (0.086769354718654698, 'parent_category'),
 (0.083827462299323752, 'blurb_length'),
 (0.07990412863535816, 'campaign_length'),
 (0.065556676823374946, 'blurb_sentiment_compound'),
 (0.062966795947168641, 'blurb_sentiment_neutral'),
 (0.059682673917949638, 'month'),
 (0.058620720444886872, 'blurb_sentiment_positive'),
 (0.027216751959787934, 'title_sentiment_compound'),
 (0.025820951052620385, 'title_sentiment_neutral'),
 (0.023377342724828679, 'blurb_sentiment_negative'),
 (0.021603199450215223, 'title_sentiment_positive'),
 (0.014716622149512739, 'country'),
 (0.013621729868518935, 'currency'),
 (0.0094310822715591262, 'title_sentiment_negative')]

In [26]:
# Training accuracy on the first line, test accuracy on the second.
print(train_score, test_score, sep='\n')

0.999985825856
0.738450090358


In [None]:
# from sklearn.externals import joblib
# joblib.dump(rf, f'models/{model_name}.pkl') 

In [None]:
# loaded_model = joblib.load(f'models/{model_name}.pkl')

In [None]:
# loaded_model.score(X_num_test, y_test)