# Project 3: Predicting the Success of a Kickstarter Campaign
A supervised learning exercise featuring logistic regression, SVM, KNN, and other classifiers (TODO: list them).

In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy
import os

In [2]:
import sys
# Show which Python interpreter (and therefore which environment) runs this kernel.
sys.executable

'/Users/brianmcmahon/anaconda3/envs/tensorflow1.4/bin/python'

In [3]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    """Return the value of the environment variable ``name``.

    Raises:
        Exception: if ``name`` is not set in the process environment; the
            message names the missing variable.
    """
    if name not in os.environ:
        raise Exception("Expected environment variable '{}' not set.".format(name))
    return os.environ[name]

# Connection settings are read from the environment; the actual values depend
# on your local setup. A missing variable raises immediately.
(POSTGRES_URL,
 POSTGRES_USER,
 POSTGRES_PW,
 POSTGRES_DB) = [
    get_env_variable(var_name)
    for var_name in ("POSTGRES_URL", "POSTGRES_USER", "POSTGRES_PW", "POSTGRES_DB")
]

def to_str(obj):
    """Return the part of ``str(obj)`` that precedes its first space.

    If the string form contains no space, the whole string is returned.
    """
    head, _sep, _rest = str(obj).partition(' ')
    return head

In [4]:
# Assemble the SQLAlchemy connection URL for PostgreSQL via the psycopg2 driver.
# NOTE(review): the credentials are interpolated verbatim; a password containing
# characters such as '@' or '/' would presumably need URL-escaping first - confirm.
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(
    user=POSTGRES_USER,
    pw=POSTGRES_PW,
    url=POSTGRES_URL,
    db=POSTGRES_DB,
)

In [5]:
# Create the SQLAlchemy engine for the database URL; `engine_var` is kept as an
# alias of DB_URL.
engine_var = DB_URL
engine = create_engine(DB_URL)

In [6]:

# Kickstarter projects dataset from Kaggle:
# https://www.kaggle.com/kemical/kickstarter-projects
raw_csv_path = 'data/ks-projects-201801.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [7]:
# Parse the date columns. `launched` carries a time of day, so it is truncated
# to midnight; the campaign length then comes out as a whole number of days.
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched']).dt.normalize()

# Most recent deadlines first.
df = df.sort_values('deadline', ascending=False)

# Campaign length as an integer day count. It was previously derived from the
# string form of the timedelta, which left this numeric feature stored as text.
df['campaign_length'] = (df['deadline'] - df['launched']).dt.days

# Percentage of the USD goal that was pledged, rounded to one decimal place.
df['pct_goal_achieved'] = (df['usd_pledged_real'] / df['usd_goal_real'] * 100).round(1)

print(df.shape)
df.head()

(378661, 17)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
226982,22387366,Nerd Collect,Apps,Technology,GBP,2018-03-03,150000.0,2018-01-02,0.0,live,0,GB,0.0,0.0,204596.6,60,0.0
7164,1036415983,Aikyam : Onnu,Music,Music,USD,2018-03-03,10000.0,2018-01-02,174.0,live,3,US,174.0,174.0,10000.0,60,1.7
180250,1916988520,Back in Black Hills Movie,Drama,Film & Video,USD,2018-03-03,5500.0,2018-01-02,0.0,live,0,US,0.0,0.0,5500.0,60,0.0
71730,1365286494,From the Wilderness,Drama,Film & Video,USD,2018-03-02,2500.0,2018-01-01,482.0,live,19,US,0.0,482.0,2500.0,60,19.3
292034,55596200,Plateforme de statistiques,Apps,Technology,EUR,2018-03-02,9000.0,2018-01-01,0.0,live,0,FR,0.0,0.0,10923.26,60,0.0


In [8]:
# Keep only campaigns with a final outcome (this drops canceled, undefined,
# live and suspended ones) that were launched on or after the start date.
start_date = pd.to_datetime('2017-06-30')
has_final_state = df['state'].isin(['successful', 'failed'])
launched_in_window = df['launched'] >= start_date

# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the full dataset.
df = df[has_final_state & launched_in_window].copy()
print(df.shape)
df.tail()

(19291, 17)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
244899,315499236,Help struggling brooklyn fashion designer with...,Apparel,Fashion,USD,2017-07-07,3500.0,2017-06-30,0.0,failed,0,US,0.0,0.0,3500.0,7,0.0
197010,2003107805,Die Cut Python Stickers,Illustration,Art,USD,2017-07-07,200.0,2017-06-30,158.0,failed,30,US,0.0,158.0,200.0,7,79.0
378594,999667180,VARIANT WARS - The Prisoners,Fiction,Publishing,USD,2017-07-07,325.0,2017-06-30,425.0,successful,7,US,425.0,425.0,325.0,7,130.8
28603,1145118881,Softball Ring Box,Games,Games,USD,2017-07-07,150.0,2017-07-02,65.0,failed,1,US,65.0,65.0,150.0,5,43.3
51419,1261713735,101 Piano Songs. 1 Album,Classical Music,Music,USD,2017-07-05,1000.0,2017-07-02,33.0,failed,3,US,32.0,33.0,1000.0,3,3.3


In [9]:
# Replace each date value with the part of its string form that precedes the
# first space (to_str), i.e. drop any time-of-day component.
for date_col in ('deadline', 'launched'):
    df[date_col] = df[date_col].apply(to_str)

# Index the rows by project ID under the name 'idx'; the 'ID' column is kept.
df = df.set_index(df['ID'].rename('idx'))

In [10]:
# Prep for categorical analysis
# df.main_category = pd.Categorical(df.main_category) 
# df['main_category_code'] = df.main_category.cat.codes

# df.country = pd.Categorical(df.country) 
# df['country_code'] = df.country.cat.codes

# df.currency = pd.Categorical(df.currency) 
# df['currency_code'] = df.currency.cat.codes

# df.state = pd.Categorical(df.state) 
# df['state_code'] = df.state.cat.codes

# df.head(100)

In [11]:
# df['main_category'] = pd.get_dummies('main_category')
# df.columns

In [12]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column from the generated indicator columns.
categorical_cols = ['state', 'main_category', 'country', 'currency']
df_dummies = pd.get_dummies(df[categorical_cols], drop_first=True)

# Quick look at the ID column next to the index.
df['ID'].head()

idx
1031685482    1031685482
1176736360    1176736360
1894703097    1894703097
311863059      311863059
2076121120    2076121120
Name: ID, dtype: int64

In [13]:
# Attach the indicator columns to the original frame, aligned on the shared index.
# An index-on-index join multiplies rows when the index holds duplicates, so
# verify uniqueness first.
assert df.index.is_unique, "project IDs must be unique before joining the dummy columns"

# Unlike merge(), join() raises on overlapping column names instead of silently
# adding _x/_y suffixes, so accidentally re-running this cell fails loudly.
df = df_dummies.join(df, how='inner')

df.columns

Index(['state_successful', 'main_category_Comics', 'main_category_Crafts',
       'main_category_Dance', 'main_category_Design', 'main_category_Fashion',
       'main_category_Film & Video', 'main_category_Food',
       'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'country_AU', 'country_BE', 'country_CA',
       'country_CH', 'country_DE', 'country_DK', 'country_ES', 'country_FR',
       'country_GB', 'country_HK', 'country_IE', 'country_IT', 'country_JP',
       'country_LU', 'country_MX', 'country_NL', 'country_NO', 'country_NZ',
       'country_SE', 'country_SG', 'country_US', 'currency_CAD',
       'currency_CHF', 'currency_DKK', 'currency_EUR', 'currency_GBP',
       'currency_HKD', 'currency_JPY', 'currency_MXN', 'currency_NOK',
       'currency_NZD', 'currency_SEK', 'currency_SGD', 'currency_USD', 'ID',
       'name', 'ca

In [14]:
# Summarise the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19291 entries, 1031685482 to 1261713735
Data columns (total 66 columns):
state_successful              19291 non-null uint8
main_category_Comics          19291 non-null uint8
main_category_Crafts          19291 non-null uint8
main_category_Dance           19291 non-null uint8
main_category_Design          19291 non-null uint8
main_category_Fashion         19291 non-null uint8
main_category_Film & Video    19291 non-null uint8
main_category_Food            19291 non-null uint8
main_category_Games           19291 non-null uint8
main_category_Journalism      19291 non-null uint8
main_category_Music           19291 non-null uint8
main_category_Photography     19291 non-null uint8
main_category_Publishing      19291 non-null uint8
main_category_Technology      19291 non-null uint8
main_category_Theater         19291 non-null uint8
country_AU                    19291 non-null uint8
country_BE                    19291 non-null uint8
country_CA

In [15]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0_level_0,state_successful,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,main_category_Journalism,...,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1031685482,1,0,0,0,0,0,0,0,0,0,...,2017-12-06,6083.0,successful,133,US,1054.0,6083.0,2000.0,27,304.2
1176736360,1,0,0,0,0,0,0,0,0,0,...,2017-11-30,11169.56,successful,177,US,650.0,11169.56,10000.0,33,111.7
1894703097,1,0,0,0,0,0,0,0,0,0,...,2017-11-28,25417.2,successful,168,DE,0.0,30615.02,30112.5,35,101.7
311863059,1,0,0,0,0,0,0,0,0,0,...,2017-12-09,1743.0,successful,39,US,0.0,1743.0,1000.0,24,174.3
2076121120,0,0,0,0,0,0,0,1,0,0,...,2017-11-03,1.0,failed,1,US,0.0,1.0,200000.0,60,0.0


In [16]:
# Write the frame to the `kickstarter_data` table, replacing any existing table.
# Best effort: a failure is only printed and the notebook carries on.
try:
    df.to_sql(
        "kickstarter_data",
        engine,
        if_exists='replace',
        chunksize=20000,
    )
except Exception as err:
    print(err)

# Temporary fix:
# df.to_pickle('data/kickstarter_data.pkl')
# df.to_csv('data/kickstarter_data.csv')

## State of Success [this should now be covered in EDA]

In [17]:
# df_state = df.groupby(["state"]).count().sort_values(['name'], ascending=[False])
# df_state = pd.DataFrame(df_state, columns={'name'})
# print(df_state.shape)
# df_state

In [18]:
# df_state = df.groupby(["state"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_state = pd.DataFrame(df_state, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_state.shape)
# df_state

In [19]:
# df_country = df.groupby(["country"]).count().sort_values(['name'], ascending=[False])
# df_country = pd.DataFrame(df_country, columns={'name'})
# print(df_country.shape)
# df_country

In [20]:
# df_country = df.groupby(["country"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_country = pd.DataFrame(df_country, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_country.shape)
# df_country

In [21]:
# df_currency = df.groupby(["currency"]).count().sort_values(['name'], ascending=[False])
# df_currency = pd.DataFrame(df_currency, columns={'name'})
# print(df_currency.shape)
# df_currency

In [22]:
# df_currency = df.groupby(["currency"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_currency = pd.DataFrame(df_currency, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_currency.shape)
# df_currency

In [23]:
# df_main_category = df.groupby(["main_category"]).count().sort_values(['name'], ascending=[False])
# df_main_category = pd.DataFrame(df_main_category, columns={'name'})
# print(df_main_category.shape)
# df_main_category

In [24]:
# df_main_category = df.groupby(["main_category"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_main_category = pd.DataFrame(df_main_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_main_category.shape)
# df_main_category

In [25]:
# df_category = df.groupby(["category"]).count().sort_values(['name'], ascending=[False])
# print(df_category.shape)
# df_category

In [26]:
# df_category = df.groupby(["category"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_category = pd.DataFrame(df_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_category.shape)
# df_category