# Project 3: Predicting the Success of a Kickstarter Campaign
A supervised learning exercise featuring logistic regression, SVM, KNN, and [Others?]

In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy
import os

In [2]:
import sys
sys.executable

'/Users/brianmcmahon/anaconda3/envs/tensorflow1.4/bin/python'

In [3]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

def to_str(obj):
    return str(obj).split(' ')[0]

In [4]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

# app.config['SQLALCHEMY_DATABASE_URI'] = DB_URL
# app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False # silence the deprecation warning

# db = SQLAlchemy(app)

In [5]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [6]:
# pd.read_sql_query('''SELECT * FROM allstarfull LIMIT 5''',engine)

In [7]:

df = pd.read_csv('data/ks-projects-201801.csv') # From kaggle dataset at https://www.kaggle.com/kemical/kickstarter-projects
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [8]:
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched']) 
df['launched'] = df['launched'].apply(lambda x: x.strftime('%Y-%m-%d'))
df['launched'] = pd.to_datetime(df['launched']) 
df = df.sort_values(['deadline'], ascending=[False])
df['campaign_length'] = (df['deadline'] - df['launched']).apply(to_str)
df['pct_goal_achieved'] = (df['usd_pledged_real'] / df['usd_goal_real'])*100
# df = df.set_index('ID')
print(df.shape)
df.head()

(378661, 17)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
226982,22387366,Nerd Collect,Apps,Technology,GBP,2018-03-03,150000.0,2018-01-02,0.0,live,0,GB,0.0,0.0,204596.6,60,0.0
7164,1036415983,Aikyam : Onnu,Music,Music,USD,2018-03-03,10000.0,2018-01-02,174.0,live,3,US,174.0,174.0,10000.0,60,1.74
180250,1916988520,Back in Black Hills Movie,Drama,Film & Video,USD,2018-03-03,5500.0,2018-01-02,0.0,live,0,US,0.0,0.0,5500.0,60,0.0
71730,1365286494,From the Wilderness,Drama,Film & Video,USD,2018-03-02,2500.0,2018-01-01,482.0,live,19,US,0.0,482.0,2500.0,60,19.28
292034,55596200,Plateforme de statistiques,Apps,Technology,EUR,2018-03-02,9000.0,2018-01-01,0.0,live,0,FR,0.0,0.0,10923.26,60,0.0


In [9]:
# remove canceled, undefined, live and suspended
df = df[(df['state'] == 'successful') | (df['state'] == 'failed')]
df = pd.DataFrame(df)
start_date = pd.to_datetime('2017-06-30') 
df = df[df['launched'] >= start_date] # filter from start date to current
print(df.shape)
df.tail()   

(19291, 17)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
244899,315499236,Help struggling brooklyn fashion designer with...,Apparel,Fashion,USD,2017-07-07,3500.0,2017-06-30,0.0,failed,0,US,0.0,0.0,3500.0,7,0.0
197010,2003107805,Die Cut Python Stickers,Illustration,Art,USD,2017-07-07,200.0,2017-06-30,158.0,failed,30,US,0.0,158.0,200.0,7,79.0
378594,999667180,VARIANT WARS - The Prisoners,Fiction,Publishing,USD,2017-07-07,325.0,2017-06-30,425.0,successful,7,US,425.0,425.0,325.0,7,130.769231
28603,1145118881,Softball Ring Box,Games,Games,USD,2017-07-07,150.0,2017-07-02,65.0,failed,1,US,65.0,65.0,150.0,5,43.333333
51419,1261713735,101 Piano Songs. 1 Album,Classical Music,Music,USD,2017-07-05,1000.0,2017-07-02,33.0,failed,3,US,32.0,33.0,1000.0,3,3.3


In [10]:
df['ID'] = df['ID'].apply(to_str)
df['deadline'] = df['deadline'].apply(to_str)
df['launched'] = df['launched'].apply(to_str)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19291 entries, 6201 to 51419
Data columns (total 17 columns):
ID                   19291 non-null object
name                 19291 non-null object
category             19291 non-null object
main_category        19291 non-null object
currency             19291 non-null object
deadline             19291 non-null object
goal                 19291 non-null float64
launched             19291 non-null object
pledged              19291 non-null float64
state                19291 non-null object
backers              19291 non-null int64
country              19291 non-null object
usd pledged          19291 non-null float64
usd_pledged_real     19291 non-null float64
usd_goal_real        19291 non-null float64
campaign_length      19291 non-null object
pct_goal_achieved    19291 non-null float64
dtypes: float64(6), int64(1), object(10)
memory usage: 2.6+ MB


In [12]:
try:
    df.to_sql("kickstarter_data", engine, chunksize=20000, if_exists='replace')
except Exception as e:
    print(e)
    pass

# Temporary fix:
# df.to_pickle('data/kickstarter_data.pkl')
# df.to_csv('data/kickstarter_data.csv')

## State of Success [this should now be covered in EDA]

In [13]:
df_state = df.groupby(["state"]).count().sort_values(['name'], ascending=[False])
df_state = pd.DataFrame(df_state, columns={'name'})
print(df_state.shape)
df_state

(2, 1)


Unnamed: 0_level_0,name
state,Unnamed: 1_level_1
failed,10923
successful,8368


In [14]:
df_state = df.groupby(["state"]).sum().sort_values(['usd pledged'], ascending=[False])
df_state = pd.DataFrame(df_state, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
print(df_state.shape)
df_state

(2, 3)


Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,usd pledged
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
successful,249382900.0,81238880.0,45388879.68
failed,13389830.0,793109000.0,3238970.87


In [15]:
df_country = df.groupby(["country"]).count().sort_values(['name'], ascending=[False])
df_country = pd.DataFrame(df_country, columns={'name'})
print(df_country.shape)
df_country

(22, 1)


Unnamed: 0_level_0,name
country,Unnamed: 1_level_1
US,12204
GB,2207
CA,962
AU,581
DE,519
MX,467
FR,353
IT,329
ES,279
NL,216


In [16]:
df_country = df.groupby(["country"]).sum().sort_values(['usd pledged'], ascending=[False])
df_country = pd.DataFrame(df_country, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
print(df_country.shape)
df_country

(22, 3)


Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,usd pledged
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
US,187266300.0,672317400.0,34464582.72
GB,18719780.0,46084010.0,3834655.55
AU,5774700.0,14196350.0,1476295.43
CA,7231933.0,31229990.0,1231199.71
FR,5389620.0,6461087.0,1153789.35
NL,5824917.0,4211652.0,1054833.26
DE,6270310.0,18477280.0,954465.39
SE,2641517.0,15549800.0,792333.96
HK,5019316.0,4539081.0,629163.34
ES,2572853.0,7151126.0,564411.79


In [17]:
df_currency = df.groupby(["currency"]).count().sort_values(['name'], ascending=[False])
df_currency = pd.DataFrame(df_currency, columns={'name'})
print(df_currency.shape)
df_currency

(14, 1)


Unnamed: 0_level_0,name
currency,Unnamed: 1_level_1
USD,12204
GBP,2207
EUR,1941
CAD,962
AUD,581
MXN,467
HKD,200
SGD,181
SEK,152
NZD,138


In [18]:
df_currency = df.groupby(["currency"]).sum().sort_values(['usd pledged'], ascending=[False])
df_currency = pd.DataFrame(df_currency, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
print(df_currency.shape)
df_currency

(14, 3)


Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,usd pledged
currency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
USD,187266300.0,672317400.0,34464582.72
EUR,27642810.0,73154340.0,4700376.53
GBP,18719780.0,46084010.0,3834655.55
AUD,5774700.0,14196350.0,1476295.43
CAD,7231933.0,31229990.0,1231199.71
SEK,2641517.0,15549800.0,792333.96
HKD,5019316.0,4539081.0,629163.34
CHF,3459366.0,3356332.0,494245.22
NZD,1178904.0,1463891.0,336151.89
SGD,1741028.0,2690394.0,293335.71


In [19]:
df_main_category = df.groupby(["main_category"]).count().sort_values(['name'], ascending=[False])
df_main_category = pd.DataFrame(df_main_category, columns={'name'})
print(df_main_category.shape)
df_main_category

(15, 1)


Unnamed: 0_level_0,name
main_category,Unnamed: 1_level_1
Games,2541
Design,2127
Technology,2081
Publishing,2005
Film & Video,1943
Music,1774
Fashion,1731
Art,1466
Food,1167
Comics,824


In [20]:
df_main_category = df.groupby(["main_category"]).sum().sort_values(['usd pledged'], ascending=[False])
df_main_category = pd.DataFrame(df_main_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
print(df_main_category.shape)
df_main_category

(15, 3)


Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,usd pledged
main_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Games,70608172.84,48148460.0,16012906.59
Design,66428025.84,52197320.0,11227552.65
Technology,56029162.8,106528000.0,9839204.61
Film & Video,13787683.4,396225000.0,1888845.34
Publishing,10911971.51,22718820.0,1824525.49
Fashion,9388519.35,41281780.0,1816747.06
Comics,5644728.9,5096887.0,1352001.51
Art,6768373.09,110744100.0,1263911.07
Music,8672763.71,25039470.0,1218577.27
Food,7374668.61,31109740.0,1165028.68


In [21]:
df_category = df.groupby(["category"]).count().sort_values(['name'], ascending=[False])
print(df_category.shape)
df_category

(157, 16)


Unnamed: 0_level_0,ID,name,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,campaign_length,pct_goal_achieved
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Product Design,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675,1675
Tabletop Games,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281
Music,613,613,613,613,613,613,613,613,613,613,613,613,613,613,613,613
Apparel,595,595,595,595,595,595,595,595,595,595,595,595,595,595,595,595
Video Games,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556
Accessories,528,528,528,528,528,528,528,528,528,528,528,528,528,528,528,528
Apps,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503
Art,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478
Children's Books,427,427,427,427,427,427,427,427,427,427,427,427,427,427,427,427
Documentary,381,381,381,381,381,381,381,381,381,381,381,381,381,381,381,381


In [22]:
df_category = df.groupby(["category"]).sum().sort_values(['usd pledged'], ascending=[False])
df_category = pd.DataFrame(df_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
print(df_category.shape)
df_category

(157, 3)


Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,usd pledged
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tabletop Games,59007489.07,15384499.07,13415509.16
Product Design,54426113.02,34575051.56,9595443.55
Hardware,11177130.20,10072078.60,2111388.14
Technology,8161553.50,27381255.88,1930757.19
Wearables,8721065.16,4006337.79,1748151.62
Design,9818868.51,4752523.21,1463699.06
Video Games,6294961.38,17368431.54,1347752.60
Gadgets,6038475.80,7572407.85,941613.56
Apparel,3609987.18,9163228.58,663330.96
3D Printing,2952991.02,1519064.73,662054.09
