# Project 3: Predicting the Success of a Kickstarter Campaign
A supervised learning exercise featuring logistic regression, SVM, KNN, and possibly other classifiers

In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy
import os
import json

In [2]:
import sys
# Display the path of the Python interpreter this kernel is running on.
sys.executable

'/Users/brianmcmahon/anaconda3/envs/tensorflow1.4/bin/python'

In [3]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    """Return the value of the environment variable *name*.

    Raises:
        Exception: if the variable is not present in ``os.environ``; the
            message names the missing variable.
    """
    try:
        value = os.environ[name]
    except KeyError:
        raise Exception("Expected environment variable '{}' not set.".format(name))
    return value

# Connection settings are read from the environment, in this order; their
# values depend on your setup. A missing variable raises from get_env_variable.
(
    POSTGRES_URL,
    POSTGRES_USER,
    POSTGRES_PW,
    POSTGRES_DB,
) = map(
    get_env_variable,
    ("POSTGRES_URL", "POSTGRES_USER", "POSTGRES_PW", "POSTGRES_DB"),
)

def to_str(obj):
    """Return the text of ``str(obj)`` that precedes its first space.

    If the string form contains no space, the whole string is returned.
    """
    head, _sep, _tail = str(obj).partition(' ')
    return head

def count_list_items(list):
    """Return ``len(list)``, or 0 when the value has no length.

    Used with ``Series.apply`` on a column of token lists in which missing
    entries are not lists (e.g. a float NaN), so those count as 0 items.

    The parameter keeps its original name (which shadows the builtin inside
    this function only) so existing keyword callers keep working.
    """
    try:
        return len(list)
    except TypeError:
        # Only "object has no len()" is treated as empty. The previous bare
        # ``except`` also swallowed KeyboardInterrupt/SystemExit.
        return 0

In [4]:
# Assemble the database URL from the four POSTGRES_* settings.
_db_url_fields = {
    'user': POSTGRES_USER,
    'pw': POSTGRES_PW,
    'url': POSTGRES_URL,
    'db': POSTGRES_DB,
}
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(**_db_url_fields)

In [5]:
# Alias the connection URL, then build the SQLAlchemy engine from it.
# `engine` is what the later to_sql() call writes through.
engine_var = DB_URL
engine = create_engine(engine_var)

In [6]:
# data at WebRobots https://webrobots.io/kickstarter-datasets/
# The files are read as Kickstarter001.csv .. Kickstarter048.csv.
csv_range = range(1, 49)

# Collect the per-file frames and concatenate once at the end.
# DataFrame.append inside the loop re-copied the accumulated data on every
# iteration and was removed in pandas 2.0.
frames = []
for csv in csv_range:
    csv = str(csv).zfill(3)
    path ='data/Kickstarter_2018-01-12T10_20_09_196Z/Kickstarter{}.csv'.format(csv)
    frame = pd.read_csv(path)

    # NOTE(review): this adds a column *named after the file number* ('001',
    # '002', ...), so the combined frame gains one mostly-empty column per
    # file. A single marker column (frame['csv'] = csv) was probably intended.
    # Kept as-is because later cells select columns by name and never use
    # these markers.
    frame[csv] = csv
    frames.append(frame)

# sort=True keeps the alphabetically sorted column order of the combined frame.
df = pd.concat(frames, ignore_index=True, sort=True)

In [7]:
# Size and column names of the combined frame, one per line, then a preview.
print(df.shape, df.columns, sep='\n')
df.head()

(192716, 85)
Index(['001', '002', '003', '004', '005', '006', '007', '008', '009', '010',
       '011', '012', '013', '014', '015', '016', '017', '018', '019', '020',
       '021', '022', '023', '024', '025', '026', '027', '028', '029', '030',
       '031', '032', '033', '034', '035', '036', '037', '038', '039', '040',
       '041', '042', '043', '044', '045', '046', '047', '048', 'backers_count',
       'blurb', 'category', 'converted_pledged_amount', 'country',
       'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')


Unnamed: 0,001,002,003,004,005,006,007,008,009,010,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,1,,,,,,,,,,...,stardust-limited-edition-print-launch,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435485613,1.529099,"{""web"":{""project"":""https://www.kickstarter.com...",2061.225115,domestic
1,1,,,,,,,,,,...,duality-deck-artist-oracle-cards-and-book,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435784557,1.0968,"{""web"":{""project"":""https://www.kickstarter.com...",5442.323089,domestic
2,1,,,,,,,,,,...,rainbow-fantasy-fairy-art-postcard-pixiepocalypse,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435795239,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",5798.0,domestic
3,1,,,,,,,,,,...,day-of-the-dead-thor-calavera-vinyl-stickers,https://www.kickstarter.com/discover/categorie...,True,False,successful,1434418124,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",817.0,domestic
4,1,,,,,,,,,,...,monster-book-project,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435686362,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2560.0,domestic


In [8]:
# Look at one raw 'category' value (positional row 30000) to see its format.
df['category'].iloc[30000]

'{"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/crafts/woodworking"}},"color":16744876,"parent_id":26,"name":"Woodworking","id":356,"position":14,"slug":"crafts/woodworking"}'

In [9]:
# Keep only rows whose state is 'successful' or 'failed'.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells operate on a real DataFrame instead of a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df[df['state'].isin(['successful', 'failed'])].copy()

In [10]:
def _main_category(raw_category):
    """Return piece 5 of the '/'-split discover URL in a category JSON string.

    e.g. 'http://www.kickstarter.com/discover/categories/crafts/woodworking'
    yields 'crafts'.
    """
    discover_url = json.loads(raw_category)['urls']['web']['discover']
    return discover_url.split('/')[5]


df['category_main'] = [_main_category(raw) for raw in df['category']]
# The one URL-encoded value is renamed to a plain identifier.
df['category_main'] = df['category_main'].replace({'film%20&%20video': 'film_and_video'})
df['category_main'].unique()

array(['art', 'comics', 'crafts', 'dance', 'design', 'fashion',
       'film_and_video', 'food', 'games', 'journalism', 'music',
       'photography', 'publishing', 'technology', 'theater'], dtype=object)

In [11]:
def _json_name(raw_json):
    """Return the 'name' entry of a JSON-encoded object string."""
    return json.loads(raw_json)['name']


# 'category' and 'creator' both hold JSON strings; keep just their names.
df['category_name'] = [_json_name(raw) for raw in df['category']]
df['creator_name'] = [_json_name(raw) for raw in df['creator']]

# Lower-cased, whitespace-split blurb tokens, then the token count per row.
blurb_tokens = df['blurb'].str.lower().str.split()
df['blurb_len2'] = blurb_tokens
df['blurb_length'] = df['blurb_len2'].apply(count_list_items)

In [12]:
# Epoch seconds -> timestamps, converted in place.
for _ts_col in ('created_at', 'launched_at', 'deadline'):
    df[_ts_col] = pd.to_datetime(df[_ts_col], unit='s')

# Whole calendar days between the launch date and the deadline date.
# Times of day are dropped first, so this matches a date-minus-date
# subtraction. Taken directly as integer days instead of parsing the text form
# of a timedelta, which depends on the timedelta type and can yield
# non-numeric text for a zero-day difference.
campaign_days = (
    df['deadline'].dt.normalize() - df['launched_at'].dt.normalize()
).dt.days

# Calendar-date (datetime.date) versions of the three timestamps.
# 'created_at' and 'launched_at' keep their timestamp form; 'deadline' is
# replaced by its date.
df['created'] = df['created_at'].dt.date
df['launched'] = df['launched_at'].dt.date
df['deadline'] = df['deadline'].dt.date

# Flags as 0/1 integers.
df['spotlight'] = df['spotlight'].astype(int)
df['staff_pick'] = df['staff_pick'].astype(int)

df['campaign_length'] = campaign_days

# NOTE(review): 'usd_pledged' is USD while 'goal' may be in the campaign's own
# currency (the frame also has 'pledged' and 'static_usd_rate'); if so this
# ratio is off for non-USD campaigns. Confirm against the dataset docs.
df['pct_goal_achieved'] = (df['usd_pledged'] / df['goal']).round(1)
df.shape

(178076, 94)

In [13]:

# Order rows by deadline, latest first; the shape is shown as a sanity check.
df = df.sort_values(by='deadline', ascending=False)
df.shape

(178076, 94)

In [14]:
# Columns carried forward; everything else in the frame is discarded here.
kept_columns = [
    'id',
    'name',
    'state',
    'category_main',
    'category_name',
    'backers_count',
    'pct_goal_achieved',
    'usd_pledged',
    'goal',
    'country',
    'currency',
    'campaign_length',
    'deadline',
    'launched',
    'created',
    'spotlight',
    'staff_pick',
    'creator_name',
    'blurb_length',
]
# Rows with a missing value in any kept column are removed.
df = df[kept_columns].dropna()
print(df.shape, df.columns, sep='\n')
df.head()

(178075, 19)
Index(['id', 'name', 'state', 'category_main', 'category_name',
       'backers_count', 'pct_goal_achieved', 'usd_pledged', 'goal', 'country',
       'currency', 'campaign_length', 'deadline', 'launched', 'created',
       'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')


Unnamed: 0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
50474,1725323227,Inspire young girls,failed,fashion,Childrenswear,1,0.0,30.0,1300.0,US,USD,30,2018-01-12,2017-12-13,2017-12-08,0,0,Rayna,6
96852,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,1,Holly Weist,5
82814,1516902916,Dreaming Creek Brewery,failed,food,Drinks,64,0.3,6139.0,20000.0,US,USD,30,2018-01-12,2017-12-13,2017-08-11,0,0,Mike Bradley,19
97879,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,0,B.Y. Eidelman,16
49869,1361347175,New Boutique Funding for the San Antonio Stock...,failed,fashion,Ready-to-wear,0,0.0,0.0,5000.0,US,USD,15,2018-01-12,2017-12-28,2017-12-22,0,0,Darrian Fosty,26


In [15]:
# Last rows of the frame; after the descending sort these have the earliest deadlines.
df.tail()

Unnamed: 0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
159484,266044220,Help me write my second novel.,successful,publishing,Fiction,18,1.1,563.0,500.0,US,USD,30,2009-05-29,2009-04-29,2009-04-29,1,1,Brendan Mackie,24
2373,199916122,Mr. Squiggles,failed,art,Illustration,0,0.0,0.0,30.0,US,USD,10,2009-05-22,2009-05-12,2009-05-12,0,0,C.K. Sample,22
109400,2089078683,New York Makes a Book!!,successful,journalism,Journalism,110,1.1,3329.0,3000.0,US,USD,18,2009-05-16,2009-04-28,2009-04-27,1,1,We Make a Book,22
6227,1703704063,drawing for dollars,successful,art,Illustration,3,1.8,35.0,20.0,US,USD,9,2009-05-03,2009-04-24,2009-04-24,1,1,darkpony,26
980,1703704063,drawing for dollars,successful,art,Illustration,3,1.8,35.0,20.0,US,USD,9,2009-05-03,2009-04-24,2009-04-24,1,1,darkpony,26


In [16]:
# staff_pick was cast to int earlier, so the sum counts the rows flagged 1.
df.staff_pick.sum()

25660

In [17]:
# df.state.sum()

In [18]:
# Row/column count before duplicates are removed.
df.shape

(178075, 19)

In [19]:
# Remove rows that are identical across every remaining column.
# NOTE(review): rows sharing an 'id' but differing in any other column are
# all kept; confirm whether de-duplicating on 'id' alone was intended.
df = df.drop_duplicates()
df.shape

(163426, 19)

In [20]:
# Column names of the frame after de-duplication.
df.columns

Index(['id', 'name', 'state', 'category_main', 'category_name',
       'backers_count', 'pct_goal_achieved', 'usd_pledged', 'goal', 'country',
       'currency', 'campaign_length', 'deadline', 'launched', 'created',
       'spotlight', 'staff_pick', 'creator_name', 'blurb_length'],
      dtype='object')

In [21]:
# Columns that are converted to numeric dtypes.
num_cols = [
    'backers_count', 'pct_goal_achieved', 'usd_pledged',
    'goal', 'campaign_length', 'blurb_length',
]
# errors='coerce' turns any value that cannot be parsed into NaN.
for _numeric_col in num_cols:
    df[_numeric_col] = pd.to_numeric(df[_numeric_col], errors='coerce')

In [30]:
# Preview the first rows after the numeric conversion.
df.head()

Unnamed: 0,id,name,state,category_main,category_name,backers_count,pct_goal_achieved,usd_pledged,goal,country,currency,campaign_length,deadline,launched,created,spotlight,staff_pick,creator_name,blurb_length
50474,1725323227,Inspire young girls,failed,fashion,Childrenswear,1,0.0,30.0,1300.0,US,USD,30,2018-01-12,2017-12-13,2017-12-08,0,0,Rayna,6
96852,2065169465,Cotton-Top Pastries,successful,food,Small Batch,99,1.3,9858.0,7500.0,US,USD,30,2018-01-12,2017-12-13,2017-12-12,1,1,Holly Weist,5
82814,1516902916,Dreaming Creek Brewery,failed,food,Drinks,64,0.3,6139.0,20000.0,US,USD,30,2018-01-12,2017-12-13,2017-08-11,0,0,Mike Bradley,19
97879,1396766240,Ripple Playing Cards - Printed by USPCC,failed,games,Playing Cards,131,0.3,3387.0,9999.0,US,USD,38,2018-01-12,2017-12-05,2017-10-08,0,0,B.Y. Eidelman,16
49869,1361347175,New Boutique Funding for the San Antonio Stock...,failed,fashion,Ready-to-wear,0,0.0,0.0,5000.0,US,USD,15,2018-01-12,2017-12-28,2017-12-22,0,0,Darrian Fosty,26


In [31]:
# Write the frame to a local pickle first; this runs before, and independently
# of, the database write below.
df.to_pickle('data/kickstarter_data_ds2.pkl')

# Best-effort write to the database table (replacing any existing table):
# a failure is printed and the cell carries on.
try:
    df.to_sql(
        name="kickstarter_data_ds2",
        con=engine,
        chunksize=20000,
        if_exists='replace',
    )
except Exception as err:
    print(err)

# Temporary fix:

# df.to_csv('data/kickstarter_data.csv')

## State of Success [this should now be covered in EDA]

In [32]:
# df_state = df.groupby(["state"]).count().sort_values(['name'], ascending=[False])
# df_state = pd.DataFrame(df_state, columns={'name'})
# print(df_state.shape)
# df_state

In [33]:
# df_state = df.groupby(["state"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_state = pd.DataFrame(df_state, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_state.shape)
# df_state

In [34]:
# df_country = df.groupby(["country"]).count().sort_values(['name'], ascending=[False])
# df_country = pd.DataFrame(df_country, columns={'name'})
# print(df_country.shape)
# df_country

In [35]:
# df_country = df.groupby(["country"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_country = pd.DataFrame(df_country, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_country.shape)
# df_country

In [36]:
# df_currency = df.groupby(["currency"]).count().sort_values(['name'], ascending=[False])
# df_currency = pd.DataFrame(df_currency, columns={'name'})
# print(df_currency.shape)
# df_currency

In [37]:
# df_currency = df.groupby(["currency"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_currency = pd.DataFrame(df_currency, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_currency.shape)
# df_currency

In [38]:
# df_main_category = df.groupby(["main_category"]).count().sort_values(['name'], ascending=[False])
# df_main_category = pd.DataFrame(df_main_category, columns={'name'})
# print(df_main_category.shape)
# df_main_category

In [39]:
# df_main_category = df.groupby(["main_category"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_main_category = pd.DataFrame(df_main_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_main_category.shape)
# df_main_category

In [40]:
# df_category = df.groupby(["category"]).count().sort_values(['name'], ascending=[False])
# print(df_category.shape)
# df_category

In [41]:
# df_category = df.groupby(["category"]).sum().sort_values(['usd pledged'], ascending=[False])
# df_category = pd.DataFrame(df_category, columns={'usd pledged', 'usd_pledged_real','usd_goal_real'})
# print(df_category.shape)
# df_category

In [42]:
# df2 = pd.read_csv('data/Kickstarter_2018-01-12T10_20_09_196Z/Kickstarter048.csv')
# df2.head()