# Project 3: Predicting the Success of a Kickstarter Campaign
A supervised learning exercise featuring logistic regression, SVM, KNN, and [Others?]

In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy
import matplotlib.pyplot as plt
import os
import json

In [2]:
import sys
sys.executable

'/Users/brianmcmahon/anaconda3/envs/tensorflow1.4/bin/python'

In [3]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    try:
        return os.environ[name]
    except KeyError:
        message = "Expected environment variable '{}' not set.".format(name)
        raise Exception(message)

# the values of those depend on your setup
POSTGRES_URL = get_env_variable("POSTGRES_URL")
POSTGRES_USER = get_env_variable("POSTGRES_USER")
POSTGRES_PW = get_env_variable("POSTGRES_PW")
POSTGRES_DB = get_env_variable("POSTGRES_DB")

def to_str(obj):
    return str(obj).split(' ')[0]

def count_list_items(list):
    try:
        return len(list)
    except:
        return 0

In [4]:
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user=POSTGRES_USER,pw=POSTGRES_PW,url=POSTGRES_URL,db=POSTGRES_DB)

In [5]:
engine_var = DB_URL
engine = create_engine(engine_var)

In [6]:
# data at WebRobots https://webrobots.io/kickstarter-datasets/
csv_range = range(1, 49)

df = pd.DataFrame()
for csv in csv_range:
    csv = str(csv).zfill(3)
    path ='data/Kickstarter_2018-01-12T10_20_09_196Z/Kickstarter{}.csv'.format(csv)
    frame = pd.read_csv(path)

    frame[csv] = csv
    df = df.append(frame) # , ignore_index=True)

In [7]:
print(df.shape)
print(df.columns)
df.head()

(192716, 85)
Index(['001', '002', '003', '004', '005', '006', '007', '008', '009', '010',
       '011', '012', '013', '014', '015', '016', '017', '018', '019', '020',
       '021', '022', '023', '024', '025', '026', '027', '028', '029', '030',
       '031', '032', '033', '034', '035', '036', '037', '038', '039', '040',
       '041', '042', '043', '044', '045', '046', '047', '048', 'backers_count',
       'blurb', 'category', 'converted_pledged_amount', 'country',
       'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')


Unnamed: 0,001,002,003,004,005,006,007,008,009,010,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,1,,,,,,,,,,...,stardust-limited-edition-print-launch,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435485613,1.529099,"{""web"":{""project"":""https://www.kickstarter.com...",2061.225115,domestic
1,1,,,,,,,,,,...,duality-deck-artist-oracle-cards-and-book,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435784557,1.0968,"{""web"":{""project"":""https://www.kickstarter.com...",5442.323089,domestic
2,1,,,,,,,,,,...,rainbow-fantasy-fairy-art-postcard-pixiepocalypse,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435795239,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",5798.0,domestic
3,1,,,,,,,,,,...,day-of-the-dead-thor-calavera-vinyl-stickers,https://www.kickstarter.com/discover/categorie...,True,False,successful,1434418124,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",817.0,domestic
4,1,,,,,,,,,,...,monster-book-project,https://www.kickstarter.com/discover/categorie...,True,False,successful,1435686362,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2560.0,domestic


In [8]:
# df['category'].iloc[30000]

In [9]:
df = df[['backers_count','blurb', 'category', 'converted_pledged_amount', 'country','created_at', 
         'creator', 'currency', 'currency_symbol','currency_trailing_code', 'current_currency', 
         'deadline','disable_communication', 'friends', 'fx_rate', 'goal', 'id','is_backing', 
         'is_starrable', 'is_starred', 'launched_at', 'location','name', 'permissions', 'photo', 
         'pledged', 'profile', 'slug','source_url', 'staff_pick', 'state', 'state_changed_at',
        'static_usd_rate', 'urls', 'usd_pledged', 'usd_type']]
df = df[(df['state'] == 'successful') | (df['state'] == 'failed')]

In [10]:
# import re

df['category_main'] = [json.loads(x)['urls']['web']['discover'][:].split('/')[5] for x in df['category']]
df['category_main'] = df['category_main'].replace({'film%20&%20video': 'film_and_video'})
# rx = re.compile()
df['category_main'].unique()

array(['art', 'comics', 'crafts', 'dance', 'design', 'fashion',
       'film_and_video', 'food', 'games', 'journalism', 'music',
       'photography', 'publishing', 'technology', 'theater'], dtype=object)

In [11]:
df['category_name'] = [json.loads(x)['name'] for x in df['category']]
df['creator_name'] = [json.loads(x)['name'] for x in df['creator']]
df['blurb_len2'] = df['blurb'].str.lower().str.split()
df['blurb_length'] = df['blurb_len2'].apply(count_list_items)

In [12]:
df['created_at'] = pd.to_datetime(df['created_at'],unit='s')
df['launched_at'] = pd.to_datetime(df['launched_at'],unit='s')
df['deadline'] = pd.to_datetime(df['deadline'],unit='s')

df['created'] = pd.to_datetime(df['created_at'])
df['launched'] = pd.to_datetime(df['launched_at'])
df['deadline'] = pd.to_datetime(df['deadline'])

df['created'] = df.created.apply(lambda x: x.date())
df['launched'] = df.launched.apply(lambda x: x.date())
df['deadline'] = df.deadline.apply(lambda x: x.date())

# df.spotlight = df.spotlight.astype(int)
df.staff_pick = df.staff_pick.astype(int)


# print(df['launched'].iloc[0])

# df['created'] = df['created'].apply(lambda x: x.strftime('%Y-%m-%d'))
# df['launched'] = df['launched'].apply(lambda x: x.strftime('%Y-%m-%d'))
# df['deadline'] = df['deadline'].apply(lambda x: x.strftime('%Y-%m-%d'))

df['campaign_length'] = (df['deadline'] - df['launched']).apply(to_str)

df.shape

num_cols = ['backers_count',
            'pct_goal_achieved',
            'pledged',
            'usd_pledged',
            'goal',
            'campaign_length',
            'blurb_length',
            'fx_rate']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')
df.columns

KeyError: "['pct_goal_achieved'] not in index"

In [None]:
df['usd_goal'] = df['goal'] * (df['usd_pledged']/df['pledged'])
df['pct_goal_achieved'] = round((df['usd_pledged'] / df['usd_goal']),1)

In [None]:
df = df.sort_values(['deadline'], ascending=[False])
df.shape

In [None]:
df = df[['id','name','state','category_main','category_name','backers_count','pct_goal_achieved','usd_pledged','usd_goal','country','currency','campaign_length',
         'deadline','launched','created','staff_pick','creator_name','blurb_length']]
df = df.dropna()
print(df.shape)
print(df.columns)
df.head()

In [None]:
df.info()
# from pandas.plotting import table

# ax = plt.subplot(111, frame_on=False) # no visible frame
# ax.xaxis.set_visible(False)  # hide the x axis
# ax.yaxis.set_visible(False)  # hide the y axis

# table(ax, df)  # where df is your data frame

# plt.savefig('charts/feature_list.png')

In [None]:
df.tail()

In [None]:
df.staff_pick.sum()

In [None]:
# df.state.sum()

In [None]:
df.shape

In [None]:
df = df.drop_duplicates()
df.shape

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# df.to_pickle('data/kickstarter_data_ds2.pkl')
# try:
#     df.to_sql("kickstarter_data_ds2", engine, chunksize=20000, if_exists='replace')
# except Exception as e:
#     print(e)
#     pass

# Temporary fix:

# df.to_csv('data/kickstarter_data.csv')