In [1]:
# data scrape date: 2020-09-17

# extract, transform, and load the data

# first, import our essentials
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import os
import pickle
from time import time
import random
import tensorflow as tf
from time import time
import datetime
import ast

In [2]:
def seed_everything(seed=0) :
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    try :
        tf.random.set_seed(seed)
    except :
        pass

seed = 69
seed_everything(seed=seed)

In [5]:
data = pd.DataFrame()
for root,dirs,files in os.walk('./input') :
    for file in files :
        if file.endswith('.csv') :
            path = os.path.join(root,file)
            data = data.append(pd.read_csv(path))

df = data.copy()
print(df.shape)
df.head(2)

(219688, 38)


Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,61,Support great art! Join us as we re-stage the ...,"{""id"":254,""name"":""Performances"",""slug"":""dance/...",4618,US,the United States,1579292017,"{""id"":2092817311,""name"":""Brooklyn Ballet"",""slu...",USD,$,...,revisionist-history-2,https://www.kickstarter.com/discover/categorie...,True,True,successful,1583025192,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",4618.0,domestic
1,52,JinBucha is a new kind of Brewery in North Par...,"{""id"":307,""name"":""Drinks"",""slug"":""food/drinks""...",3461,US,the United States,1446051515,"{""id"":1468694331,""name"":""Jing Chen"",""slug"":""ji...",USD,$,...,jinbucha-a-modern-kombucha-tasting-room-in-nor...,https://www.kickstarter.com/discover/categorie...,False,True,failed,1450118057,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",3461.0,domestic


In [6]:
big_cats = []
unique_values = {}
for col in df.keys() :
    if is_numeric_dtype(df[col]) : continue
    if df[col].nunique() > 15 : 
        big_cats.append(col)
        continue
    unique_values[col] = df[col].unique()

print(f'{big_cats}\n')
unique_values

['blurb', 'category', 'country', 'country_displayable_name', 'creator', 'location', 'name', 'photo', 'profile', 'slug', 'source_url', 'urls']



{'currency': array(['USD', 'AUD', 'CAD', 'DKK', 'GBP', 'EUR', 'MXN', 'SEK', 'HKD',
        'NOK', 'SGD', 'JPY', 'NZD', 'CHF'], dtype=object),
 'currency_symbol': array(['$', 'kr', '£', '€', '¥', 'Fr '], dtype=object),
 'current_currency': array(['USD', 'AUD', 'CAD', 'EUR', 'NOK'], dtype=object),
 'friends': array([nan, '[]',
        '[{"id":891191024,"name":"Roman Kotiv","is_friend":true,"is_registered":null,"chosen_currency":null,"is_superbacker":null,"avatar":{"thumb":"https://ksr-ugc.imgix.net/assets/006/968/734/7c06f13b13aae2c0a1c641b5c7d81763_original.jpg?ixlib=rb-2.1.0&w=40&h=40&fit=crop&v=1461424750&auto=format&frame=1&q=92&s=cc28cbfe597926dcb69288efb94db69c","small":"https://ksr-ugc.imgix.net/assets/006/968/734/7c06f13b13aae2c0a1c641b5c7d81763_original.jpg?ixlib=rb-2.1.0&w=80&h=80&fit=crop&v=1461424750&auto=format&frame=1&q=92&s=02d246bff97c93b1f586464b698946b2","medium":"https://ksr-ugc.imgix.net/assets/006/968/734/7c06f13b13aae2c0a1c641b5c7d81763_original.jpg?ixlib=rb-2.1.0&w

In [11]:
def clean_columns(df) :
    df = df.copy()
    df['main_category'] = df['category'].apply(
        lambda x: ast.literal_eval(x)['slug'].split('/')[0]
    )

    df['successful'] = pd.Series(df['goal'] <= df['converted_pledged_amount'],dtype=np.int8)
    df.drop(columns=['goal','converted_pledged_amount'],inplace=True)

    unix_cols = ['launched_at','deadline','state_changed_at']
    for col in unix_cols :
        df[col] = pd.to_datetime(df[col],origin='unix')

    scrape_date = datetime.datetime(2020,9,17)
    df = df[df['deadline'] < scrape_date]
    df['deadline'] = (df['deadline'] - df['launched_at']).dt.days

    drop_cols = ['blurb','country','creator','currency_symbol','current_currency',
                'country_displayable_name','creator','location','name','photo',
                'profile','slug','source_url','urls','id','is_backing','is_starred',
                'static_usd_rate','usd_pledged','usd_type','category','friends','permissions']
    to_datetime_cols = ['created_at']
    to_ord_cols = []
    to_one_hot_cols = ['currency']

    df.drop(columns=drop_cols,inplace=True)

    
    
    for col in to_datetime_cols :
        df[col] = pd.to_datetime(df[col],origin='unix')
    
    for col in to_one_hot_cols:
        pd.get_dummies(df[col],prefix=col,drop_first=True)
        df.drop(columns=[col],inplace=True)

    return df

In [12]:
df_clean = clean_columns(df)

In [13]:
df_clean['main_category'].unique()

array(['dance', 'food', 'design', 'fashion', 'publishing', 'music', 'art',
       'theater', 'journalism', 'photography', 'technology',
       'film & video', 'games', 'comics', 'crafts'], dtype=object)

In [15]:
df_clean.head()

Unnamed: 0,backers_count,created_at,currency_trailing_code,deadline,disable_communication,fx_rate,is_starrable,launched_at,pledged,spotlight,staff_pick,state,state_changed_at,main_category,successful
0,61,1970-01-01 00:00:01.579292017,True,0,False,1.0,False,1970-01-01 00:00:01.580433192,4618.0,True,True,successful,1970-01-01 00:00:01.583025192,dance,1
1,52,1970-01-01 00:00:01.446051515,True,0,False,1.0,False,1970-01-01 00:00:01.447526057,3461.0,False,True,failed,1970-01-01 00:00:01.450118057,food,0
2,0,1970-01-01 00:00:01.518159717,True,0,False,1.0,False,1970-01-01 00:00:01.518208887,0.0,False,False,failed,1970-01-01 00:00:01.520233260,food,0
3,2,1970-01-01 00:00:01.454217596,True,0,False,1.0,False,1970-01-01 00:00:01.454705444,25.0,False,False,failed,1970-01-01 00:00:01.459885844,food,0
4,150,1970-01-01 00:00:01.571342290,True,0,False,1.0,False,1970-01-01 00:00:01.573236000,3275.0,True,False,successful,1970-01-01 00:00:01.574445600,design,1
