In [1]:
import os
import json
from typing import Optional, Union
import numpy as np
import pandas as pd
import seaborn as sns
import ast
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from IPython.display import display
pd.options.display.max_columns = None

In [3]:
example_csv = 'E:\KickStarter\input\Kickstarter_2020-11\kickstarter.csv'
example_table = pd.read_csv(example_csv)
example_table.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,currency_trailing_code,current_currency,deadline,disable_communication,friends,fx_rate,goal,id,is_backing,is_starrable,is_starred,launched_at,location,name,permissions,photo,pledged,profile,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,62,BFA Thesis Short Film,"{""id"":31,""name"":""Narrative Film"",""slug"":""film ...",3885,US,the United States,1574445793,"{""id"":609332928,""name"":""Valentina Hueck"",""slug...",USD,$,True,USD,1578707400,False,,1.0,3700.0,1241713622,,False,,1575475385,"{""id"":2418673,""name"":""Hatteras"",""slug"":""hatter...","pulsing orange, skinny youth",,"{""key"":""assets/027/308/662/186a2dedd2b683e311c...",3885.0,"{""id"":3858476,""project_id"":3858476,""state"":""in...",pulsing-orange-skinny-youth,https://www.kickstarter.com/discover/categorie...,True,False,successful,1578707400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",3885.0,domestic
1,0,This project is designed to help protect the e...,"{""id"":41,""name"":""Jazz"",""slug"":""music/jazz"",""po...",0,US,the United States,1276279952,"{""id"":1878580640,""name"":""Tony Copeland"",""slug""...",USD,$,True,USD,1280206740,False,,1.0,2500.0,1738971673,,False,,1276283655,"{""id"":2424766,""name"":""Houston"",""slug"":""houston...",Help Tony Copeland and get free cd's and mp3's,,"{""key"":""assets/011/263/424/3945f0b033424a7b5e2...",0.0,"{""id"":7635,""project_id"":7635,""state"":""inactive...",help-tony-copeland-and-get-free-cds-and-mp3s,https://www.kickstarter.com/discover/categorie...,False,False,failed,1280206809,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",0.0,domestic
2,20,Help send me to Washington DC where I will mak...,"{""id"":53,""name"":""Public Art"",""slug"":""art/publi...",1102,US,the United States,1352410781,"{""id"":2074957182,""name"":""Oscar Eliseo Moreno"",...",USD,$,True,USD,1355608800,False,,1.0,1100.0,627504848,,False,,1352868237,"{""id"":2514815,""name"":""Washington"",""slug"":""wash...",Public Screen Printing at the 2013 Presidentia...,,"{""key"":""assets/011/448/733/8e0c6cb320d50159d61...",1102.0,"{""id"":396147,""project_id"":396147,""state"":""inac...",public-screen-printing-at-the-2013-presidentia...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1355608832,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1102.0,domestic
3,1,Help us built a sustainable studio & eliminate...,"{""id"":362,""name"":""Makerspaces"",""slug"":""technol...",1,US,the United States,1466625193,"{""id"":279177780,""name"":""Rainforest Project_Fin...",USD,$,True,USD,1469842020,False,,1.0,25000.0,584673239,,False,,1467250090,"{""id"":59885,""name"":""Uvita"",""slug"":""uvita-hered...",Help built your magical rainsforest holiday st...,,"{""key"":""assets/012/829/735/8867d643452344060e8...",1.0,"{""id"":2570854,""project_id"":2570854,""state"":""in...",help-built-your-magical-rainsforest-holiday-stop,https://www.kickstarter.com/discover/categorie...,False,False,failed,1469842020,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1.0,domestic
4,3,"""If I paint something, I don't want to have to...","{""id"":53,""name"":""Public Art"",""slug"":""art/publi...",5,US,the United States,1549954192,"{""id"":687632789,""name"":""Bridget Ann"",""is_regis...",USD,$,True,USD,1552548146,False,,1.0,5000.0,2126450463,,False,,1549959746,"{""id"":2449323,""name"":""Memphis"",""slug"":""memphis...",B. collective: locally made fine art and gifts,,"{""key"":""assets/024/065/067/ee8e60841da38542f88...",5.0,"{""id"":3667563,""project_id"":3667563,""state"":""in...",b-collective-locally-made-fine-art-and-gifts,https://www.kickstarter.com/discover/categorie...,False,False,failed,1552548146,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",5.0,domestic


In [4]:
json.loads(example_table.urls[4])['web']['project']

'https://www.kickstarter.com/projects/687632789/b-collective-locally-made-fine-art-and-gifts?ref=discovery_category_newest'

In [5]:
def clean_df(df_raw) -> pd.DataFrame:
    df = df_raw.copy()
    df['main_category'] = df['category'].apply(
        lambda x: ast.literal_eval(x)['slug'].split('/')[0]
    )

    pledged_amount = df.pop('converted_pledged_amount')

    # launched_at and 'state_changed_at' only has 1 date: 1970, which is clearly false
    
    unix_cols = ['deadline']
    for col in unix_cols :
        df[col] = pd.to_datetime(df[col],origin='unix')
        df[col] = df[col].dt.date
    
    # we only want to look at kickstarters that have finished
    df = df[df.state.isin(['successful', 'failed'])]
    
    # target column
    target = df['state'].apply(
        lambda x: 1 if x == 'successful' else 0
    ).astype(np.int)
    assert target.nunique() == 2, f'{target.unique()}'
    
    drop_cols = ['blurb','country','creator','currency_symbol','current_currency',
                'country_displayable_name','creator', 'id', 'launched_at', 'location','photo',
                'pledged', 'profile','slug','source_url','urls','is_backing','is_starred', 'state',
                'static_usd_rate','usd_pledged','usd_type','category','friends',
                 'permissions','created_at', 'currency_trailing_code', 'deadline', 'state_changed_at']
    to_datetime_cols = []
    to_ord_cols = []
    to_one_hot_cols = ['main_category']
    # to_one_hot_cols = ['currency']
    
    try:
        df.drop(columns=drop_cols,inplace=True)
    except:
        for col in drop_cols:
            keys = df.keys()
            if col in keys: df.drop(columns=[col], inplace=True)
    
    for col in to_datetime_cols :
        df[col] = pd.to_datetime(df[col],origin='unix', format='%Y%m%d')
    
    for col in to_one_hot_cols:
        one_hot = pd.get_dummies(df[col],prefix=col,drop_first=True)
        df = df.join(one_hot)
    df.drop(columns=to_one_hot_cols,inplace=True)
    
    to_int_cols = ['disable_communication','is_starrable','spotlight','staff_pick']
    for col in to_int_cols:
        df[col] = df[col].astype(np.int)
    
    to_ord_cols = ['currency']
    for col in to_ord_cols:
        df[col] = df[col].astype("category").cat.codes

    mid = df['name']
    df.drop(labels=['name'], axis=1, inplace=True)
    df.insert(0, 'name', mid)
    
    df['target'] = target
    df['continuous_target'] = pledged_amount
    
    return df

In [6]:
def join_csv(search_path, max_tables:Optional[int]=None) -> pd.DataFrame:
    out_table = pd.DataFrame()
    if max_tables is None: max_tables = np.inf
    num_tables = 0
    for root, dirs, files in os.walk(search_path, topdown=True):
        dirs.sort(reverse=True)
        for name in files:
            if num_tables > max_tables : 
                return out_table.drop_duplicates(subset='name', keep='first')
            table_path = os.path.join(root, name)
            current_table = pd.read_csv(table_path)
            current_table = clean_df(current_table)
            
            if out_table.shape[0] == 0: 
                out_table = current_table
                continue
            
            out_table = out_table.merge(current_table, how='inner')
            
            num_tables += 1
    return out_table.drop_duplicates(subset='name', keep='first')

In [7]:
fhand = 'E:\KickStarter\input'
df = join_csv(fhand, max_tables=4)
target = df.pop('target')
continuous_target = df.pop('continuous_target')



In [8]:
assert df.isna().sum().sum() == 0
df.shape[0]

14

In [9]:
df.head()

Unnamed: 0,name,backers_count,currency,disable_communication,fx_rate,goal,is_starrable,spotlight,staff_pick,main_category_comics,main_category_crafts,main_category_dance,main_category_design,main_category_fashion,main_category_film & video,main_category_food,main_category_games,main_category_journalism,main_category_music,main_category_photography,main_category_publishing,main_category_technology,main_category_theater
0,"MY BROKEN CAMERA - ""SELF (i.e.) PORTRAIT""- Pho...",64,13,0,1.0,3000.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,"Pole Diversity: A Showcase of Pole, Aerial, an...",6,13,0,1.0,200.0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,Jeff City Outsider Magazine,12,13,0,1.0,5000.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,"Shut UP, Astoria",129,13,0,1.0,16000.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,Wild Boars,41,5,0,1.326295,1500.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [10]:
# this data is not good enough. We need to build our own scraper
# get the url's from the data

In [11]:
def clean_url(df_raw):
    df = df_raw.copy()
    
    # filter all live projects out
    df = df[df['state'].isin(['successful', 'failed'])]
    
    df['site'] = df['urls'].apply(
        lambda x: json.loads(x)['web']['project']
    )
    
    df['successful'] = df.state.apply(
        lambda x: 1 if x == 'successful' else 0 
    ).astype(np.int)
    
    df = df[['site', 'successful']]
    return df

In [12]:
def join_url(search_path, max_tables:Optional[int]=None) -> pd.DataFrame:
    out_table = pd.DataFrame()
    if max_tables is None: max_tables = np.inf
    num_tables = 1
    num_without_urls = 0
    for root, dirs, files in os.walk(search_path, topdown=True):
        dirs.sort(reverse=True)
        for name in files:
            if num_tables > max_tables : 
                print(num_without_urls)
                return out_table.drop_duplicates(subset='site')
            table_path = os.path.join(root, name)
            print(f'Joining {table_path}')
            current_table = pd.read_csv(table_path)
            keys = current_table.keys()
            if 'state' not in keys or 'urls' not in keys: 
                print('\terror in table. Skipping...')
                num_without_urls += 1
                continue
            #print(current_table.shape[0])
            current_table = clean_url(current_table)
            #print(current_table.shape[0])
            if out_table.shape[0] == 0: 
                out_table = current_table
                continue
            
            out_table = out_table.append(current_table)
            
            num_tables += 1
    print(num_without_urls)
    return out_table.drop_duplicates(subset='site')

In [13]:
df = join_url(fhand, max_tables=None)

Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter001.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter002.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2020-11\Ki

Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2020-09\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2020-09

Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2020-07\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2020-07

Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2020-05\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2020-05

Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2020-03\Kickstarter031.csv
Joining E:\KickStarter\input\Kickstarter_2020-03

Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter031.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter032.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter033.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter034.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter035.csv
Joining E:\KickStarter\input\Kickstarter_2020-01\Kickstarter036.csv
Joining E:\KickStarter\input\Kickstarter_2020-01

Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter031.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter032.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter033.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter034.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter035.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter036.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter037.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter038.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter039.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter040.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter041.csv
Joining E:\KickStarter\input\Kickstarter_2019-11\Kickstarter042.csv
Joining E:\KickStarter\input\Kickstarter_2019-11

Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter037.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter038.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter039.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter040.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter041.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter042.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter043.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter044.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter045.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter046.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter047.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter048.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter049.csv
Joining E:\KickStarter\input\Kickstarter_2019-09\Kickstarter050.csv
Joining E:\KickStarter\input\Kickstarter_2019-09

Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter044.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter045.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter046.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter047.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter048.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter049.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter050.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter051.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter052.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter053.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter054.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter055.csv
Joining E:\KickStarter\input\Kickstarter_2019-07\Kickstarter056.csv
Joining E:\KickStarter\input\Kickstarter_2019-06\Kickstarter.csv
Joining E:\KickStarter\input\Kickstarter_2019-06\Ki

Joining E:\KickStarter\input\Kickstarter_2019-05\Kickstarter051.csv
Joining E:\KickStarter\input\Kickstarter_2019-05\Kickstarter052.csv
Joining E:\KickStarter\input\Kickstarter_2019-05\Kickstarter053.csv
Joining E:\KickStarter\input\Kickstarter_2019-05\Kickstarter054.csv
Joining E:\KickStarter\input\Kickstarter_2019-05\Kickstarter055.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter001.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter002.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2019-04\Ki

Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2019-02\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2019-02

Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2018-12\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2018-12

Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter031.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter032.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter033.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter034.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter035.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter036.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter037.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter038.csv
Joining E:\KickStarter\input\Kickstarter_2018-10\Kickstarter039.csv
Joining E:\KickStarter\input\Kickstarter_2018-10

Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter042.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter043.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter044.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter045.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter046.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter047.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter048.csv
Joining E:\KickStarter\input\Kickstarter_2018-08\Kickstarter049.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter001.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter002.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2018-07\Ki

Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2018-05\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2018-05

Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter035.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter036.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter037.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter038.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter039.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter040.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter041.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter042.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter043.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter044.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter045.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter046.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter047.csv
Joining E:\KickStarter\input\Kickstarter_2018-03\Kickstarter048.csv
Joining E:\KickStarter\input\Kickstarter_2018-03

Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2017-12\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2017-12

Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter001.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter002.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2017-09\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2017-09

Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter041.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter042.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter043.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter044.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter045.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter046.csv
Joining E:\KickStarter\input\Kickstarter_2017-07\Kickstarter047.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter001.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter002.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter003.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter004.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter005.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Kickstarter006.csv
Joining E:\KickStarter\input\Kickstarter_2017-06\Ki

Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter031.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter032.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter033.csv
Joining E:\KickStarter\input\Kickstarter_2017-04\Kickstarter034.csv
Joining E:\KickStarter\input\Kickstarter_2017-04

Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2017-01\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2017-01

Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2016-10\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2016-10

Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter007.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter008.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter009.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter010.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter011.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter012.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter013.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter014.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter015.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter016.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2016-07\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2016-07

Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter017.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter018.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter019.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter020.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter021.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter022.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter023.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter024.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter025.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter026.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter027.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter028.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter029.csv
Joining E:\KickStarter\input\Kickstarter_2016-04\Kickstarter030.csv
Joining E:\KickStarter\input\Kickstarter_2016-04

In [14]:
print(f'url count: {df.shape[0]}\n')
df.head()

url count: 663996



Unnamed: 0,site,successful
0,https://www.kickstarter.com/projects/valentina...,1
1,https://www.kickstarter.com/projects/GreenThin...,0
2,https://www.kickstarter.com/projects/oscabilly...,1
3,https://www.kickstarter.com/projects/279177780...,0
4,https://www.kickstarter.com/projects/687632789...,0


In [15]:
df.site.nunique()

663996

In [21]:
df.site = df.site.astype(str).str.replace("(\?ref=discovery_category)+.*$", "/")
df['dollars_raised'] = continuous_target.astype(np.int)

In [22]:
df.to_csv('E:\KickStarter\Kickstarter_urls.csv', index=False)

In [23]:
df.site.iloc[69]

'https://www.kickstarter.com/projects/1431798382/portenas-empanadas-and-yerba-mate-bar/'

In [24]:
df.head()

Unnamed: 0,site,successful,dollars_raised
0,https://www.kickstarter.com/projects/valentina...,1,4185.0
1,https://www.kickstarter.com/projects/GreenThin...,0,235.0
2,https://www.kickstarter.com/projects/oscabilly...,1,351.0
3,https://www.kickstarter.com/projects/279177780...,0,16636.0
4,https://www.kickstarter.com/projects/687632789...,0,3128.0
