# Kickstarter_survivors
### Sifan Liu

## Set up and load data

In [2]:
# Load packages and set up
import numpy as np
import pandas as pd
import json
import glob

def CustomParser(data):
    import json
    try:
        j1 = json.loads(data)
        return j1
    except ValueError:
        pass

JSONconverters = {'location':CustomParser, 'category':CustomParser, 'creator':CustomParser}


In [19]:
# read data from csv

def ReadAllfiles(path):
    all_files = glob.glob(path + "/*.csv")
    df_from_each_file = (pd.read_csv(f,converters = JSONconverters) for f in all_files)
    df = pd.concat(df_from_each_file, ignore_index = True)
    return df

path2018 = r'source/Kickstarter_2018-02-15T03_20_44_743Z'
path2017 = r'source/Kickstarter_2017-02-15T22_22_48_377Z'
path2015 = r'source/Kickstarter_2015-12-17T12_09_06_107Z'

df2015 = ReadAllfiles(path2015)
df2017 = ReadAllfiles(path2017)
df2018 = ReadAllfiles(path2018)

df = pd.concat([df2018, df2017, df2015]).drop_duplicates(["id"]).reset_index(drop=True)

In [8]:
# sample_data ------------------------------------------
# df = pd.read_csv('source/kickstarter_sample.csv', converters = JSONconverters)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212743 entries, 0 to 212742
Data columns (total 37 columns):
backers_count               212743 non-null int64
blurb                       212728 non-null object
category                    212743 non-null object
converted_pledged_amount    68480 non-null float64
country                     212743 non-null object
created_at                  212743 non-null int64
creator                     212743 non-null object
currency                    212743 non-null object
currency_symbol             212743 non-null object
currency_trailing_code      212743 non-null bool
current_currency            68480 non-null object
deadline                    212743 non-null int64
disable_communication       212743 non-null bool
friends                     63 non-null object
fx_rate                     68480 non-null float64
goal                        212743 non-null float64
id                          212743 non-null int64
is_backing                  63 non

In [20]:
# extract json information to columns
def JsonConcate(dataframe, column):
    temp = dataframe[column].apply(pd.Series)
    temp = temp.rename(columns = lambda x : column + '_' + str(x))
    dataframe = pd.concat([dataframe[:],temp[:]],axis =1)
    return dataframe

for col in ('location','category','creator'):
    df = JsonConcate(df,col)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229158 entries, 0 to 229157
Data columns (total 62 columns):
backers_count                229158 non-null int64
blurb                        229142 non-null object
category                     229158 non-null object
converted_pledged_amount     177768 non-null float64
country                      229158 non-null object
created_at                   229158 non-null int64
creator                      229158 non-null object
currency                     229158 non-null object
currency_symbol              229158 non-null object
currency_trailing_code       229158 non-null bool
current_currency             177768 non-null object
deadline                     229158 non-null int64
disable_communication        229158 non-null bool
friends                      88 non-null object
fx_rate                      177768 non-null float64
goal                         229158 non-null float64
id                           229158 non-null int64
is_backing    

In [10]:
place2msa = pd.read_csv("source/place2msa.csv", encoding = 'latin-1')
# remove 'city'|'town'|'CDP'
place2msa['place'] = place2msa['PLACENAME'].str.replace(r'(\s\w+)$','')

place2msa.sample(5)

Unnamed: 0.1,Unnamed: 0,STATE,STATEFP,PLACEFP,PLACENAME,TYPE,FUNCSTAT,cty_name,cty_alt,FIPS_City,...,cbsa_name,metro_micro,csa_name,st_name,countyFIPS,central_outlying,fips,top100,Frey52,place
1693,1694,AR,5,43100,McNeil city,Incorporated Place,A,Columbia County,,543100,...,"Magnolia, AR",Micropolitan Statistical Area,,Arkansas,27.0,Central,5027.0,0.0,0.0,McNeil
3682,3683,CO,8,41560,Kremmling town,Incorporated Place,A,Grand County,,841560,...,,,,,,,,,,Kremmling
27592,27593,OK,40,8150,Boynton town,Incorporated Place,A,Muskogee County,,4008150,...,"Muskogee, OK",Micropolitan Statistical Area,"Tulsa-Muskogee-Bartlesville, OK",Oklahoma,101.0,Central,40101.0,0.0,0.0,Boynton
11978,11979,MD,24,59450,Oxford town,Incorporated Place,A,Talbot County,,2459450,...,"Easton, MD",Micropolitan Statistical Area,"Washington-Baltimore-Arlington, DC-MD-VA-WV-PA",Maryland,41.0,Central,24041.0,0.0,0.0,Oxford
1039,1040,AZ,4,19790,Donovan Estates CDP,Census Designated Place,S,Yuma County,,419790,...,"Yuma, AZ",Metropolitan Statistical Area,,Arizona,27.0,Central,4027.0,0.0,0.0,Donovan Estates


In [None]:
## TODO

# merge population data

## Clean Kickstarter dataset

In [21]:
# convert unix time
time_cols = ['created_at', 'deadline', 'state_changed_at','launched_at']
df[time_cols] = df[time_cols].apply(pd.to_datetime,unit='s')

In [22]:
# calculate duration
df['life'] = df['deadline'] - df['launched_at']
df['duration'] = df['state_changed_at'] - df['launched_at']
df['prep'] =  df['launched_at'] - df['created_at']

# df['state_changed_at'][0].year

In [23]:
# factorize project state
df['state'] = df['state'].astype('category')
pd.pivot_table(df, index = ['state'])

Unnamed: 0_level_0,backers_count,category_color,category_id,category_parent_id,category_position,converted_pledged_amount,creator_id,currency_trailing_code,disable_communication,fx_rate,goal,id,location_id,pledged,spotlight,static_usd_rate,usd_pledged
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
canceled,16.78767,11280810.0,219.556126,11.943979,6.614188,1743.798255,1073440000.0,0.826941,0.0,1.016327,108419.388324,1078944000.0,3383121.0,2198.289056,0.0,1.022012,1880.34585
failed,15.353981,11898490.0,180.153355,12.3953,7.222112,1144.602144,1074790000.0,0.851321,0.0,1.017993,75222.466067,1075948000.0,3399049.0,1390.778084,0.0,1.024491,1226.156637
live,79.587602,9651019.0,112.798878,11.261356,6.388754,6548.369835,1064460000.0,0.770537,0.0,0.997732,67398.04759,1066265000.0,3699216.0,13099.43537,0.0,0.990838,6334.436266
successful,267.148666,11752410.0,101.929903,11.504089,7.62115,19341.140266,1073443000.0,0.888472,0.0,1.017327,10439.783884,1073539000.0,3267135.0,24030.501114,1.0,1.024136,22550.4535
suspended,72.659893,9900384.0,258.425668,12.954011,6.014973,3818.810316,1068083000.0,0.806417,1.0,1.018514,175103.094118,1120790000.0,3327024.0,6076.805604,0.0,1.013442,5704.382428


In [24]:
df['category_broad'] = df['category_slug'].str.extract(r'([^\/]+)', expand = False).astype('category')
df['category_broad'].describe()

count           229158
unique              15
top       film & video
freq             34528
Name: category_broad, dtype: object

In [25]:
# generate gender by first name
# STEP 1: strip first name
df['first_name'] = df['creator_name'].str.extract(r'([^\s]+)',expand = False).str.title()

# STEP 2: gender
# https://pypi.org/project/gender-guesser/
import gender_guesser.detector as gender
d = gender.Detector()

# STEP 3: apply
df['gender'] = df['first_name'].apply(d.get_gender)

pd.pivot_table(df, index = ['gender'],aggfunc = len)

Unnamed: 0_level_0,backers_count,blurb,category,category_broad,category_color,category_id,category_name,category_parent_id,category_position,category_slug,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
andy,2310,2310,2310,2310,2310,2310,2310,2310.0,2310,2310,...,2310,2310,2310,2310,2310,2310,2310.0,2310,2310.0,2310
female,39725,39725,39725,39725,39725,39725,39725,39725.0,39725,39725,...,39725,39725,39725,39725,39725,39725,39725.0,39725,39725.0,39725
male,102489,102489,102489,102489,102489,102489,102489,102489.0,102489,102489,...,102489,102489,102489,102489,102489,102489,102489.0,102489,102489.0,102489
mostly_female,8922,8922,8922,8922,8922,8922,8922,8922.0,8922,8922,...,8922,8922,8922,8922,8922,8922,8922.0,8922,8922.0,8922
mostly_male,9880,9880,9880,9880,9880,9880,9880,9880.0,9880,9880,...,9880,9880,9880,9880,9880,9880,9880.0,9880,9880.0,9880
unknown,65832,65832,65832,65832,65832,65832,65832,65832.0,65832,65832,...,65832,65832,65832,65832,65832,65832,65832.0,65832,65832.0,65832


In [26]:
# match place to MSA

df = pd.merge(df,place2msa,how = 'left',
              left_on=['location_state','location_localized_name'],
              right_on=['STATE','place'])


## Summary: selected columns

In [27]:
# keep selected columns
col = ['location_country','location_state','location_localized_name','cbsa_name','top100',
       'backers_count','goal','pledged','creator_name','gender','category_broad','category_name',
       'launched_at','deadline','state_changed_at','state','life','duration','prep']

df_sample = df[col]

# table = pd.pivot_table(df_sample, index=["location_state","location_country"], aggfunc = [np.mean,len])
# table.query('location_country == ["US"]')

In [3]:
# df = pd.read_pickle("master")

In [28]:
# df_sample.to_pickle("sample")
df_sample.to_pickle("master")

In [29]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253465 entries, 0 to 253464
Data columns (total 19 columns):
location_country           252446 non-null object
location_state             252396 non-null object
location_localized_name    201141 non-null object
cbsa_name                  146872 non-null object
top100                     146872 non-null float64
backers_count              253465 non-null int64
goal                       253465 non-null float64
pledged                    253465 non-null float64
creator_name               253465 non-null object
gender                     253465 non-null object
category_broad             253465 non-null category
category_name              253465 non-null object
launched_at                253465 non-null datetime64[ns]
deadline                   253465 non-null datetime64[ns]
state_changed_at           253465 non-null datetime64[ns]
state                      253465 non-null category
life                       253465 non-null timedelta64[ns]


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253465 entries, 0 to 253464
Data columns (total 89 columns):
backers_count                253465 non-null int64
blurb                        253449 non-null object
category                     253465 non-null object
converted_pledged_amount     202075 non-null float64
country                      253465 non-null object
created_at                   253465 non-null datetime64[ns]
creator                      253465 non-null object
currency                     253465 non-null object
currency_symbol              253465 non-null object
currency_trailing_code       253465 non-null bool
current_currency             202075 non-null object
deadline                     253465 non-null datetime64[ns]
disable_communication        253465 non-null bool
friends                      95 non-null object
fx_rate                      202075 non-null float64
goal                         253465 non-null float64
id                           253465 non-null in