# Kickstarter_survivors
### Sifan Liu

## Set up and load data

In [1]:
# Load packages and set up
import numpy as np
import pandas as pd
import json
import glob

def CustomParser(data):
    import json
    try:
        j1 = json.loads(data)
        return j1
    except ValueError:
        pass

JSONconverters = {'location':CustomParser, 'category':CustomParser, 'creator':CustomParser}


In [2]:
# read data

# from csv =============================================
# all files --------------------------------------------
# path = r'../datasample/Kickstarter_2018-02-15T03_20_44_743Z'
# all_files = glob.glob(path + "/*.csv")

# df_from_each_file = (pd.read_csv(f,converters = JSONconverters) for f in all_files)
# df = pd.concat(df_from_each_file, ignore_index = True)

In [3]:
# sample_data ------------------------------------------
df = pd.read_csv('source/kickstarter_sample.csv', converters = JSONconverters)

In [4]:
# extract json information to columns
def JsonConcate(dataframe, column):
    temp = dataframe[column].apply(pd.Series)
    temp = temp.rename(columns = lambda x : column + '_' + str(x))
    dataframe = pd.concat([dataframe[:],temp[:]],axis =1)
    return dataframe

for col in ('location','category','creator'):
    df = JsonConcate(df,col)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4090 entries, 0 to 4089
Data columns (total 62 columns):
backers_count                4090 non-null int64
blurb                        4088 non-null object
category                     4090 non-null object
converted_pledged_amount     4090 non-null int64
country                      4090 non-null object
created_at                   4090 non-null int64
creator                      4090 non-null object
currency                     4090 non-null object
currency_symbol              4090 non-null object
currency_trailing_code       4090 non-null bool
current_currency             4090 non-null object
deadline                     4090 non-null int64
disable_communication        4090 non-null bool
fx_rate                      4090 non-null float64
goal                         4090 non-null float64
id                           4090 non-null int64
is_starrable                 4090 non-null bool
launched_at                  4090 non-null int64
nam

In [48]:
# df.to_pickle("master")
df = pd.read_pickle("master")

In [5]:
place2msa = pd.read_csv("source/place2msa.csv", encoding = 'latin-1')
place2msa.head()

Unnamed: 0.1,Unnamed: 0,STATE,STATEFP,PLACEFP,PLACENAME,TYPE,FUNCSTAT,cty_name,cty_alt,FIPS_City,cbsa,cbsa_name,metro_micro,csa_name,st_name,countyFIPS,central_outlying,fips,top100,Frey52
0,1,AL,1,100,Abanda CDP,Census Designated Place,S,Chambers County,,100100,46740.0,"Valley, AL",Micropolitan Statistical Area,"Columbus-Auburn-Opelika, GA-AL",Alabama,17.0,Central,1017.0,0.0,0.0
1,2,AL,1,124,Abbeville city,Incorporated Place,A,Henry County,,100124,20020.0,"Dothan, AL",Metropolitan Statistical Area,"Dothan-Enterprise-Ozark, AL",Alabama,67.0,Outlying,1067.0,0.0,0.0
2,3,AL,1,460,Adamsville city,Incorporated Place,A,Jefferson County,,100460,13820.0,"Birmingham-Hoover, AL",Metropolitan Statistical Area,"Birmingham-Hoover-Talladega, AL",Alabama,73.0,Central,1073.0,1.0,1.0
3,4,AL,1,484,Addison town,Incorporated Place,A,Winston County,,100484,,,,,,,,,,
4,5,AL,1,676,Akron town,Incorporated Place,A,Hale County,,100676,46220.0,"Tuscaloosa, AL",Metropolitan Statistical Area,,Alabama,65.0,Outlying,1065.0,0.0,0.0


## Clean Kickstarter dataset

In [49]:
# convert unix time
time_cols = ['created_at', 'deadline', 'state_changed_at','launched_at']
df[time_cols] = df[time_cols].apply(pd.to_datetime,unit='s')

In [50]:
# calculate duration
df['life'] = df['deadline'] - df['launched_at']
df['duration'] = df['state_changed_at'] - df['launched_at']
df['prep'] =  df['launched_at'] - df['created_at']

# df['state_changed_at'][0].year

In [51]:
# factorize project state
df['state'] = df['state'].astype('category')
pd.pivot_table(df, index = ['state'])

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [27]:
# generate gender by first name
# STEP 1: strip first name
df['first_name'] = df['creator_name'].str.extract(r'([^\s]+)',expand = False).str.title()

# STEP 2: gender
# https://pypi.org/project/gender-guesser/
import gender_guesser.detector as gender
d = gender.Detector()

# STEP 3: apply
df['gender'] = df['first_name'].apply(d.get_gender)

pd.pivot_table(df, index = ['gender'],aggfunc = len)

In [42]:

# match place to MSA
# remove 'city'|'town'|'CDP'

place2msa['place'] = place2msa['PLACENAME'].str.replace(r'(\s\w+)$','')
place2msa['place']

df = pd.merge(df,place2msa,how = 'left',left_on=['location_state','location_localized_name'],right_on=['STATE','place'])


## Summary: selected columns

In [45]:
# keep selected columns
col = ['location_country','location_state','location_localized_name','cbsa_name','top100',
       'backers_count','goal','pledged','creator_name','gender','category_slug','category_name',
       'launched_at','deadline','state_changed_at','state','life','duration','prep']

df_sample = df[col]
df_sample
# table = pd.pivot_table(df_sample, index=["location_state","location_country"], aggfunc = [np.mean,len])
# table.query('location_country == ["US"]')

Unnamed: 0,location_country,location_state,location_localized_name,cbsa_name,top100,backers_count,goal,pledged,creator_name,gender,category_slug,category_name,launched_at,deadline,state_changed_at,state,life,duration,prep
0,US,CO,Paonia,,,80,2800.0,3596.00,Hilary Emerson Lay,female,art/mixed media,Mixed Media,2012-03-28 21:14:20,2012-04-19 20:16:00,2012-04-19 20:16:00,successful,21 days 23:01:40,21 days 23:01:40,80 days 21:19:15
1,US,CA,Bakersfield,"Bakersfield, CA",1.0,47,3900.0,4117.00,Csub Arts Humanities Matter,unknown,art,Art,2012-03-28 22:06:38,2012-04-20 22:06:38,2012-04-20 22:06:38,successful,23 days 00:00:00,23 days 00:00:00,25 days 23:37:16
2,US,MA,Lowell,"Boston-Cambridge-Newton, MA-NH",1.0,80,750.0,3125.00,J.J. Long,unknown,art/painting,Painting,2012-03-28 23:01:19,2012-04-17 03:59:00,2012-04-17 03:59:03,successful,19 days 04:57:41,19 days 04:57:44,4 days 08:45:12
3,US,MA,Lowell,"Boston-Cambridge-Newton, MA-NH",1.0,80,750.0,3125.00,J.J. Long,unknown,art/painting,Painting,2012-03-28 23:01:19,2012-04-17 03:59:00,2012-04-17 03:59:03,successful,19 days 04:57:41,19 days 04:57:44,4 days 08:45:12
4,US,DC,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",1.0,82,4500.0,4586.00,Andrew Purchin,male,art/public art,Public Art,2012-03-29 01:22:25,2012-05-08 01:22:25,2012-05-08 01:22:25,successful,40 days 00:00:00,40 days 00:00:00,23 days 19:41:01
5,US,OR,Portland,"Portland-Vancouver-Hillsboro, OR-WA",1.0,31,1000.0,1036.00,Ryan Jacob Smith,mostly_male,art/painting,Painting,2012-03-29 02:57:23,2012-04-03 02:57:23,2012-04-03 02:57:23,successful,5 days 00:00:00,5 days 00:00:00,5 days 22:33:56
6,US,TX,Frisco,"Dallas-Fort Worth-Arlington, TX",1.0,21,5000.0,5217.00,Roger Belveal,male,art/sculpture,Sculpture,2012-03-29 03:28:59,2012-05-28 03:28:59,2012-05-28 03:28:59,successful,60 days 00:00:00,60 days 00:00:00,7 days 01:07:25
7,GB,Scotland,Glasgow,,,37,6500.0,7160.00,Redmond Entwistle,male,art,Art,2012-03-29 03:28:16,2012-04-19 03:28:16,2012-04-19 03:28:17,successful,21 days 00:00:00,21 days 00:00:01,7 days 12:20:45
8,US,TX,Kingsbury,"San Antonio-New Braunfels, TX",1.0,153,15000.0,15445.00,Shane Heinemeier,male,art/public art,Public Art,2012-03-29 03:37:20,2012-04-28 03:37:20,2012-04-28 03:37:22,successful,30 days 00:00:00,30 days 00:00:02,51 days 06:27:03
9,US,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",1.0,18,600.0,2190.00,Brian Dupont,male,art/painting,Painting,2012-03-29 10:44:16,2012-05-04 00:12:00,2012-05-04 00:12:01,successful,35 days 13:27:44,35 days 13:27:45,20 days 13:55:53


In [46]:
# df_sample.to_pickle("sample")