In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime

In [2]:
# Read the 56 numbered CSV parts (Kickstarter000.csv ... Kickstarter055.csv)
# into a list of data frames, one frame per file.
df_lst = [
    pd.read_csv('./data/Kickstarter{:03d}.csv'.format(part_no))
    for part_no in range(56)
]

In [3]:
# Stack all CSV parts into a single frame; ignore_index=True gives the result
# a fresh 0..n-1 row index instead of repeating each part's own index.
df = pd.concat(df_lst, ignore_index=True)

In [4]:
# Summarize the combined raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 37 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             209222 non-null  int64  
 1   blurb                     209214 non-null  object 
 2   category                  209222 non-null  object 
 3   converted_pledged_amount  209222 non-null  int64  
 4   country                   209222 non-null  object 
 5   created_at                209222 non-null  int64  
 6   creator                   209222 non-null  object 
 7   currency                  209222 non-null  object 
 8   currency_symbol           209222 non-null  object 
 9   currency_trailing_code    209222 non-null  bool   
 10  current_currency          209222 non-null  object 
 11  deadline                  209222 non-null  int64  
 12  disable_communication     209222 non-null  bool   
 13  friends                   300 non-null     o

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,creator,currency,currency_symbol,currency_trailing_code,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,21,2006 was almost 7 years ago.... Can you believ...,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",802,US,1387659690,"{""id"":1495925645,""name"":""Daniel"",""is_registere...",USD,$,True,...,new-final-round-album,https://www.kickstarter.com/discover/categorie...,True,False,successful,1391899046,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",802.0,international
1,97,An adorable fantasy enamel pin series of princ...,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2259,US,1549659768,"{""id"":1175589980,""name"":""Katherine"",""slug"":""fr...",USD,$,True,...,princess-pals-enamel-pin-series,https://www.kickstarter.com/discover/categorie...,True,False,successful,1551801611,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2259.0,international
2,88,Helping a community come together to set the s...,"{""id"":280,""name"":""Photobooks"",""slug"":""photogra...",29638,US,1477242384,"{""id"":1196856269,""name"":""MelissaThomas"",""is_re...",USD,$,True,...,their-life-through-their-lens-the-amish-and-me...,https://www.kickstarter.com/discover/categorie...,True,True,successful,1480607932,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",29638.0,international
3,193,Every revolution starts from the bottom and we...,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",49158,IT,1540369920,"{""id"":1569700626,""name"":""WAO"",""slug"":""wearewao...",EUR,€,False,...,wao-the-eco-effect-shoes,https://www.kickstarter.com/discover/categorie...,True,False,successful,1544309940,1.136525,"{""web"":{""project"":""https://www.kickstarter.com...",49075.15252,international
4,20,Learn to build 10+ Applications in this comple...,"{""id"":51,""name"":""Software"",""slug"":""technology/...",549,US,1425706517,"{""id"":1870845385,""name"":""Kalpit Jain"",""is_regi...",USD,$,True,...,apple-watch-development-course,https://www.kickstarter.com/discover/categorie...,False,False,failed,1428511019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",549.0,domestic


# Data exploration

## Feature overview

| Column | Description | Dtype | Null | EDA status |
| - | - | - | - | - |
| `backers_count` | The number of participants of the kickstarter project | | | Done |
| `blurb` | Short description of the project by the project initiator. Max 135 characters. | | | Done | 
| `category` | Dictionary object.<br>Contains `id, name, slug, position, parent_id, color, urls` | | | Open |
| `converted_pledged_amount` | Open | | | Open |
| `country` | Country of the Kickstarter account (?) | | | To be confirmed |
| `created_at`| Creation date of the project. UNIX time format | | | Done |
| `creator`| Dictionary of user details `id, name, is_registered, chosen_currency, ...` | | | Open to list all dictionary entries|
| `currency`| Currency (3-letter code) in which the project is funded | | | Done |
| `currency_symbol`| Symbol related to `currency` feature. | | | Done |
| `currency_trailing_code`| | Bool | | Open |
| `current_currency`| Open | | | Open |
| `deadline`| Deadline of the project. UNIX time format | | | |
| `disable_communication`| Is the communication with the creator allowed? | Bool | | Done |
| `friends`| 300 entries with empty list object | | | See '300 question' |
| `fx_rate`| Open | | | Open. Currency topic. |
| `goal`| Target amount | | | By how much do projects deviate from the target? (successful vs. failed) |
| `id`| Internal Kickstarter id | | | See '182-anomalia' |
| `is_backing`| 300 instances of 'False' | | | See '300 question' |
| `is_starrable`| Open | Bool | None | Open |
| `is_starred`| Open | | 298 False, 2 True | See '300 question' |
| `launched_at`| Date of publication on kickstarter | int64 | None | Done |
| `location`| Dictionary of location details | String | 208996 filled | Open |
| `name`| Project name. Limited to 60 chars. | String | 209222 filled | Open |
| `permissions`| 300 entries with empty list object ||| See '300 question' |
| `photo`| Dictionary of photo url, ...  | String | | See '182-anomalia' |
| `pledged`|||||
| `profile`|||||
| `slug`|||||
| `source_url`|||||
| `spotlight`|||||
| `staff_pick`|||||
| `state`|||||
| `state_changed_at`|||||
| `static_usd_rate`|||||
| `urls`|||||
| `usd_pledged`|||||
| `usd_type` |||||

## JSON objects

In [4]:
def get_json_cols(data):
    """Return the names of the object-dtype columns of `data` that hold JSON objects.

    A column qualifies when at least one of its values, converted to a
    string, contains the substring `{"`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list of str
        Names of the matching columns, in the column order of `data`.
    """
    # Inspect the frame that was passed in, not a notebook-global variable.
    cols_object = data.select_dtypes(include='object').columns
    # regex=False: `{"` is meant as a literal substring, not as a pattern.
    return [
        x for x in cols_object
        if data[x].astype(str).str.contains('{"', regex=False).any()
    ]

In [5]:
def create_dicts_from_json(data, cols_json):
    """Parse the JSON strings of the given columns into Python objects.

    Prints one progress line per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the JSON columns.
    cols_json : iterable of str
        Names of the columns of `data` to parse.

    Returns
    -------
    pandas.DataFrame
        One object column per entry of `cols_json`, on a fresh 0..n-1 index.
        Values that cannot be parsed (non-strings such as NaN, or malformed
        JSON) are replaced by an empty dict.
    """
    data_dicts = pd.DataFrame()
    for col in cols_json:
        print('Parsing json in: '+col)
        parsed = []
        for val in data[col]:
            try:
                parsed.append(json.loads(val))
            # json.loads raises TypeError for non-string input and
            # JSONDecodeError (a ValueError) for malformed text. Catching only
            # these keeps KeyboardInterrupt able to stop this long loop.
            except (TypeError, ValueError):
                parsed.append(dict())
        # dtype=object keeps one parsed value per row, whatever its shape.
        data_dicts[col] = pd.Series(parsed, dtype=object)
    return data_dicts

In [6]:
def create_cols_from_dicts(data_dicts):
    """Flatten every dict column of `data_dicts` into one column per key.

    Each column is passed through `pd.json_normalize`, and the resulting
    column names are prefixed with the source column name and an underscore
    (e.g. column `category` yields `category_id`). Prints one progress line
    per column.

    Parameters
    ----------
    data_dicts : pandas.DataFrame
        Frame whose columns hold dicts.

    Returns
    -------
    pandas.DataFrame
        All flattened columns side by side. If `data_dicts` has no columns,
        a frame without columns on the index of `data_dicts` is returned.
    """
    # pd.concat raises ValueError on an empty list, so handle the
    # "nothing to expand" case explicitly.
    if len(data_dicts.columns) == 0:
        return pd.DataFrame(index=data_dicts.index)
    data_expanded = []
    for col in data_dicts.columns:
        print('Expanding: '+col)
        data_expanded.append(pd.json_normalize(data_dicts[col]).add_prefix(col+'_'))
    return pd.concat(data_expanded, axis=1)

In [28]:
def save_dataframe(data, file_name=None):
    """Pickle `data` to disk and print the destination path.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to store (anything offering `to_pickle` works).
    file_name : str, optional
        Destination path, used exactly as given. If omitted or empty, a
        timestamped name `./data_frame_<YYYY-MM-DD_HHMMSS>.pickle` is used.

    Returns
    -------
    None
    """
    if file_name:
        target = file_name
    else:
        # No name supplied: fall back to a timestamped file in the current
        # working directory.
        stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        target = f"./data_frame_{stamp}.pickle"
    print('Saving: '+target)
    data.to_pickle(target)

In [8]:
def expand_json(data, save=True):
    """Replace the JSON columns of `data` by one column per JSON key.

    The JSON columns are detected with `get_json_cols`, parsed with
    `create_dicts_from_json` and flattened with `create_cols_from_dicts`.
    Progress messages are printed along the way.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with JSON strings in some of its columns. It is not modified.
    save : bool, default True
        If true, the result is also pickled via `save_dataframe` under a
        timestamped default file name.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns of `data` followed by the flattened JSON columns.
        If no JSON column is found, a copy of `data`.
    """
    cols_json = get_json_cols(data)
    if cols_json:
        print('---------- Parsing json ------------')
        data_dicts = create_dicts_from_json(data, cols_json)
        print('---------- Expanding dictionaries --------')
        data_expanded = create_cols_from_dicts(data_dicts)
        print('---------- Merge to final data frame ------')
        # The expanded columns are built row by row on a fresh 0..n-1 index.
        # Re-attach the caller's index so that concat(axis=1) pairs rows by
        # position even when `data` is not 0..n-1 indexed.
        data_expanded.index = data.index
        data_merged = pd.concat([data.drop(cols_json, axis=1), data_expanded], axis=1)
    else:
        # Nothing to expand; concatenating zero expanded frames would fail.
        data_merged = data.copy()
    if save:
        print('---------- Saving to pickle ------')
        # save_dataframe needs its file_name argument; None selects the
        # timestamped default name.
        save_dataframe(data_merged, None)
    return data_merged

In [9]:
# Expand every JSON column of the raw frame into flat columns. With the
# default save=True the expanded frame is also pickled to disk.
df_new = expand_json(df)

---------- Parsing json ------------
Parsing json in: category
Parsing json in: creator
Parsing json in: location
Parsing json in: photo
Parsing json in: profile
Parsing json in: urls
---------- Expanding dictionaries --------
Expanding: category
Expanding: creator
Expanding: location
Expanding: photo
Expanding: profile
Expanding: urls
---------- Merge to final data frame ------
---------- Saving to pickle ------
Saving: ./data_frame_2021-03-11_200154.pickle


In [15]:
# Summarize the expanded frame: column names, non-null counts and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 98 columns):
 #   Column                                                        Non-Null Count   Dtype  
---  ------                                                        --------------   -----  
 0   backers_count                                                 209222 non-null  int64  
 1   blurb                                                         209214 non-null  object 
 2   converted_pledged_amount                                      209222 non-null  int64  
 3   country                                                       209222 non-null  object 
 4   created_at                                                    209222 non-null  int64  
 5   currency                                                      209222 non-null  object 
 6   currency_symbol                                               209222 non-null  object 
 7   currency_trailing_code                                  

In [11]:
# Campaign length in days, rounded to two decimals. `deadline` and
# `launched_at` are UNIX timestamps, so their difference is in seconds.
seconds_per_day = 3600*24
df_new['duration'] = (
    (df_new['deadline'] - df_new['launched_at']) / seconds_per_day
).round(2)

In [12]:
# Goal scaled by the static USD rate, rounded to two decimals.
df_new['goal_usd'] = df_new['goal'].mul(df_new['static_usd_rate']).round(2)

In [13]:
# Spot-check the first computed durations (in days).
df_new['duration'].head()

0    45.00
1    20.00
2    30.04
3    41.96
4    30.00
Name: duration, dtype: float64

In [14]:
# Integer position (usable with .iloc) of the row with the largest goal_usd.
df_new['goal_usd'].argmax()

60060

In [36]:
# Project URL of the row with the largest goal_usd. The position is computed
# here rather than typed in by hand, so the cell stays correct if the data
# or its row order changes.
df_new['urls_web.project'].iloc[df_new['goal_usd'].argmax()]

'https://www.kickstarter.com/projects/245190432/a-celtic-lovestory?ref=discovery_category_newest'

In [37]:
# Print every field of the project with the largest goal_usd, one
# name/value pair per paragraph. The row position is computed rather than
# hard-coded, so the cell stays correct if the data or its row order changes.
top_goal_row = df_new.iloc[df_new['goal_usd'].argmax()]
for field, value in top_goal_row.items():
    print(field)
    print(value)
    print()

backers_count
0

blurb
A 2000 year old "Romeo & Juliet" love story, set amidst the dramatic changes that ripped through Celtic Britain when Rome invaded...

converted_pledged_amount
0

country
GB

created_at
1446156106

currency
GBP

currency_symbol
£

currency_trailing_code
False

current_currency
USD

deadline
1448920860

disable_communication
False

friends
nan

fx_rate
1.32567974

goal
100000000.0

id
2000749004

is_backing
nan

is_starrable
False

is_starred
nan

launched_at
1447717635

name
A Celtic Lovestory

permissions
nan

pledged
0.0

slug
a-celtic-lovestory

source_url
https://www.kickstarter.com/discover/categories/film%20&%20video/drama

spotlight
False

staff_pick
False

state
failed

state_changed_at
1448920860

static_usd_rate
1.52350076

usd_pledged
0.0

usd_type
international

category_id
293

category_name
Drama

category_slug
film & video/drama

category_position
5

category_parent_id
11.0

category_color
16734574

category_urls.web.discover
http://www.kickstarter.

In [38]:
# List the distinct values of location_type (missing values show up as nan).
df_new['location_type'].unique()

array(['Town', 'Suburb', 'County', 'LocalAdmin', 'Estate', 'Zip',
       'Country', nan, 'Island', 'Miscellaneous'], dtype=object)

In [16]:
# Turn the UNIX launch time (seconds) into a timestamp column, then pull the
# calendar year and month out of it via the .dt accessor.
launched_ts = pd.to_datetime(df_new['launched_at'], unit='s')
df_new['launched_at_full'] = launched_ts
df_new['launched_at_year'] = df_new['launched_at_full'].dt.year
df_new['launched_at_month'] = df_new['launched_at_full'].dt.month

In [31]:
# Pickle the full expanded frame under an explicit file name.
# NOTE(review): the name carries no '.pickle' extension, unlike the
# timestamped default name generated by save_dataframe.
save_dataframe(df_new, './data_frame_full_2021-03-11_200900')

Saving: ./data_frame_full_2021-03-11_200900


In [32]:
# Columns kept for the EDA subset, grouped by origin.
survival_lst = [
    # columns taken over from the raw data
    'blurb',
    'country',
    'created_at',
    'currency',
    'deadline',
    'disable_communication',
    'goal',
    'launched_at',
    'name',
    'staff_pick',
    'state',
    'usd_pledged',
    'usd_type',
    # columns expanded from the JSON objects
    'category_id',
    'category_name',
    'category_slug',
    'category_parent_id',
    'location_id',
    'location_name',
    'location_type',
    'photo_key',
    'photo_full',
    # derived columns
    'duration',
    'goal_usd',
    'launched_at_full',
    'launched_at_year',
    'launched_at_month',
]

In [33]:
# Reduce the expanded frame to the columns selected for the EDA.
df_eda = df_new.loc[:, survival_lst]

In [34]:
# Pickle the reduced EDA frame under an explicit file name.
# NOTE(review): the name carries no '.pickle' extension, unlike the
# timestamped default name generated by save_dataframe.
save_dataframe(df_eda, './data_frame_small_2021-03-11_200900')

Saving: ./data_frame_small_2021-03-11_200900


In [35]:
# Preview the first rows of the reduced EDA frame.
df_eda.head()

Unnamed: 0,blurb,country,created_at,currency,deadline,disable_communication,goal,launched_at,name,staff_pick,...,location_id,location_name,location_type,photo_key,photo_full,duration,goal_usd,launched_at_full,launched_at_year,launched_at_month
0,2006 was almost 7 years ago.... Can you believ...,US,1387659690,USD,1391899046,False,200.0,1388011046,New Final Round Album,False,...,2379574.0,Chicago,Town,assets/011/625/534/5bea1760d7f20943c4cd5e9b491...,https://ksr-ugc.imgix.net/assets/011/625/534/5...,45.0,200.0,2013-12-25 22:37:26,2013,12
1,An adorable fantasy enamel pin series of princ...,US,1549659768,USD,1551801611,False,400.0,1550073611,Princess Pals Enamel Pin Series,False,...,2486340.0,Sacramento,Town,assets/024/033/030/dea4e3901d10195b035875eb8cf...,https://ksr-ugc.imgix.net/assets/024/033/030/d...,20.0,400.0,2019-02-13 16:00:11,2019,2
2,Helping a community come together to set the s...,US,1477242384,USD,1480607930,False,27224.0,1478012330,Their Life Through Their Lens-the Amish and Me...,True,...,2383660.0,Columbus,Town,assets/014/262/672/97944960ba30239051d3b6e59f2...,https://ksr-ugc.imgix.net/assets/014/262/672/9...,30.04,27224.0,2016-11-01 14:58:50,2016,11
3,Every revolution starts from the bottom and we...,IT,1540369920,EUR,1544309940,False,40000.0,1540684582,WAO: THE ECO EFFECT SHOES,False,...,725746.0,Venice,Town,assets/023/008/626/aef53e2f326ad4c87729001f643...,https://ksr-ugc.imgix.net/assets/023/008/626/a...,41.96,45461.0,2018-10-27 23:56:22,2018,10
4,Learn to build 10+ Applications in this comple...,US,1425706517,USD,1428511017,False,1000.0,1425919017,Apple Watch Development Course,False,...,2479651.0,Redmond,Town,assets/012/061/410/1687a735d5b1316ca4761b087ca...,https://ksr-ugc.imgix.net/assets/012/061/410/1...,30.0,1000.0,2015-03-09 16:36:57,2015,3


In [36]:
# Summarize the reduced EDA frame: column names, non-null counts and dtypes.
df_eda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   blurb                  209214 non-null  object        
 1   country                209222 non-null  object        
 2   created_at             209222 non-null  int64         
 3   currency               209222 non-null  object        
 4   deadline               209222 non-null  int64         
 5   disable_communication  209222 non-null  bool          
 6   goal                   209222 non-null  float64       
 7   launched_at            209222 non-null  int64         
 8   name                   209222 non-null  object        
 9   staff_pick             209222 non-null  bool          
 10  state                  209222 non-null  object        
 11  usd_pledged            209222 non-null  float64       
 12  usd_type               208742 non-null  obje