In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime

In [4]:
# Load the full project data frame from a local pickle snapshot.
# NOTE(review): unpickling can execute arbitrary code - only load snapshots from a trusted source.
df_full = pd.read_pickle("./data_frame_full_2021-03-11_200900")

In [5]:
# Overview of df_full: index range, number of columns, dtype breakdown and memory usage.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Columns: 101 entries, backers_count to launched_at_month
dtypes: bool(5), datetime64[ns](1), float64(18), int64(12), object(65)
memory usage: 154.2+ MB


In [6]:
# Load the reduced-column data frame from a local pickle snapshot (same timestamp as the full one).
# NOTE(review): unpickling can execute arbitrary code - only load snapshots from a trusted source.
df_small = pd.read_pickle("./data_frame_small_2021-03-11_200900")

In [7]:
# Per-column overview of df_small: non-null counts and dtypes.
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   blurb                  209214 non-null  object        
 1   country                209222 non-null  object        
 2   created_at             209222 non-null  int64         
 3   currency               209222 non-null  object        
 4   deadline               209222 non-null  int64         
 5   disable_communication  209222 non-null  bool          
 6   goal                   209222 non-null  float64       
 7   launched_at            209222 non-null  int64         
 8   name                   209222 non-null  object        
 9   staff_pick             209222 non-null  bool          
 10  state                  209222 non-null  object        
 11  usd_pledged            209222 non-null  float64       
 12  usd_type               208742 non-null  obje

## Target classes and balance

In [13]:
# Count the projects per value of the target column 'state'; `vals` is indexed by the state label.
vals = df_full['state'].value_counts()
vals

successful    117465
failed         75199
canceled        8624
live            7311
suspended        623
Name: state, dtype: int64

In [19]:
# Share of successful vs. failed projects, ignoring canceled/live/suspended ones.
# Select the counts by label rather than by position: positional access on a
# label-indexed Series depends on the sort order of value_counts() and on an
# integer-key fallback that newer pandas versions deprecate.
n_success = vals['successful']
n_failed = vals['failed']
n_decided = n_success + n_failed

print('Portion of success: {:.2f}%'.format(100 * n_success / n_decided))
print('Portion of fail:    {:.2f}%'.format(100 * n_failed / n_decided))

Portion of success: 60.97%
Portion of fail:    39.03%


We have a slight class imbalance.

## Date time

In [21]:
# Interpret `state_changed_at` as Unix epoch seconds and summarise the resulting timestamps
# (count, mean, min, quartiles, max).
# NOTE(review): `datetime_is_numeric` is required on pandas 1.x to get numeric-style statistics
# but appears to have been removed in pandas 2.0 - drop the argument when upgrading; confirm
# against the installed version.
pd.to_datetime(df_full['state_changed_at'], unit='s').describe(datetime_is_numeric=True)

count                           209222
mean     2016-05-09 23:50:25.225296640
min                2009-05-03 07:00:17
25%      2015-01-05 19:16:23.750000128
50%                2016-05-31 15:31:38
75%                2018-02-23 06:04:15
max                2019-03-14 04:12:21
Name: state_changed_at, dtype: object

## Country

In [22]:
def ratio(x):
    """Return the success percentage among finished projects in a series of states.

    Finished projects are those whose state is 'successful', 'failed' or
    'canceled'; every other state is ignored.

    Parameters
    ----------
    x : pandas.Series
        State labels, e.g. the 'state' values of one group.

    Returns
    -------
    float
        100 * successful / (successful + failed + canceled), with the fraction
        rounded to 4 decimals before scaling. NaN when the series contains no
        finished project.
    """
    # Count once; a state that does not occur in this group counts as 0
    # instead of raising KeyError.
    counts = x.value_counts()
    n_success = counts.get('successful', 0)
    n_finished = n_success + counts.get('failed', 0) + counts.get('canceled', 0)
    if n_finished == 0:
        return float('nan')
    return 100 * round(n_success / n_finished, 4)

In [None]:
# Success ratio (in percent) per country, highest first.
# Built from `df_full`, the frame loaded at the top of the notebook, so the cell
# runs on a fresh kernel.
success_per_country = (
    pd.pivot_table(df_full, index='country', values='state', aggfunc=ratio)
    .rename(columns={'state': 'success_ratio'})
    .sort_values(by='success_ratio', ascending=False)
)
success_per_country

In [None]:
# Horizontal bar chart of the success ratio per country (one bar per country).
ax = sns.barplot(data=success_per_country.reset_index(), y='country', x='success_ratio')
ax.set(xlabel='Success ratio (%)', ylabel='Country', title='Success ratio per country');

## Amounts: converted_pledged_amount, exchange rate

In [8]:
# Difference between `converted_pledged_amount` and `usd_pledged` scaled by `static_usd_rate`.
# Uses `df_full`, the frame loaded at the top of the notebook, so the cell runs on a fresh kernel.
# NOTE(review): if `usd_pledged` is already expressed in USD, multiplying it by
# `static_usd_rate` applies the conversion twice - confirm the intended comparison.
currency_delta = (
    df_full['converted_pledged_amount'].astype(float)
    - df_full['usd_pledged'] * df_full['static_usd_rate']
)
currency_delta.describe()

count    2.092220e+05
mean    -8.551623e+01
std      1.270609e+04
min     -1.771501e+06
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      9.330207e+05
dtype: float64

In [10]:
# Inspect a couple of rows whose delta exceeds the threshold, showing only the
# currency-related columns. Uses `df_full` (the loaded frame) and a single
# .loc selection instead of chained indexing.
currency_cols = [
    'country',
    'converted_pledged_amount',
    'currency',
    'currency_symbol',
    'fx_rate',
    'usd_pledged',
    'usd_type',
]
delta_threshold = 900  # deltas above this value are treated as suspicious
df_full.loc[currency_delta > delta_threshold, currency_cols].head(2)

Unnamed: 0,country,converted_pledged_amount,currency,currency_symbol,fx_rate,usd_pledged,usd_type
56,SG,19699,SGD,$,0.737115,19338.58604,international
83,NO,2182,NOK,kr,0.11572,2088.859902,domestic
