In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
# Load the prepared data frame from the local pickle snapshot.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
snapshot_path = "./data_frame_small_2021-03-12.pickle"
df_small = pd.read_pickle(snapshot_path)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 35 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   backers_count          209222 non-null  int64         
 1   blurb                  209214 non-null  object        
 2   country                209222 non-null  object        
 3   created_at             209222 non-null  int64         
 4   currency               209222 non-null  object        
 5   deadline               209222 non-null  int64         
 6   disable_communication  209222 non-null  bool          
 7   goal                   209222 non-null  float64       
 8   launched_at            209222 non-null  int64         
 9   name                   209222 non-null  object        
 10  staff_pick             209222 non-null  bool          
 11  state                  209222 non-null  object        
 12  usd_pledged            209222 non-null  floa

In [4]:
# Columns that are removed from the loaded frame before modelling.
# The order is irrelevant to DataFrame.drop but is kept as originally written.
droplist = [
    'backers_count',
    'name',
    'usd_type',
    'category_id',
    'category_slug',
    'category_parent_id',
    'deadline',
    'staff_pick',
    'category_parent_name',
    'location_id',
    'photo_key',
    'photo_full',
    'goal',
    'launched_at_full',
    'launched_at_year',
    'created_at_full',
    'created_at_year',
    'deadline_full',
    'deadline_month',
    'deadline_year',
    'created_at',
    'launched_at',
    'usd_pledged',
    'disable_communication',
    'location_name',
]

In [5]:
# Build the modelling frame by removing every column named in droplist,
# then show what is left.
df_model = df_small.drop(columns=droplist)
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              209214 non-null  object 
 1   country            209222 non-null  object 
 2   currency           209222 non-null  object 
 3   state              209222 non-null  object 
 4   category_name      209222 non-null  object 
 5   location_type      208996 non-null  object 
 6   duration           209222 non-null  float64
 7   goal_usd           209222 non-null  float64
 8   launched_at_month  209222 non-null  int64  
 9   created_at_month   209222 non-null  int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 16.0+ MB


In [6]:
# Count the outcomes (state) of the rows whose blurb is missing.
missing_blurb = df_model['blurb'].isna()
df_model.loc[missing_blurb, 'state'].value_counts()

canceled    6
failed      2
Name: state, dtype: int64

In [7]:
# Remove every row that has a missing value in any column (not only blurb),
# then re-inspect the frame.
df_model = df_model.dropna()
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208988 entries, 0 to 209221
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              208988 non-null  object 
 1   country            208988 non-null  object 
 2   currency           208988 non-null  object 
 3   state              208988 non-null  object 
 4   category_name      208988 non-null  object 
 5   location_type      208988 non-null  object 
 6   duration           208988 non-null  float64
 7   goal_usd           208988 non-null  float64
 8   launched_at_month  208988 non-null  int64  
 9   created_at_month   208988 non-null  int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 17.5+ MB


In [8]:
# Keep only the rows whose state is 'successful' or 'failed'; all other
# states are excluded from the model.
has_binary_outcome = df_model['state'].isin(['successful', 'failed'])
df_model = df_model.loc[has_binary_outcome]
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192447 entries, 0 to 209221
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              192447 non-null  object 
 1   country            192447 non-null  object 
 2   currency           192447 non-null  object 
 3   state              192447 non-null  object 
 4   category_name      192447 non-null  object 
 5   location_type      192447 non-null  object 
 6   duration           192447 non-null  float64
 7   goal_usd           192447 non-null  float64
 8   launched_at_month  192447 non-null  int64  
 9   created_at_month   192447 non-null  int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 16.2+ MB


In [9]:
# Number of distinct values in each column.
df_model.nunique()

blurb                167641
country                  22
currency                 14
state                     2
category_name           159
location_type             9
duration               5816
goal_usd              45240
launched_at_month        12
created_at_month         12
dtype: int64

In [10]:
# Frequency of each location_type value, most common first.
df_model['location_type'].value_counts()

Town             178409
County             7377
Suburb             4792
LocalAdmin         1145
Zip                 459
Island              220
Country              24
Miscellaneous        16
Estate                5
Name: location_type, dtype: int64

In [11]:
# blurb_len = number of pieces obtained by splitting the blurb on single
# spaces, i.e. an approximate word count (consecutive spaces yield empty
# pieces that are counted as well).
df_model['blurb_len'] = [len(text.split(" ")) for text in df_model['blurb']]
df_model.head()

Unnamed: 0,blurb,country,currency,state,category_name,location_type,duration,goal_usd,launched_at_month,created_at_month,blurb_len
0,2006 was almost 7 years ago.... Can you believ...,US,USD,successful,Rock,Town,45.0,200.0,12,12,26
1,An adorable fantasy enamel pin series of princ...,US,USD,successful,Mixed Media,Town,20.0,400.0,2,2,9
2,Helping a community come together to set the s...,US,USD,successful,Photobooks,Town,30.04,27224.0,11,10,25
3,Every revolution starts from the bottom and we...,IT,EUR,successful,Footwear,Town,41.96,45461.0,10,10,13
4,Learn to build 10+ Applications in this comple...,US,USD,failed,Software,Town,30.0,1000.0,3,3,22


In [12]:
# Remove the raw blurb text column (blurb_len stays in the frame), then list
# the number of distinct values per remaining column.
df_model = df_model.drop(columns='blurb')
df_model.nunique()

country                 22
currency                14
state                    2
category_name          159
location_type            9
duration              5816
goal_usd             45240
launched_at_month       12
created_at_month        12
blurb_len               53
dtype: int64

In [13]:
# Number of distinct values in each column.
# NOTE(review): this repeats the expression that ends the previous cell and
# shows the same output; consider deleting this cell.
df_model.nunique()

country                 22
currency                14
state                    2
category_name          159
location_type            9
duration              5816
goal_usd             45240
launched_at_month       12
created_at_month        12
blurb_len               53
dtype: int64

In [14]:
# One-hot encode the four categorical columns. drop_first=True leaves out the
# first level of each column so that it acts as the reference category.
# Each pair is (source column, prefix used for the generated column names).
country_dummies, currency_dummies, category_dummies, location_dummies = (
    pd.get_dummies(df_model[source_column], prefix=dummy_prefix, drop_first=True)
    for source_column, dummy_prefix in [
        ('country', 'country'),
        ('currency', 'currency'),
        ('category_name', 'category'),
        ('location_type', 'location'),
    ]
)

In [16]:
# Swap the four categorical columns for their one-hot encoded counterparts:
# drop the originals and append the dummy frames column-wise.
encoded_columns = ['country', 'currency', 'category_name', 'location_type']
remaining_columns = df_model.drop(columns=encoded_columns)
df_model = pd.concat(
    [remaining_columns, country_dummies, currency_dummies, category_dummies, location_dummies],
    axis=1,
)

In [17]:
# (rows, columns) of the frame after one-hot encoding.
df_model.shape

(192447, 206)

In [16]:
# Show every column name of the encoded frame as a plain Python list.
df_model.columns.tolist()

['state',
 'duration',
 'goal_usd',
 'launched_at_month',
 'created_at_month',
 'blurb_len',
 'country_AU',
 'country_BE',
 'country_CA',
 'country_CH',
 'country_DE',
 'country_DK',
 'country_ES',
 'country_FR',
 'country_GB',
 'country_HK',
 'country_IE',
 'country_IT',
 'country_JP',
 'country_LU',
 'country_MX',
 'country_NL',
 'country_NO',
 'country_NZ',
 'country_SE',
 'country_SG',
 'country_US',
 'currency_CAD',
 'currency_CHF',
 'currency_DKK',
 'currency_EUR',
 'currency_GBP',
 'currency_HKD',
 'currency_JPY',
 'currency_MXN',
 'currency_NOK',
 'currency_NZD',
 'currency_SEK',
 'currency_SGD',
 'currency_USD',
 'category_Academic',
 'category_Accessories',
 'category_Action',
 'category_Animals',
 'category_Animation',
 'category_Anthologies',
 'category_Apparel',
 'category_Apps',
 'category_Architecture',
 'category_Art',
 'category_Art Books',
 'category_Audio',
 'category_Bacon',
 'category_Blues',
 'category_Calendars',
 'category_Camera Equipment',
 'category_Candles',

In [24]:
# Encode the target column as integers: 1 for 'successful', 0 for 'failed'.
dic = {'successful' : 1, 'failed' : 0}

# Series.map turns every value that is not a key of `dic` into NaN. Running
# this cell a second time, when the column already holds 0/1, would therefore
# replace the whole target with NaN. Only apply the mapping while the column
# still contains the raw string labels, so the cell is safe to re-run.
if df_model['state'].isin(list(dic)).all():
    df_model['state'] = df_model['state'].map(dic)

# Fail loudly instead of carrying a corrupted target into the train/test split.
if not df_model['state'].isin(list(dic.values())).all():
    raise ValueError(
        "df_model['state'] holds values other than 0/1 after encoding; "
        "rebuild df_model from the earlier cells before continuing"
    )

df_model.shape

(192447, 206)

In [22]:
# Separate the target (state) from the predictor columns.
y = df_model['state']
X = df_model.drop(columns='state')


In [23]:
# Hold out 30% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable.
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [26]:
# Write the four parts of the split to CSV files in the working directory.
# index=False leaves the row index out of the files.
splits_by_file = {
    'X_train_Kickstarter.csv': X_train,
    'X_test_Kickstarter.csv': X_test,
    'y_train_Kickstarter.csv': y_train,
    'y_test_Kickstarter.csv': y_test,
}
for csv_name, split in splits_by_file.items():
    split.to_csv(csv_name, index = False)