In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

np.random.seed(0)

In [2]:
data = pd.read_csv('kickstarter.csv')
data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0


In [3]:
data = data.assign(outcome=(data['state'] == 'successful').astype(int))
data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0


In [4]:
data['outcome'].dtype

dtype('int32')

In [5]:
data.shape

(378661, 16)

In [6]:
nums = [col for col in data.columns if data[col].dtype in ['int32', 'int64', 'float64']]
cats = [col for col in data.columns if data[col].dtype =='object']

In [7]:
num_data = data[nums]
num_data.head(3)

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,1000.0,0.0,0,0.0,0.0,1533.95,0
1,1000003930,30000.0,2421.0,15,100.0,2421.0,30000.0,0
2,1000004038,45000.0,220.0,3,220.0,220.0,45000.0,0


In [8]:
num_data.isnull().sum()

ID                     0
goal                   0
pledged                0
backers                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
outcome                0
dtype: int64

In [9]:
num_x = num_data[['goal', 'pledged', 'backers']]
num_x.head()

Unnamed: 0,goal,pledged,backers
0,1000.0,0.0,0
1,30000.0,2421.0,15
2,45000.0,220.0,3
3,5000.0,1.0,1
4,19500.0,1283.0,14


In [10]:
y = num_data.outcome

In [11]:
cat_data = data[cats]
cat_data.head(3)

Unnamed: 0,name,category,main_category,currency,deadline,launched,state,country
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,US
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,US


In [12]:
select_cats = ['category', 'currency', 'country']
cat_x = data[select_cats]
cat_x.head()

Unnamed: 0,category,currency,country
0,Poetry,GBP,GB
1,Narrative Film,USD,US
2,Narrative Film,USD,US
3,Music,USD,US
4,Film & Video,USD,US


In [13]:
encoder = LabelEncoder()

In [14]:
X = num_x.join(cat_x.apply(encoder.fit_transform))
X.head()

Unnamed: 0,goal,pledged,backers,category,currency,country
0,1000.0,0.0,0,108,5,9
1,30000.0,2421.0,15,93,13,22
2,45000.0,220.0,3,93,13,22
3,5000.0,1.0,1,90,13,22
4,19500.0,1283.0,14,55,13,22


In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=11)

In [16]:
X_train.shape

(283995, 6)

In [17]:
X_valid.shape

(94666, 6)

# DecisionTree

In [18]:
dt_model = DecisionTreeRegressor(random_state=0)
dt_model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [19]:
dt_predictions = dt_model.predict(X_valid)

In [21]:
roc = roc_auc_score(y_valid, dt_predictions)
print('DecisionTree model')
print('MAE >> ', roc)

DecisionTree model
MAE >>  0.9898741651340024


# RandomForest

# SupportVector

# KNeighbors

# Deep Learning
## > Tensorflow & Keras