In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

import category_encoders as ce
from sklearn.model_selection import train_test_split

from sklearn import metrics

np.random.seed(0)

In [2]:
data = pd.read_csv('kickstarter.csv', parse_dates=['launched'])
data.head(2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0


In [3]:
data = data.query("state !='live'")

In [4]:
data.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3796
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [5]:
data = data.assign(hour=data.launched.dt.hour,
                  day=data.launched.dt.day,
                  month=data.launched.dt.month,
                  year=data.launched.dt.year,
                  outcome=(data.state == 'successful').astype('int64'))
data.head(2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,hour,day,month,year,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,12,11,8,2015,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,4,2,9,2017,0


In [6]:
num_features = ['hour', 'day', 'month', 'year', 'goal', 'pledged', 'backers']

In [7]:
cat_features = ['category', 'currency', 'country']

In [8]:
cols = num_features + cat_features

X = data[cols]
X.head(3)

Unnamed: 0,hour,day,month,year,goal,pledged,backers,category,currency,country
0,12,11,8,2015,1000.0,0.0,0,Poetry,GBP,GB
1,4,2,9,2017,30000.0,2421.0,15,Narrative Film,USD,US
2,0,12,1,2013,45000.0,220.0,3,Narrative Film,USD,US


In [9]:
print(len(X))
y = data.outcome
len(y)

375862


375862

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=11)

In [11]:
encoder = ce.CountEncoder(cols=cat_features)

In [12]:
encoder.fit(X_train[cat_features], y_train)

CountEncoder(cols=['category', 'currency', 'country'],
             combine_min_nan_groups=True, drop_invariant=False,
             handle_missing='count', handle_unknown=None, min_group_name=None,
             min_group_size=None, normalize=False, return_df=True, verbose=0)

In [14]:
X_train = X_train.join(encoder.transform(X_train[cat_features]).add_suffix('_count'))
X_valid = X_valid.join(encoder.transform(X_valid[cat_features]).add_suffix('_count'))

X_train.head(3)

Unnamed: 0,hour,day,month,year,goal,pledged,backers,category,currency,country,category_count,currency_count,country_count
60011,17,22,9,2015,15000.0,37545.0,834,Product Design,USD,US,17644,234835,232628
134505,18,29,7,2010,3750.0,6176.59,125,Indie Rock,USD,US,4550,234835,232628
248356,21,18,9,2016,11284.0,235.0,8,Illustration,EUR,DE,2514,13663,3308


In [15]:
X_train = X_train.drop(cat_features, axis=1)
X_valid = X_valid.drop(cat_features, axis=1)

X_train.head(3)

Unnamed: 0,hour,day,month,year,goal,pledged,backers,category_count,currency_count,country_count
60011,17,22,9,2015,15000.0,37545.0,834,17644,234835,232628
134505,18,29,7,2010,3750.0,6176.59,125,4550,234835,232628
248356,21,18,9,2016,11284.0,235.0,8,2514,13663,3308


In [17]:
model = RandomForestClassifier(n_estimators=250, random_state=11)

In [18]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=11, verbose=0,
                       warm_start=False)

In [21]:
preds = model.predict(X_valid)
score = metrics.roc_auc_score(y_valid, preds)
print('RandomForest score: ', score)

RandomForest score:  0.9959161861100025


In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
model_2 = DecisionTreeClassifier()
model_2.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
preds = model_2.predict(X_valid)
score = metrics.roc_auc_score(y_valid, preds)
print('DecisionTree score: ', score)

DecisionTree score:  0.9923674855274516


In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
model_3 = KNeighborsClassifier()
model_3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
preds = model_3.predict(X_valid)
score = metrics.roc_auc_score(y_valid, preds)
print('KNeighbors score: ', score)

KNeighbors score:  0.9931416425421069


In [29]:
import lightgbm as lgb

param = {'num_leaves': 33, 'objective': 'binary', 'metric': 'auc'}
num_rounds = 450

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

model_lgb = lgb.train(param, dtrain, num_rounds, valid_sets=[dvalid], early_stopping_rounds=10)

[LightGBM] [Info] Number of positive: 107174, number of negative: 193515
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 300689, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356428 -> initscore=-0.590901
[LightGBM] [Info] Start training from score -0.590901
[1]	valid_0's auc: 0.99248
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.995178
[3]	valid_0's auc: 0.996073
[4]	valid_0's auc: 0.996287
[5]	valid_0's auc: 0.997135
[6]	valid_0's auc: 0.997378
[7]	valid_0's auc: 0.997852
[8]	valid_0's auc: 0.997883
[9]	valid_0's auc: 0.997884
[10]	valid_0's auc: 0.998093
[11]	valid_0's auc: 0.99811
[12]	valid_0's auc: 0.99812
[13]	valid_0's auc: 0.998195
[14]	valid_0's auc: 0.998226
[15]	valid_0's auc: 0.998253
[16]	valid_0's auc: 0.998279
[17]	valid_0's auc: 0.998306
[

### Projects in the last week

In [32]:
launched = pd.Series(data.index, index=data.launched, name='count_7_days').sort_index()
launched.head()

launched
1970-01-01 01:00:00     94579
1970-01-01 01:00:00    319002
1970-01-01 01:00:00    247913
1970-01-01 01:00:00     48147
1970-01-01 01:00:00     75397
Name: count_7_days, dtype: int64

In [33]:
count_7_days = launched.rolling('7d').count() - 1
count_7_days.head()

launched
1970-01-01 01:00:00    0.0
1970-01-01 01:00:00    1.0
1970-01-01 01:00:00    2.0
1970-01-01 01:00:00    3.0
1970-01-01 01:00:00    4.0
Name: count_7_days, dtype: float64

In [34]:
count_7_days.index = launched.values
count_7_days.head()

94579     0.0
319002    1.0
247913    2.0
48147     3.0
75397     4.0
Name: count_7_days, dtype: float64

In [35]:
count_7_days = count_7_days.reindex(data.index)
count_7_days.head()

0    1409.0
1     957.0
2     739.0
3     907.0
4    1429.0
Name: count_7_days, dtype: float64

### Time since last project