In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import category_encoders as ce
from sklearn import metrics

In [2]:
# Read the Kickstarter dump. 'launched' is parsed as a datetime because a later
# cell pulls hour/day/month/year out of it through the .dt accessor.
csv_path = 'kickstarter.csv'
data = pd.read_csv(csv_path, parse_dates=['launched'])
data.head(n=2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0


In [3]:
# Keep every row whose state is not 'live'.
# NOTE(review): presumably 'live' campaigns are excluded because their final
# outcome is not known yet — confirm against the dataset description.
not_live = "state != 'live'"
data = data.query(not_live)

In [4]:
# Binary target column: 1 where state is 'successful', 0 for every other state.
is_successful = data['state'].eq('successful')
data = data.assign(outcome=is_successful.astype(int))
data.head(n=2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0


In [5]:
# Categorical columns; later cells drop them from the numeric frames and feed
# them to the target encoder instead.
cat_features = [
    'category',
    'currency',
    'country',
]

In [6]:
# Break the launch timestamp into calendar components usable as numeric features.
launched_dt = data['launched'].dt
data = data.assign(
    hour=launched_dt.hour,
    day=launched_dt.day,
    month=launched_dt.month,
    year=launched_dt.year,
)

In [7]:
# Modelling frame: numeric features, the 'outcome' label, and the raw
# categorical columns. The label stays in X here so that it is sliced together
# with the features when the frame is split; it is removed from the feature
# list before training.
numeric_and_label_cols = ['hour', 'day', 'month', 'year', 'goal', 'outcome']
categorical_cols = ['category', 'currency', 'country']
X = data[numeric_and_label_cols + categorical_cols]
X.head(n=7)

Unnamed: 0,hour,day,month,year,goal,outcome,category,currency,country
0,12,11,8,2015,1000.0,0,Poetry,GBP,GB
1,4,2,9,2017,30000.0,0,Narrative Film,USD,US
2,0,12,1,2013,45000.0,0,Narrative Film,USD,US
3,3,17,3,2012,5000.0,0,Music,USD,US
4,8,4,7,2015,19500.0,0,Film & Video,USD,US
5,13,26,2,2016,50000.0,1,Restaurants,USD,US
6,18,1,12,2014,1000.0,1,Food,USD,US


In [8]:
# Positional hold-out split: the last 10% of rows become the test set, the 10%
# before that the validation set, and everything earlier the training set.
valid_fraction = 0.1
valid_size = int(len(X) * valid_fraction)

# Slicing is by row position (not by index label), hence .iloc.
train = X.iloc[:-2 * valid_size]
valid = X.iloc[-2 * valid_size:-valid_size]
test = X.iloc[-valid_size:]

valid.head(n=3)

Unnamed: 0,hour,day,month,year,goal,outcome,category,currency,country
302896,16,12,6,2015,5000.0,1,Documentary,USD,US
302897,1,8,7,2013,3700.0,1,Fiction,USD,US
302898,22,27,5,2014,5500.0,1,Music,GBP,GB


In [9]:
# Numeric-only views of each split; the encoded versions of the categorical
# columns are joined back onto these in a later cell.
sub_tr = train.drop(columns=cat_features)
sub_va = valid.drop(columns=cat_features)
sub_te = test.drop(columns=cat_features)

In [10]:
# Target encoder restricted to the columns listed in cat_features; all other
# constructor options are left at the library defaults.
encoder = ce.TargetEncoder(cols=cat_features)

In [11]:
# Fit on the training split only, so validation and test labels never
# influence the learned encodings.
encoder.fit(X=train[cat_features], y=train['outcome'])

TargetEncoder(cols=['category', 'currency', 'country'], drop_invariant=False,
              handle_missing='value', handle_unknown='value',
              min_samples_leaf=1, return_df=True, smoothing=1.0, verbose=0)

In [12]:
def _with_target_encoding(numeric_part, raw_part):
    """Join the target-encoded categorical columns (suffixed '_target') onto numeric_part."""
    encoded = encoder.transform(raw_part[cat_features]).add_suffix('_target')
    return numeric_part.join(encoded)


# Apply the encoder fitted on the training split to all three splits.
TE_train = _with_target_encoding(sub_tr, train)
TE_valid = _with_target_encoding(sub_va, valid)
TE_test = _with_target_encoding(sub_te, test)

TE_train.head()

Unnamed: 0,hour,day,month,year,goal,outcome,category_target,currency_target,country_target
0,12,11,8,2015,1000.0,0,0.36019,0.357122,0.361636
1,4,2,9,2017,30000.0,0,0.384615,0.373392,0.376631
2,0,12,1,2013,45000.0,0,0.384615,0.373392,0.376631
3,3,17,3,2012,5000.0,0,0.412655,0.373392,0.376631
4,8,4,7,2015,19500.0,0,0.302625,0.373392,0.376631


In [13]:
# Model inputs are every column of the encoded frame except the label.
label_column = 'outcome'
features = TE_train.columns.drop([label_column])
features

Index(['hour', 'day', 'month', 'year', 'goal', 'category_target',
       'currency_target', 'country_target'],
      dtype='object')

In [14]:
# Sanity check: preview the exact feature matrix that will be handed to LightGBM.
TE_train.loc[:, features].head(5)

Unnamed: 0,hour,day,month,year,goal,category_target,currency_target,country_target
0,12,11,8,2015,1000.0,0.36019,0.357122,0.361636
1,4,2,9,2017,30000.0,0.384615,0.373392,0.376631
2,0,12,1,2013,45000.0,0.384615,0.373392,0.376631
3,3,17,3,2012,5000.0,0.412655,0.373392,0.376631
4,8,4,7,2015,19500.0,0.302625,0.373392,0.376631


In [15]:
# Wrap the training and validation splits in LightGBM Dataset objects
# (feature matrix plus the 'outcome' label).
dtrain = lgb.Dataset(data=TE_train[features], label=TE_train['outcome'])
dvalid = lgb.Dataset(data=TE_valid[features], label=TE_valid['outcome'])

dtrain

<lightgbm.basic.Dataset at 0x2194dd04388>

In [16]:
# Booster settings: binary classification evaluated by AUC, 65 leaves per tree.
param = {
    'num_leaves': 65,
    'objective': 'binary',
    'metric': 'auc',
}
# Upper bound on boosting rounds; training may stop earlier via early stopping.
num_rounds = 1000

In [17]:
# Train with early stopping on the validation AUC: boosting halts once the
# metric has not improved for 5 consecutive rounds.
# The stopping rule is supplied as a callback because the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0
# (passing it there raises TypeError); the callback form also works on the
# older releases that still accepted the keyword.
model = lgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[LightGBM] [Info] Number of positive: 107340, number of negative: 193350
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 300690, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
[1]	valid_0's auc: 0.723307
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.724935
[3]	valid_0's auc: 0.72626
[4]	valid_0's auc: 0.727261
[5]	valid_0's auc: 0.729598
[6]	valid_0's auc: 0.730533
[7]	valid_0's auc: 0.730963
[8]	valid_0's auc: 0.731206
[9]	valid_0's auc: 0.731789
[10]	valid_0's auc: 0.732178
[11]	valid_0's auc: 0.732553
[12]	valid_0's auc: 0.733249
[13]	valid_0's auc: 0.733795
[14]	valid_0's auc: 0.73402
[15]	valid_0's auc: 0.734569
[16]	valid_0's auc: 0.73488
[17]	valid_0's auc: 0.735359
[18]

In [18]:
# Evaluate on the held-out test split, which was used neither to fit the
# encoder or model nor to drive early stopping.
preds = model.predict(TE_test[features])
score = metrics.roc_auc_score(y_true=TE_test['outcome'], y_score=preds)
score

0.7452336045472924