In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce
import lightgbm as lgb
from sklearn import metrics

In [2]:
# Load the Kickstarter projects table; the 'launched' column is parsed into
# datetimes so its calendar components can be extracted later.
data = pd.read_csv(
    'kickstarter.csv',
    parse_dates=['launched'],
)

In [3]:
# Binary target: 1 where the project's state is 'successful', 0 for any other state.
is_successful = data['state'] == 'successful'
data = data.assign(outcome=is_successful.astype(int))
data.head(2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0


In [4]:
# Display (rows, columns) of the frame before any rows are filtered out.
data.shape

(378661, 16)

In [5]:
# Keep only rows whose state is not 'live', then show the reduced
# (rows, columns) so the effect of the filter is visible.
not_live = "state != 'live'"
data = data.query(not_live)
data.shape

(375862, 16)

In [6]:
# Split the launch timestamp into four calendar components, each added
# as its own column (hour, day, month, year).
launched_parts = data['launched'].dt
data = data.assign(
    hour=launched_parts.hour,
    day=launched_parts.day,
    month=launched_parts.month,
    year=launched_parts.year,
)
data.shape

(375862, 20)

In [7]:
# Categorical columns that are passed to the encoder.
cat_features = [
    'category',
    'currency',
    'country',
]

In [8]:
# Count encoder from category_encoders, created with default settings.
# NOTE(review): presumably maps each category value to the number of rows
# in which it occurs — confirm against the category_encoders documentation.
encoder = ce.CountEncoder()

In [9]:
# Columns carried over unchanged: the launch-date parts, the funding goal,
# and the 'outcome' label (kept alongside the features so rows stay aligned
# until the label is separated out before training).
passthrough_cols = ['hour', 'day', 'month', 'year', 'goal', 'outcome']
num_X = data[passthrough_cols]

In [10]:
# Fit the encoder on the categorical columns and encode them in one pass.
# NOTE(review): the encoder is fit on every row of `data`, including rows that
# later end up in the validation and test slices; consider fitting on the
# training rows only if strict hold-out isolation matters.
categorical_frame = data[cat_features]
encoded = encoder.fit_transform(categorical_frame)
encoded.head()

Unnamed: 0,category,currency,country
0,1362,33853,33393
1,5174,293624,290887
2,5174,293624,290887
3,15647,293624,290887
4,10054,293624,290887


In [11]:
# Suffix the encoded columns with '_count' so they cannot collide with any
# pass-through column name, then join them to those columns on the row index.
count_columns = encoded.add_suffix('_count')
X = num_X.join(count_columns)

X.head()

Unnamed: 0,hour,day,month,year,goal,outcome,category_count,currency_count,country_count
0,12,11,8,2015,1000.0,0,1362,33853,33393
1,4,2,9,2017,30000.0,0,5174,293624,290887
2,0,12,1,2013,45000.0,0,5174,293624,290887
3,3,17,3,2012,5000.0,0,15647,293624,290887
4,8,4,7,2015,19500.0,0,10054,293624,290887


In [12]:
# Positional train / validation / test split: the last `valid_fraction` of the
# rows becomes the test set, the slice just before it the validation set, and
# everything earlier the training set.
valid_fraction = 0.1
n_rows = len(X)
valid_size = int(n_rows * valid_fraction)

# Use explicit non-negative cut points instead of negative slice bounds.
# With a negative bound, valid_size == 0 (fewer than 10 rows) turns `X[:-0]`
# into an empty training set and `X[-0:]` into a test set holding every row.
valid_start = n_rows - 2 * valid_size
test_start = n_rows - valid_size

train = X.iloc[:valid_start]
valid = X.iloc[valid_start:test_start]
test = X.iloc[test_start:]
test.shape

(37586, 9)

In [13]:
# LightGBM training parameters: binary classification objective, AUC as the
# evaluation metric ('metrics' is an accepted alias of 'metric'), and trees
# limited to 65 leaves.
param = {
    'num_leaves': 65,
    'objective': 'binary',
    'metrics': 'auc',
}

# Upper bound on boosting rounds passed to training.
num_rounds = 1000

In [14]:
# Model inputs are every column of the training frame except the label.
label_columns = ['outcome']
features = train.columns.drop(label_columns)

In [15]:
# Wrap the training and validation slices as LightGBM datasets, with the
# 'outcome' column as the label in both.
dtrain = lgb.Dataset(
    train[features],
    label=train['outcome'],
)
dvalid = lgb.Dataset(
    valid[features],
    label=valid['outcome'],
)

In [16]:
# Train the booster, stopping once the validation metric has not improved for
# 5 consecutive rounds (or after `num_rounds` rounds at most).
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `lgb.train` was removed in LightGBM 4.0,
# whereas the callback works on both older and newer releases.
# For per-iteration metric logging on LightGBM >= 3.3, also append
# `lgb.log_evaluation(period=1)` to the callbacks list.
model = lgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[LightGBM] [Info] Number of positive: 107340, number of negative: 193350
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 526
[LightGBM] [Info] Number of data points in the train set: 300690, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
[1]	valid_0's auc: 0.692744
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.695987
[3]	valid_0's auc: 0.702229
[4]	valid_0's auc: 0.704395
[5]	valid_0's auc: 0.706554
[6]	valid_0's auc: 0.708559
[7]	valid_0's auc: 0.711188
[8]	valid_0's auc: 0.712724
[9]	valid_0's auc: 0.714062
[10]	valid_0's auc: 0.715033
[11]	valid_0's auc: 0.715866
[12]	valid_0's auc: 0.716983
[13]	valid_0's auc: 0.717887
[14]	valid_0's auc: 0.718854
[15]	valid_0's auc: 0.719398
[16]	valid_0's auc: 0.72024
[17]	valid_0's auc: 0.721627
[18]	valid_0's auc: 0.72202
[19]	valid_0's auc: 0.722583
[20]	vali

In [17]:
# Score the held-out test slice: predict on its feature columns and compute
# ROC AUC against the true 'outcome' labels.
test_labels = test['outcome']
preds = model.predict(test[features])
score = metrics.roc_auc_score(test_labels, preds)
score

0.7474402428057598