In [1]:
import pandas as pd

import lightgbm as lgb

import category_encoders as ce

from sklearn import metrics

In [2]:
# Load the Kickstarter projects. `launched` is parsed as a datetime column so
# the `.dt` accessor can be used on it when the launch-time features are built.
data = pd.read_csv(
    'kickstarter.csv',
    parse_dates=['launched'],
)
data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0


In [3]:
# data.info()

In [4]:
# Binary target column: 1 where `state` is exactly 'successful', 0 for every
# other state.
data = data.assign(
    outcome=lambda frame: frame['state'].eq('successful').astype(int)
)
# data.head(3)

In [5]:
# Split the `launched` timestamp into separate hour/day/month/year columns,
# added to the frame in that order.
data = data.assign(**{
    part: getattr(data['launched'].dt, part)
    for part in ('hour', 'day', 'month', 'year')
})

# data.head(3)

In [6]:
# Categorical columns that the encoders below are fit on and transform.
cat_features = [
    'category',
    'currency',
    'country',
]

In [7]:
# Modelling frame: the goal amount, the launch-time parts, the target, and the
# raw categorical columns (category, currency, country) that get encoded later.
X = data[['goal', 'hour', 'day', 'month', 'year', 'outcome'] + cat_features]
X.head(3)

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,Poetry,GBP,GB
1,30000.0,4,2,9,2017,0,Narrative Film,USD,US
2,45000.0,0,12,1,2013,0,Narrative Film,USD,US


In [8]:
# Positional hold-out split: the last `valid_fraction` of the rows is the test
# set, the `valid_fraction` just before it is the validation set, and
# everything earlier is the training set.
valid_fraction = 0.1

n_rows = len(X)
valid_size = int(n_rows * valid_fraction)

# Explicit positive boundaries instead of negative slices: with a tiny frame
# `valid_size` can be 0, and `X[:-0]` would yield an EMPTY training set while
# `X[-0:]` would put every row in the test set.
train_end = max(n_rows - 2 * valid_size, 0)
valid_end = n_rows - valid_size

train = X.iloc[:train_end]

valid = X.iloc[train_end:valid_end]

test = X.iloc[valid_end:]

In [9]:
# Display the (rows, columns) of the training split as a quick sanity check.
train.shape

(302929, 9)

In [10]:
# Fit the target encoder on the training rows only. The validation and test
# rows are deliberately left out of the fit; they are transformed later with
# the encodings learned from the training data.
target_enc = ce.TargetEncoder(cols=cat_features).fit(
    train[cat_features],
    train['outcome'],
)
target_enc

TargetEncoder(cols=['category', 'currency', 'country'], drop_invariant=False,
              handle_missing='value', handle_unknown='value',
              min_samples_leaf=1, return_df=True, smoothing=1.0, verbose=0)

In [11]:
# Append the encoded categoricals (suffixed `_target`) to each split. Both
# splits go through the same encoder that was fit on the training rows.
encoded_train, encoded_valid = (
    split.join(target_enc.transform(split[cat_features]).add_suffix("_target"))
    for split in (train, valid)
)
encoded_train.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country,category_target,currency_target,country_target
0,1000.0,12,11,8,2015,0,Poetry,GBP,GB,0.357815,0.354235,0.358672
1,30000.0,4,2,9,2017,0,Narrative Film,USD,US,0.383693,0.37121,0.37441
2,45000.0,0,12,1,2013,0,Narrative Film,USD,US,0.383693,0.37121,0.37441
3,5000.0,3,17,3,2012,0,Music,USD,US,0.410499,0.37121,0.37441
4,19500.0,8,4,7,2015,0,Film & Video,USD,US,0.300985,0.37121,0.37441


In [12]:
# Display the shape after the join: the encoded `_target` columns are added on
# top of the original training columns.
encoded_train.shape

(302929, 12)

In [13]:
# Model inputs: every column except the label and the raw (unencoded)
# categorical columns.
excluded_cols = ['outcome', 'category', 'currency', 'country']
feature_cols = encoded_train.columns.drop(labels=excluded_cols)
feature_cols

Index(['goal', 'hour', 'day', 'month', 'year', 'category_target',
       'currency_target', 'country_target'],
      dtype='object')

In [14]:

# Booster settings: binary objective evaluated with AUC.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
# Maximum number of boosting rounds; the training call uses early stopping, so
# fewer rounds may actually be run.
num_round = 1000

# LightGBM datasets built from the target-encoded splits.
dtrain = lgb.Dataset(data=encoded_train[feature_cols], label=encoded_train['outcome'])
dvalid = lgb.Dataset(data=encoded_valid[feature_cols], label=encoded_valid['outcome'])

In [15]:
# Train on the target-encoded features, stopping once the validation AUC has
# not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of `lgb.train` was removed in LightGBM 4.0
# (it raises TypeError there), while the callback is also available in the
# older releases that accepted the keyword.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[LightGBM] [Info] Number of positive: 107352, number of negative: 195577
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 302929, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354380 -> initscore=-0.599841
[LightGBM] [Info] Start training from score -0.599841
[1]	valid_0's auc: 0.723061
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.7253
[3]	valid_0's auc: 0.726151
[4]	valid_0's auc: 0.727132
[5]	valid_0's auc: 0.728909
[6]	valid_0's auc: 0.730085
[7]	valid_0's auc: 0.730711
[8]	valid_0's auc: 0.731268
[9]	valid_0's auc: 0.731788
[10]	valid_0's auc: 0.732419
[11]	valid_0's auc: 0.733156
[12]	valid_0's auc: 0.733796
[13]	valid_0's auc: 0.734331
[14]	valid_0's auc: 0.734622
[15]	valid_0's auc: 0.735064
[16]	valid_0's auc: 0.735547
[17]	valid_0's auc: 0.735928
[18

In [16]:
# The three categorical columns again, listed in a different order than
# `cat_features`.
cats = ['currency', 'country', 'category']
# Training rows with the raw categorical columns removed.
sub_train = train.drop(columns=cats)

sub_train.head(3)

Unnamed: 0,goal,hour,day,month,year,outcome
0,1000.0,12,11,8,2015,0
1,30000.0,4,2,9,2017,0
2,45000.0,0,12,1,2013,0


In [17]:
# Attach the encoded categoricals (suffixed `_target`) to the non-categorical
# columns of each split. The encoder was already fit on the training outcome,
# so no target is passed to `transform`; in particular the validation labels
# must not be fed into feature construction.
TE_train = sub_train.join(target_enc.transform(train[cats]).add_suffix('_target'))

# The validation frame has to be built from `valid`'s own rows. Previously the
# validation encodings were joined onto `sub_train` (training rows): `join`
# aligns on the index and the train/valid slices share no index labels, so
# TE_valid ended up being the training rows with all-NaN encoded columns.
TE_valid = valid.drop(columns=cats).join(
    target_enc.transform(valid[cats]).add_suffix('_target')
)

TE_train.head(3)

Unnamed: 0,goal,hour,day,month,year,outcome,currency_target,country_target,category_target
0,1000.0,12,11,8,2015,0,0.354235,0.358672,0.357815
1,30000.0,4,2,9,2017,0,0.37121,0.37441,0.383693
2,45000.0,0,12,1,2013,0,0.37121,0.37441,0.383693


In [18]:
# Model inputs for the TE_* frames: every column except the label.
features = TE_train.columns.drop(labels=['outcome'])
features

Index(['goal', 'hour', 'day', 'month', 'year', 'currency_target',
       'country_target', 'category_target'],
      dtype='object')

In [19]:
# dtrain = lgb.Dataset(TE_train[features], label=TE_train['outcome'])
# dvalid = lgb.Dataset(TE_valid[features], label=TE_valid['outcome'])

# param = {'num_leaves': 64, 'objective': 'binary'}
# param['metric'] = 'auc'
# num_round = 1000

# # bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=5)

In [20]:
# def train_model(train, valid):
#     feature_cols = train.columns.drop('outcome')
    
#     dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])

#     dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

#     param = {'num_leaves': 64, 'objective': 'binary'}
#     param['metric'] = 'auc'
#     num_round = 1000
#     bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=5)
#     print(train.head())
#     print(f'Validation AUC Score: {bst.best_score}')

In [21]:
# train_model(TE_train, TE_valid)

In [22]:
# CatBoost encoding: same workflow as the target encoder above, with a
# different encoder class. As before, only the training rows are used for the
# fit.
targ_enc = ce.CatBoostEncoder(cols=cat_features).fit(
    train[cat_features],
    train['outcome'],
)

# NOTE(review): the training rows are transformed without `y`, and the `_cb`
# values displayed for them are almost identical to the `_target` ones —
# confirm whether a target-aware transform of the training set was intended.

# Encode each split, suffix the new columns with `_cb`, and join them back
# onto the split they came from.
train_CBE, valid_CBE = (
    split.join(targ_enc.transform(split[cat_features]).add_suffix('_cb'))
    for split in (train, valid)
)

train_CBE.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country,category_cb,currency_cb,country_cb
0,1000.0,12,11,8,2015,0,Poetry,GBP,GB,0.357812,0.354235,0.358672
1,30000.0,4,2,9,2017,0,Narrative Film,USD,US,0.383686,0.37121,0.37441
2,45000.0,0,12,1,2013,0,Narrative Film,USD,US,0.383686,0.37121,0.37441
3,5000.0,3,17,3,2012,0,Music,USD,US,0.410495,0.37121,0.37441
4,19500.0,8,4,7,2015,0,Film & Video,USD,US,0.300992,0.37121,0.37441


In [23]:
# Model inputs for the CatBoost-encoded frames: drop the label and the raw
# categorical columns, keeping the numeric and `_cb` columns.
non_feature_cols = ['outcome', 'category', 'currency', 'country']
new_feats = train_CBE.columns.drop(labels=non_feature_cols)
new_feats

Index(['goal', 'hour', 'day', 'month', 'year', 'category_cb', 'currency_cb',
       'country_cb'],
      dtype='object')

In [24]:
# LightGBM datasets built from the CatBoost-encoded splits.
dtrain = lgb.Dataset(train_CBE[new_feats], label=train_CBE['outcome'])
dvalid = lgb.Dataset(valid_CBE[new_feats], label=valid_CBE['outcome'])

# Same booster settings as the target-encoding run so the two validation AUCs
# are comparable.
param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000

# Stop once the validation AUC has not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of `lgb.train` was removed in LightGBM 4.0
# (it raises TypeError there), while the callback is also available in the
# older releases that accepted the keyword.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[LightGBM] [Info] Number of positive: 107352, number of negative: 195577
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 302929, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354380 -> initscore=-0.599841
[LightGBM] [Info] Start training from score -0.599841
[1]	valid_0's auc: 0.723061
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.725318
[3]	valid_0's auc: 0.726154
[4]	valid_0's auc: 0.727239
[5]	valid_0's auc: 0.729029
[6]	valid_0's auc: 0.730172
[7]	valid_0's auc: 0.730814
[8]	valid_0's auc: 0.731605
[9]	valid_0's auc: 0.732087
[10]	valid_0's auc: 0.732717
[11]	valid_0's auc: 0.733341
[12]	valid_0's auc: 0.733614
[13]	valid_0's auc: 0.734199
[14]	valid_0's auc: 0.734684
[15]	valid_0's auc: 0.735245
[16]	valid_0's auc: 0.735703
[17]	valid_0's auc: 0.736099
[18]	valid_0's auc: 0.736564
[19]	valid_0's auc: 0.73713
[20]	val