In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the Kickstarter CSV; 'launched' is parsed as a datetime column so the
# .dt accessor can be used on it when the date features are derived below.
data = pd.read_csv('kickstarter.csv', parse_dates=['launched'])
# Show (n_rows, n_columns) as the cell output.
data.shape

(378661, 15)

In [3]:
# Categorical columns that are label-encoded into integer codes further down.
cat_features = ['category', 'currency', 'country']

In [4]:
# Keep only the rows whose state is something other than 'live'.
data = data[data["state"] != 'live']
data.shape

(375862, 15)

In [6]:
# Binary target column: 1 where state is 'successful', 0 for every other state.
data = data.assign(
    outcome=lambda frame: (frame["state"] == 'successful').astype(int)
)
data.head(7)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,1
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.0,successful,16,US,1205.0,1205.0,1000.0,1


In [6]:
# Encoder instance used in the next cell to turn the categorical columns into integer codes.
encoder = LabelEncoder()

In [7]:
# Label-encode each categorical column independently (apply runs column by column).
# NOTE(review): the same encoder object is re-fit for every column, so it
# presumably retains only the last column's mapping afterwards - confirm
# before relying on `encoder` to decode values later.
encoded = data[cat_features].apply(encoder.fit_transform)
encoded.head()

Unnamed: 0,category,currency,country
0,108,5,9
1,93,13,22
2,93,13,22
3,90,13,22
4,55,13,22


In [8]:
# Remove the raw categorical columns from `data`; their integer-coded
# counterparts are kept separately in `encoded`.
# NOTE(review): this rebinds `data`, so re-running this cell (or the encoding
# cell above) without reloading will likely fail because the columns are gone.
data = data.drop(cat_features, axis=1)
data.head(2)

Unnamed: 0,ID,name,main_category,deadline,goal,launched,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Publishing,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Film & Video,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,100.0,2421.0,30000.0,0


In [9]:
# Derive calendar features from the launch timestamp. Each callable receives
# the frame being assembled, so no helper variable is left in the namespace.
data = data.assign(
    hour=lambda frame: frame["launched"].dt.hour,
    day=lambda frame: frame["launched"].dt.day,
    month=lambda frame: frame["launched"].dt.month,
    year=lambda frame: frame["launched"].dt.year,
)

data.head(2)

Unnamed: 0,ID,name,main_category,deadline,goal,launched,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Publishing,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Film & Video,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,100.0,2421.0,30000.0,0,4,2,9,2017


In [10]:
# Assemble the modelling frame: date parts, goal and the target, joined on the
# index with the label-encoded categorical columns.
# Note that 'outcome' (the label) is kept inside X; train_model removes it
# from the feature list when no explicit feature_cols is given.
X = data[['hour', 'day', 'month', 'year', 'goal', 'outcome']].join(encoded)

X.head(3)

Unnamed: 0,hour,day,month,year,goal,outcome,category,currency,country
0,12,11,8,2015,1000.0,0,108,5,9
1,4,2,9,2017,30000.0,0,93,13,22
2,0,12,1,2013,45000.0,0,93,13,22


In [15]:
def get_splits(df, valid_fraction=0.1):
    """Split ``df`` by position into train / validation / test partitions.

    Rows are not shuffled. The last ``valid_fraction`` of the rows becomes the
    test set, the equally sized block right before it becomes the validation
    set, and everything earlier is the training set.

    Parameters
    ----------
    df : sliceable with ``len()``
        Anything that supports ``len(df)`` and ``df[start:stop]`` row slicing
        (a pandas DataFrame in this notebook).
    valid_fraction : float, default 0.1
        Fraction of the rows used for the validation set and, separately, for
        the test set. The row count is truncated with ``int()``.

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``df``.
    """
    n_rows = len(df)
    valid_rows = int(n_rows * valid_fraction)

    # Use explicit, non-negative boundaries instead of negative offsets:
    # with valid_rows == 0, ``df[:-0]`` is empty and ``df[-0:]`` is the whole
    # input, which would put every row in the test set.
    train_end = max(n_rows - 2 * valid_rows, 0)
    valid_end = max(n_rows - valid_rows, 0)

    train = df[:train_end]
    valid = df[train_end:valid_end]
    test = df[valid_end:]

    return train, valid, test

In [16]:
# Positional split of X using the default valid_fraction of get_splits.
train, valid, test = get_splits(X)
# Show the training partition's (n_rows, n_columns).
train.shape

(300690, 9)

In [13]:
def train_model(train, valid, test, feature_cols=None):
    """Fit a LightGBM binary classifier and print validation / test AUC.

    Parameters
    ----------
    train, valid, test : DataFrame
        Partitions that each hold an ``'outcome'`` label column alongside the
        feature columns.
    feature_cols : optional
        Columns fed to the model. When omitted, every column of ``train``
        except ``'outcome'`` is used.

    Returns
    -------
    tuple
        ``(model, valid_score)``: the trained booster and the validation AUC.
        The test AUC is printed but not returned.
    """
    if feature_cols is None:
        feature_cols = train.columns.drop(['outcome'])

    booster_params = {'num_leaves': 55, 'objective': 'binary', 'metric': 'auc', 'seed': 7}
    max_rounds = 1000

    train_set = lgb.Dataset(train[feature_cols], label=train['outcome'])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    # Training stops once the validation metric has not improved for 10 rounds.
    # NOTE(review): newer LightGBM releases reportedly replace the
    # `early_stopping_rounds` / `verbose_eval` keywords with callbacks -
    # confirm against the installed version before upgrading.
    booster = lgb.train(
        params=booster_params,
        train_set=train_set,
        num_boost_round=max_rounds,
        valid_sets=[valid_set],
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    def _auc(frame):
        # ROC AUC of the booster's predictions on one partition.
        scores = booster.predict(frame[feature_cols])
        return metrics.roc_auc_score(frame['outcome'], scores)

    valid_score = _auc(valid)
    print(f"Validation AUC Score: {valid_score}")

    test_score = _auc(test)
    print(f"Test AUC Score: {test_score}")

    return booster, valid_score

In [14]:
# Train on every non-label column; the returned (model, validation AUC) tuple
# is displayed as the cell output.
train_model(train, valid, test)

[LightGBM] [Info] Number of positive: 107340, number of negative: 193350
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 300690, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
Validation AUC Score: 0.7471733354101955
Test AUC Score: 0.7467100676831768


(<lightgbm.basic.Booster at 0x23868800088>, 0.7471733354101955)