In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

In [2]:
# Load the Kickstarter campaigns; `launched` is parsed into datetimes on read.
csv_path = 'kickstarter.csv'
data = pd.read_csv(csv_path, parse_dates=['launched'])
data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0


In [3]:
# Confirm that `parse_dates` turned `launched` into a datetime column.
data.launched.dtype

dtype('<M8[ns]')

In [4]:
# Peek at the first few parsed launch timestamps.
data.launched.head(3)

0   2015-08-11 12:12:28
1   2017-09-02 04:43:57
2   2013-01-12 00:20:50
Name: launched, dtype: datetime64[ns]

In [5]:
# Binary target column: 1 where `state` is 'successful', 0 for every other state.
is_successful = data['state'].eq('successful')
data = data.assign(outcome=is_successful.astype(int))

data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0


In [6]:
# Break the parsed `launched` timestamp into separate calendar features:
# hour of day, day of month, month and year, each stored in its own column.
launched = data['launched']
time_parts = {
    'hour': launched.dt.hour,
    'day': launched.dt.day,
    'month': launched.dt.month,
    'year': launched.dt.year,
}
data = data.assign(**time_parts)

data.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013


In [7]:
# Categorical columns that need to be encoded as numbers before modelling.
cat_features = ['category', 'currency', 'country']

In [8]:
# Label-encode every categorical column into integer codes.
# One encoder object is refit per column, so afterwards it holds the classes
# of the last column only.
encoder = LabelEncoder()
enc_cats = pd.DataFrame(
    {col: encoder.fit_transform(data[col]) for col in cat_features},
    index=data.index,
)

In [9]:
# Modelling frame: numeric features and the target, plus the label-encoded categoricals.
base_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
X = data[base_cols].join(enc_cats)
X.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


In [10]:
# Positional split: the last 10% of rows is the test set, the 10% before that
# is the validation set, and everything earlier is the training set.
valid_fraction = 0.1
valid_size = int(len(X) * valid_fraction)

holdout_start, test_start = -2 * valid_size, -valid_size

train = X[:holdout_start]
valid = X[holdout_start:test_start]
test = X[test_start:]

train.shape

(302929, 9)

In [11]:
# Custom function to do the heavy lifting
def get_data_splits(df, valid_fraction=0.1):
    """Split `df` by row position into train, validation and test frames.

    The last `valid_fraction` of the rows becomes the test set, the
    `valid_fraction` of rows before that becomes the validation set, and
    everything earlier is the training set. Row order is preserved; nothing
    is shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    valid_fraction : float, default 0.1
        Share of rows given to each of the validation and test sets.
        Must lie in [0, 0.5] so the two hold-out sets fit inside `df`.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train, valid, test)`.

    Raises
    ------
    ValueError
        If `valid_fraction` is outside [0, 0.5].
    """
    if not 0 <= valid_fraction <= 0.5:
        raise ValueError(
            f'valid_fraction must be between 0 and 0.5, got {valid_fraction}'
        )

    n_rows = len(df)
    valid_size = int(n_rows * valid_fraction)

    # Explicit positive cut points: with negative slices a hold-out size of 0
    # makes `df[:-0]` an empty training set and `df[-0:]` a test set that
    # contains every row.
    train_end = n_rows - 2 * valid_size
    test_start = n_rows - valid_size

    train = df.iloc[:train_end]
    valid = df.iloc[train_end:test_start]
    test = df.iloc[test_start:]

    print(train.shape)

    # Hand the splits back so callers can unpack them.
    return train, valid, test

In [12]:
# train, valid, test = get_data_splits(X)

In [13]:
# Sanity check: confirm what kind of object X is before passing it to the split helper.
type(X)

pandas.core.frame.DataFrame

In [14]:
# Run the split helper on X; it prints the shape of the training split.
get_data_splits(X)


(302929, 9)


# Baseline Model

In [15]:
# Every column except the target is a model input.
feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

# Binary classifier scored by AUC on the validation set.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# Early stopping is configured through the callback API: the
# `early_stopping_rounds` and `verbose_eval` keyword arguments were deprecated
# in LightGBM 3.3 and removed from `lgb.train` in 4.0, where passing them
# raises a TypeError. `verbose=False` keeps the early-stopping messages silent.
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=5, verbose=False)],
)

print(f'THe BEst AUC Score: {bst.best_score}')

[LightGBM] [Info] Number of positive: 107352, number of negative: 195577
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 529
[LightGBM] [Info] Number of data points in the train set: 302929, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354380 -> initscore=-0.599841
[LightGBM] [Info] Start training from score -0.599841
THe BEst AUC Score: defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('auc', 0.7484315670749857)])})


# Ultimate Def: Split, Train, and Evaluate in One Function

In [16]:
# Ultimate def

def the_ultimate(df, valid_fraction=0.1):
    """Split `df`, train a LightGBM binary classifier and print its validation AUC.

    Rows are split by position: the last `valid_fraction` of rows is held out
    (unused here, reserved as a test set), the `valid_fraction` before that is
    the validation set, and everything earlier is the training set. Every
    column except 'outcome' is used as a feature; 'outcome' is the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that also contains the 'outcome' target column.
    valid_fraction : float, default 0.1
        Share of rows given to the validation set (and to the held-out tail).
    """
    n_rows = len(df)
    valid_size = int(n_rows * valid_fraction)

    # Positive cut points avoid the `df[:-0]` pitfall, which would produce an
    # empty training set whenever the hold-out size rounds down to 0.
    train_end = n_rows - 2 * valid_size
    valid_end = n_rows - valid_size

    train = df.iloc[:train_end]
    valid = df.iloc[train_end:valid_end]

    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
    num_round = 1000

    # Early stopping goes through the callback API: the `early_stopping_rounds`
    # and `verbose_eval` keyword arguments were deprecated in LightGBM 3.3 and
    # removed from `lgb.train` in 4.0. `verbose=False` keeps it silent.
    bst = lgb.train(
        param,
        dtrain,
        num_boost_round=num_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=5, verbose=False)],
    )

    print(f'THe Validation AUC Score >> {bst.best_score}')

In [17]:
# Run the label-encoded baseline frame through the end-to-end helper.
the_ultimate(X)

[LightGBM] [Info] Number of positive: 107352, number of negative: 195577
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 529
[LightGBM] [Info] Number of data points in the train set: 302929, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354380 -> initscore=-0.599841
[LightGBM] [Info] Start training from score -0.599841
THe Validation AUC Score >> defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('auc', 0.7484315670749857)])})


# New Categorical Encoding Tricks
 * Count encoding
 * Target encoding
 * CatBoost encoding

In [18]:
# Numeric features plus the target; encoded categorical columns are joined on later.
base_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data_2 = data[base_cols]
data_2.head(3)

Unnamed: 0,goal,hour,day,month,year,outcome
0,1000.0,12,11,8,2015,0
1,30000.0,4,2,9,2017,0
2,45000.0,0,12,1,2013,0


In [19]:
# Features to encode: preview the raw categorical columns.

data[cat_features].head()

Unnamed: 0,category,currency,country
0,Poetry,GBP,GB
1,Narrative Film,USD,US
2,Narrative Film,USD,US
3,Music,USD,US
4,Film & Video,USD,US


In [20]:
# Row and column count of the categorical slice that will be encoded.
data[cat_features].shape

(378661, 3)

In [21]:
# Count encoding: each category value is replaced by a count of its occurrences.

count_enc = ce.CountEncoder()

# Fit and transform the raw categorical columns. The `_count` suffix and the
# join onto the numeric features are applied in the next cell.
# NOTE(review): the encoder is fit on every row, including the rows that later
# form the validation and test splits; consider fitting on training rows only.
raw_cats = data[cat_features]
count_encoded = count_enc.fit_transform(raw_cats)
count_encoded.head()

Unnamed: 0,category,currency,country
0,1369,34132,33672
1,5188,295365,292627
2,5188,295365,292627
3,15727,295365,292627
4,10108,295365,292627


In [22]:
# Attach the count-encoded columns, suffixed with `_count`, to the numeric features.
count_features = count_encoded.add_suffix('_count')
new_X = data_2.join(count_features)
new_X.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category_count,currency_count,country_count
0,1000.0,12,11,8,2015,0,1369,34132,33672
1,30000.0,4,2,9,2017,0,5188,295365,292627
2,45000.0,0,12,1,2013,0,5188,295365,292627
3,5000.0,3,17,3,2012,0,15727,295365,292627
4,19500.0,8,4,7,2015,0,10108,295365,292627


In [23]:
# Train and evaluate the model on the count-encoded features.
the_ultimate(new_X)

[LightGBM] [Info] Number of positive: 107352, number of negative: 195577
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 302929, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354380 -> initscore=-0.599841
[LightGBM] [Info] Start training from score -0.599841
THe Validation AUC Score >> defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('auc', 0.7498353031744174)])})


Count encoding improved the validation AUC from 0.7484 (label-encoded baseline) to 0.7498.

This is only a slight improvement.