In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the Kickstarter projects (both timestamp columns parsed as datetimes),
# discard projects whose state is "live", then derive the modelling columns.
ks = (
    pd.read_csv('../../data/ks-projects-201801.csv', parse_dates=['deadline', 'launched'])
    .query('state != "live"')
    # binary target: 1 when state is 'successful', 0 for every other state
    .assign(outcome=lambda df: df['state'].eq('successful').astype(int))
    # calendar features taken from the launch timestamp
    .assign(hour=lambda df: df['launched'].dt.hour,
            day=lambda df: df['launched'].dt.day,
            month=lambda df: df['launched'].dt.month,
            year=lambda df: df['launched'].dt.year)
)

# Integer-code each categorical column; the single encoder instance is
# re-fit on every column in turn.
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(lambda column: encoder.fit_transform(column))

# Baseline frame: numeric/timestamp features and the target, joined with the
# label-encoded categoricals.
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

In [2]:
# defining functions that will help us test our encodings
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    """Split ``dataframe`` by row position into train, validation and test parts.

    Rows are not shuffled. The final ``valid_fraction`` of the rows form the
    test set, the ``valid_fraction`` of rows just before them form the
    validation set, and all earlier rows form the training set.

    Args:
        dataframe: Object supporting ``len()`` and slicing with ``[start:stop]``
            (for example a pandas DataFrame).
        valid_fraction: Fraction of rows used for the validation set and,
            separately, for the test set. Must lie in ``[0, 0.5]`` so the two
            held-out parts cannot overlap.

    Returns:
        A ``(train, valid, test)`` tuple of slices of ``dataframe``.

    Raises:
        ValueError: If ``valid_fraction`` is outside ``[0, 0.5]``.
    """
    if not 0 <= valid_fraction <= 0.5:
        raise ValueError(
            f'valid_fraction must be between 0 and 0.5, got {valid_fraction!r}'
        )

    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Use explicit non-negative cut points instead of negative offsets: with a
    # held-out size of zero, a slice such as [:-0] would be empty rather than
    # containing every row.
    valid_start = n_rows - 2 * valid_size
    test_start = n_rows - valid_size

    train = dataframe[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe[valid_start:test_start]
    test = dataframe[test_start:]

    return train, valid, test

def train_model(train, valid):
    """Fit a LightGBM binary classifier on ``train`` and report validation AUC.

    Every column of ``train`` except 'outcome' is used as a feature and
    'outcome' is the label. Training runs for at most 1000 boosting rounds,
    with early stopping after 100 rounds monitored on ``valid``. A start
    message and the validation ROC AUC are printed.

    Args:
        train: Frame holding the feature columns and an 'outcome' column.
        valid: Frame with the same columns, used for early stopping and scoring.

    Returns:
        The booster returned by ``lgb.train``.
    """
    predictors = train.columns.drop('outcome')

    train_set = lgb.Dataset(train[predictors], label=train['outcome'])
    valid_set = lgb.Dataset(valid[predictors], label=valid['outcome'])

    params = {
        'num_leaves': 64,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 100,
    }
    print('Training model!')

    # NOTE(review): newer LightGBM releases reportedly replace the
    # early_stopping_rounds / verbose_eval keywords with callbacks - confirm
    # against the installed version before upgrading.
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=100,
        verbose_eval=False,
    )

    predictions = booster.predict(valid[predictors])
    auc = metrics.roc_auc_score(valid['outcome'], predictions)
    print(f'Validation AUC score: {auc:.4f}')

    return booster

In [3]:
# Baseline run: split the label-encoded data by row order, then fit on the
# training part and score on the validation part (the test part is discarded).
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7476


In [4]:
# Count encoding: add one '<column>_count' feature per categorical column
# using category_encoders' CountEncoder.
import category_encoders as ce

cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
# The encoder is fit on every row of ks, i.e. before the train/valid/test split.
count_encoded = count_enc.fit_transform(ks[cat_features])

# Join the count columns onto the baseline features (rows align on the index).
data = baseline_data.join(count_encoded.add_suffix('_count'))

# Train a model on the baseline data plus the count-encoded columns.
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


Training model!
Validation AUC score: 0.7496


In [5]:
# target encoding
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

train.head()
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7506


In [6]:
# CatBoost encoding: add one '<column>_cb' feature per categorical column.
cat_features = ['category', 'currency', 'country']
# The variable name target_enc is reused here for the CatBoost encoder.
target_enc = ce.CatBoostEncoder(cols=cat_features)

# Split first, then fit the encoder on the training rows and their targets only.
train, valid, _ = get_data_splits(data)
target_enc.fit(train[cat_features], train['outcome'])

# NOTE(review): transform() is called on the training rows without the target;
# confirm against the category_encoders docs that this is the intended way to
# encode the training set (as opposed to fit_transform with the target).
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

# Train and score on the frames that now include the CatBoost-encoded columns.
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7501


In [20]:
# Scratch arithmetic that uses no variables from the cells above:
# 10000 multiplied by 1.2 fourteen times.
10000 * 1.2 ** 14

128391.84645488633