In [74]:
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import scipy as sc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder  
from tqdm import tqdm
import scipy as sc
import scipy.stats as ss

# data directory
DATA_DIR = os.path.join('../..', 'data')
data_paths = {'A': {'train': os.path.join(DATA_DIR, 'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C_hhold_test.csv')}}

# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [64]:
def prepare_data(x, y, test_size=0.2):
    objs = x.select_dtypes(include=['O'])
    objs = objs.columns.tolist()
    
    for col in objs:
        le = LabelEncoder()
        x[col] = le.fit_transform(list(x[col]))
                                  
    if test_size == 0:
        dtrain = x
        Y_train = y
        dtest = None
        Y_test = None
    else:
        dtrain, dtest, Y_train, Y_test = train_test_split(x, y, test_size=test_size, stratify=y, random_state=42)
    return dtrain, dtest, Y_train, Y_test


# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    # yt: groundtruth
    # yp: predicted
    ground = np.array(yt)
    pred = yp.astype(float)
    eps_pred = np.maximum(np.minimum(pred, 1. - 1e-15), 1e-15)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

# Cross Validate
def cross_validate(x_train, x_test, y_train, y_test, model):
    test_loss = None
    if x_test is not None:
        preds = model.predict(x_test)
        test_loss = log_loss(preds, y_test)

    preds_train = model.predict(x_train, num_iteration=model.best_iteration)
    train_loss = log_loss(preds_train, y_train)
    return train_loss, test_loss

# Lets filter out the columns with high correlation
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

def get_highly_correlated_columns(df):
    df_objs = df.select_dtypes(include=['O'])
    #del df_objs['country']
    corr_matrix = pd.DataFrame(columns=df_objs.columns, index=df_objs.columns.tolist())
    with tqdm(total=len(corr_matrix.columns.tolist()) * len(corr_matrix.columns.tolist())) as pbar:
        for col1 in df_objs.columns.tolist():
            for col2 in df_objs.columns.tolist():
                if col1 != col2:
                    confusion_matrix = pd.crosstab(df_objs[col1], df_objs[col2])
                    corr = cramers_corrected_stat(confusion_matrix)
                else:
                    corr = 1
                corr_matrix.loc[col1, col2] = corr
                pbar.update(1)
    
    cols = {}
    for c1 in corr_matrix.columns.tolist():
        s = corr_matrix.loc[c1]
        s = s[s > 0.5]
        s = list(s.index)
        s.remove(c1)
        cols[c1] = s

    cols = {k: cols[k] for k in cols if cols[k]}
    return cols

In [75]:
a_train.poor.fillna(False, inplace=True)
ay_train = np.ravel(a_train.poor.astype(int))
aX_train = a_train.drop(['poor', 'country'], axis=1)

b_train.poor.fillna(False, inplace=True)
by_train = np.ravel(b_train.poor.astype(int))
bX_train = b_train.drop(['poor', 'country'], axis=1)

c_train.poor.fillna(False, inplace=True)
cy_train = np.ravel(c_train.poor.astype(int))
cX_train = c_train.drop(['poor', 'country'], axis=1)

In [66]:
a_corr_cols = get_highly_correlated_columns(aX_train)
b_corr_cols = get_highly_correlated_columns(bX_train)
c_corr_cols = get_highly_correlated_columns(cX_train)

100%|██████████| 114921/114921 [10:48<00:00, 177.29it/s]
100%|██████████| 173889/173889 [15:26<00:00, 187.61it/s]
100%|██████████| 17424/17424 [01:34<00:00, 183.99it/s]


In [79]:
# Lets delete the highly correlated columns in a greedy manner
def delete_columns(df, dict_cols):
    to_delete = list(set([elem for values in dict_cols.values() for elem in values]))
    return df.drop(to_delete, axis=1)


a_train_small = delete_columns(aX_train, a_corr_cols)
b_train_small = delete_columns(bX_train, b_corr_cols)
c_train_small = delete_columns(cX_train, c_corr_cols)


print("Columns removed from A: {}".format(len(aX_train.columns) - len(a_train_small.columns)))
print("Columns removed from B: {}".format(len(bX_train.columns) - len(b_train_small.columns)))
print("Columns removed from C: {}".format(len(cX_train.columns) - len(c_train_small.columns)))

Columns removed from A: 103
Columns removed from B: 183
Columns removed from C: 75


In [80]:
test_size = 0.3
a_train, a_test, ay_train, ay_test = prepare_data(a_train_small, ay_train, test_size=test_size)
b_train, b_test, by_train, by_test = prepare_data(b_train_small, by_train, test_size=test_size)
c_train, c_test, cy_train, cy_test = prepare_data(c_train_small, cy_train, test_size=test_size)

test_size = 0.5
a_val, a_test, ay_val, ay_test = prepare_data(a_test, ay_test, test_size=test_size)
b_val, b_test, by_val, by_test = prepare_data(b_test, by_test, test_size=test_size)
c_val, c_test, cy_val, cy_test = prepare_data(c_test, cy_test, test_size=test_size)

In [84]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(a_train, ay_train)
lgb_eval = lgb.Dataset(a_val, ay_val, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': {'logloss'},#, 'l2'},
    'num_leaves': 500,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

categ = a_train_small.select_dtypes(include=['O'])
categ = categ.columns.tolist()

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=3000,
                valid_sets=lgb_eval,
                #early_stopping_rounds=100,
                #feature_name=a_train.columns.tolist(),
                categorical_feature=categ)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict
y_pred = gbm.predict(a_test, num_iteration=gbm.best_iteration)
y_pred_tr = gbm.predict(a_train, num_iteration=gbm.best_iteration)

# eval
print('The rmse of prediction is:', mean_squared_error(ay_test, y_pred) ** 0.5)
train_loss, test_loss = cross_validate(a_train, a_test, ay_train, ay_test, gbm)
print("A Loss. Train: {} - Test: {}".format(train_loss, test_loss))

Start training...




Save model...
Start predicting...
The rmse of prediction is: 0.3231831485298325
A Loss. Train: 3.550185593369918 - Test: 7.818836821953888
