In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Import Local funcs
import sys
sys.path.append('../')
from utils.utils import *


# 01 - Drop Some features

In [2]:
X, y = get_training()

In [3]:
# Dummy encode categorical columns
categoricals = X.select_dtypes(include='object').columns
for col in categoricals:
    X[col] = X[col].astype('category').cat.codes


In [4]:
correlated_features = set()
correlation_matrix = X.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.99:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [5]:
# Correlated features to drop
correlated_fts = list(correlated_features)
correlated_fts

['v20b',
 'v278c_r',
 'v263_ISCED_2b',
 'v252_ISCED97',
 'v243_EISCED',
 'v243_edulvlb_2',
 'v252_cs_GB2',
 'v243_ISCED_2b',
 'v262_ISCED_3',
 'v279c_r',
 'v263_8cat',
 'v252_EISCED',
 'v243_ISCED_3',
 'v263_edulvlb_2',
 'v263_EISCED',
 'v262_8cat',
 'v263_ISCED_3',
 'v275c_N1',
 'v252_edulvlb_2',
 'v243_ISCED_1',
 'v252_ISCED_3',
 'v262_ISCED_2b',
 'v243_edulvlb_1',
 'f46_IT',
 'v262_edulvlb_2',
 'v262_EISCED',
 'v243_cs_GB2',
 'v252_ISCED_2b']

In [6]:
# Other features to drop (MANUALLY SELECTED)
other_fts = ['v226', 'v242', 'v261', 'v277', 'v279a', 'v279b', 'v52',
             'v282']

In [7]:
# Aggreagated features to drop
cols_to_drop = correlated_fts + other_fts

In [8]:
# Drop
X_dropped = X.drop(cols_to_drop, axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_dropped, y, test_size = 0.3, random_state = 42)

In [10]:
d_train = lgb.Dataset(X_train, label=y_train)
d_test = lgb.Dataset(X_test, label=y_test, reference=d_train)

In [11]:
# Train Model
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",

    "num_leaves": 27,
    # "max_depth": 5,
    "learning_rate": 0.01, # Retry with 0.05, 0.1, 
    #"n_estimators": 1000,
    #"subsample_for_bin": 200000,
    #"class_weight"='balanced',
    "min_child_samples": 50,
    "reg_alpha": 8.6234,
    "reg_lambda": 1.9843,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.99,
    'bagging_freq': 1, 
    "feature_pre_filter": False,               
    
    "boost_from_average": True,
    "num_class" : 5,
    "verbose": 1,
}

model = lgb.train(
    params,
    d_train,
    num_boost_round=10000,
    valid_sets=d_test,
    callbacks=[lgb.early_stopping(100)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7399
[LightGBM] [Info] Number of data points in the train set: 33600, number of used features: 401
[LightGBM] [Info] Start training from score -4.282397
[LightGBM] [Info] Start training from score -1.579677
[LightGBM] [Info] Start training from score -1.216351
[LightGBM] [Info] Start training from score -1.261657
[LightGBM] [Info] Start training from score -1.606169
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2809]	valid_0's multi_logloss: 0.835274


# Check for CV Score

In [12]:
# CV on whole dataset
dtrain_cv = lgb.Dataset(X_dropped, label=y)

cv_results = lgb.cv(
    params,
    dtrain_cv,
    num_boost_round=10000,
    nfold=3,
    stratified=False,
    #categorical_feature = indexes_of_categories,
    callbacks=[lgb.early_stopping(50)],
)

print(f'CV: Multi Log Loss: {cv_results["multi_logloss-mean"][-1] :.5f}')
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-stdv"][-1] :.5f}')

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7460
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 401
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7460
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 401
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7460
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 401
[LightGBM] [Info] Start training from score -4.264244
[LightGBM] [Info] Start training from score -1.566917
[LightGBM] [Info] Start training from score -1.219510
[LightGBM] [Info] Start training from score -1.267068
[LightGBM] [Info] Start training f

Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fe632780af0>
Traceback (most recent call last):
  File "/Users/zaza/miniconda3/lib/python3.10/site-packages/lightgbm/basic.py", line 91, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


# Encode the categories as categorical in LGBM

In [2]:
X, y = get_training()

In [3]:
categoricals = X.select_dtypes('object').columns
for col in categoricals:
    X[col] = X[col].astype('category').cat.codes
indexes_of_categories = [X.columns.get_loc(col) for col in categoricals]

# Get column names that strat with 'v'
v_cols = [col for col in X.columns if col.startswith('v')]

In [4]:
indexes_of_categories_v = [X.columns.get_loc(col) for col in v_cols]

In [5]:
all_cats_indexes = list(set(indexes_of_categories + indexes_of_categories_v))
len(all_cats_indexes)

418

In [6]:
# CV on whole dataset

params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",

    "num_leaves": 10,
    # "max_depth": 5,
    "learning_rate": 0.01,
    #"n_estimators": 1000,
    #"subsample_for_bin": 200000,
    #"class_weight"='balanced',
    #"min_child_samples": 20,
    #"reg_alpha": 0.1,
    #"reg_lambda": 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,                
    
    "boost_from_average": True,
    "num_class" : 5,
    "verbose": 1,
}

dtrain_cv = lgb.Dataset(X, label=y)

cv_results = lgb.cv(
    params,
    dtrain_cv,
    num_boost_round=10000,
    nfold=3,
    stratified=False,
    categorical_feature = all_cats_indexes,
    callbacks=[lgb.early_stopping(100)],
)

print("\n")
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-mean"][-1] :.5f}')
print(f'CV: Multi Log Loss STD: {cv_results["multi_logloss-stdv"][-1] :.5f}')

New categorical_feature is [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 140, 141, 143, 144, 145, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,

e value in categorical features, will convert it to NaN

 Met negative value in categorical features, will convert it to NaN
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10526
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 428
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10526
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 428
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10526
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 428
[LightGBM] [Info] Start training from score -4.264244
[LightGBM] [Info] Start training from score -1.566917
[LightGBM] [In

# Submision


# Stack

In [None]:
import joblib


In [None]:
joblib.dump(final_model, 'LGBM_model.pkl')

['LGBM_model.pkl']