In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# Import Local funcs
import sys
sys.path.append('../')
from utils.utils import *

# Model 2

In [2]:
X, y = get_training()
obj_col = X.select_dtypes(include=['object']).columns
obj_col_idx = [X.columns.get_loc(c) for c in obj_col]
# Dummy encode the object type data
for col in obj_col:
    X[col] = X[col].astype('category').cat.codes

dtrain = lgb.Dataset(X, label=y)

In [5]:
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",

    "num_leaves": 31,
    # "max_depth": 5,
    #"learning_rate": 0.01, # Retry with 0.05, 0.1, 
    #"n_estimators": 1000,
    #"subsample_for_bin": 200000,
    #"class_weight"='balanced',
    "min_child_samples": 20,
    "reg_alpha": 8.82,
    "reg_lambda": 6.67,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.92,
    'bagging_freq': 2, 
    "feature_pre_filter": False,               
    
    "boost_from_average": True,
    "num_class" : 5,
    "verbose": 1,
}

cv_results = lgb.cv(
    params,
    dtrain,
    num_boost_round=10000,
    categorical_feature=obj_col_idx,
    nfold=5,
    stratified=False, # try with true
    callbacks=[lgb.early_stopping(100),
               lgb.reset_parameter(learning_rate =  [0.05]*10 + [0.03]*90+ [0.01]*9900)],
               seed = 42,
)

New categorical_feature is [4, 290, 294, 297, 340, 420, 421, 434]


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9282
[LightGBM] [Info] Number of data points in the train set: 38400, number of used features: 437
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9282
[LightGBM] [Info] Number of data points in the train set: 38400, number of used features: 437
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9282
[LightGBM] [Info] Number of data points in the train set: 38400, number of used features: 437
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9282
[LightGBM] [Info] Number of data points in the train set: 38400, number of used features: 437


In [6]:
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-mean"][-1] :.5f}')
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-stdv"][-1] :.5f}')
print(f'Best num_boost_rounds: {cv_results["multi_logloss-mean"]}')

CV: Multi Log Loss: 0.83553
CV: Multi Log Loss: 0.01094
Best num_boost_rounds: [1.389816220234008, 1.3553096574858592, 1.3210118085641807, 1.2924126023295848, 1.2664056240660617, 1.2420193700241984, 1.2190976138383687, 1.197335539264492, 1.1771361993789853, 1.1597749082909528, 1.1496549730910046, 1.1384164711245919, 1.1290637975014153, 1.1197855713746168, 1.1105434675013355, 1.102414526636417, 1.0951201933599393, 1.0875670971268234, 1.0805565674831112, 1.0737696115943638, 1.0677560036017764, 1.0612856486665838, 1.055491889331177, 1.0489431047465854, 1.0427284546680868, 1.0371601902040568, 1.0312106110661128, 1.0258637597946962, 1.020750439671581, 1.0155386595378952, 1.010250444453435, 1.0057176619808934, 1.0013776728234034, 0.9969096167554234, 0.9927174942118049, 0.9881955451213242, 0.9843180597151505, 0.9806139362239582, 0.9766275360088998, 0.973688343871704, 0.9705530849287325, 0.9675436310822523, 0.96434000443716, 0.9609394611038364, 0.9582420323762971, 0.9556919307166243, 0.9527883

# Model 3

In [3]:
correlated_features = set()
correlation_matrix = X.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.99:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [4]:
X_dropped = X.drop(list(correlated_features), axis=1)
d_train_dropped = lgb.Dataset(X_dropped, label=y)

In [26]:
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",

    "num_leaves": 27,
    # "max_depth": 5,
    "learning_rate": 0.01, # Retry with 0.05, 0.1, 
    #"n_estimators": 1000,
    #"subsample_for_bin": 200000,
    #"class_weight"='balanced',
    "min_child_samples": 50,
    "reg_alpha": 8.5,
    "reg_lambda": 2,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.92,
    'bagging_freq': 1, 
    "feature_pre_filter": False,               
    
    "boost_from_average": True,
    "num_class" : 5,
    "verbose": 1,
}

cv_results = lgb.cv(
    params,
    d_train_dropped,
    num_boost_round=10000,
    #categorical_feature=obj_col_idx,
    nfold=3,
    stratified=False, # try with true
    callbacks=[lgb.early_stopping(100),],
               #lgb.reset_parameter(learning_rate =  [0.05]*30 + [0.03]*470+ [0.01]*9500)],
               seed = 42,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8219
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 409
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8219
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 409
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8219
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 409
[LightGBM] [Info] Start training from score -4.300447
[LightGBM] [Info] Start training from score -1.580183
[LightGBM] [Info] Start training from score -1.213602
[LightGBM] [Info] Start training from score -1.263524
[LightGBM] [Info] Start training f

In [7]:
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-mean"][-1] :.5f}')
print(f'CV: Multi Log Loss: {cv_results["multi_logloss-stdv"][-1] :.5f}')
print(f'Best num_boost_rounds: {len(cv_results["multi_logloss-mean"])}')

CV: Multi Log Loss: 0.83227
CV: Multi Log Loss: 0.00552
Best num_boost_rounds: 1570


In [8]:
# Submission

In [10]:
final_model = lgb.LGBMClassifier(**params, n_estimators=1570)


In [11]:
X_full,y_full = get_training()
obj_col_full = X_full.select_dtypes(include=['object']).columns

for col in obj_col_full:
    X_full[col] = X_full[col].astype('category').cat.codes
X_full_dropped = X_full.drop(list(correlated_features), axis=1)



In [18]:
final_model.fit(X_full_dropped, y_full)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8219
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 409
[LightGBM] [Info] Start training from score -4.274668
[LightGBM] [Info] Start training from score -1.566917
[LightGBM] [Info] Start training from score -1.216200
[LightGBM] [Info] Start training from score -1.270291
[LightGBM] [Info] Start training from score -1.607981


In [20]:
X_test = get_test()
for col in obj_col_full:
    X_test[col] = X_test[col].astype('category').cat.codes
X_test_dropped = X_test.drop(list(correlated_features), axis=1)
preds_df = get_predictions(X_test_dropped,final_model, proba=True)

In [23]:
preds_df.to_csv(f"../data/predictions/preds7_FineTune_ObjEncoded.csv")

In [24]:
final_model.booster_.save_model(f"model_bin/model6_FineTune_ObjEncoded.txt")

<lightgbm.basic.Booster at 0x7fb9a0fcb130>

In [36]:
no_ans_idx = preds_df[preds_df["no answer"] > 0.6].index

In [39]:
no_ans_df = X.iloc[no_ans_idx]

In [45]:
no_ans_df

Unnamed: 0_level_0,year,fw_start,fw_end,country,c_abrv,v1,v2,v3,v4,v5,...,v278b,v278c_r,v279a,v279b,v279c_r,v279d_r,v280,v281a,v281a_r,v282
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
791,2018,201808,201812,578,25,1,1,2,2,2,...,1,12.01,13,33,13.33,92,1,24,200,1018
1119,2019,201812,201903,807,23,1,1,1,1,3,...,15,17.15,18,3,18.03,48,2,22,279,38
2320,2017,201709,201712,705,31,1,1,2,2,2,...,21,15.21,16,8,16.08,47,2,30,416,38
2675,2018,201801,201803,268,16,1,1,2,3,3,...,45,12.45,13,49,13.49,64,2,19,157,151
3189,2017,201709,201710,528,24,3,1,1,1,2,...,23,11.23,11,46,11.46,23,1,23,120,-4
3250,2017,201710,201802,191,17,3,1,1,2,2,...,48,18.48,19,26,19.26,38,2,14,90,15456
3555,2018,201809,201901,380,20,1,1,2,2,1,...,47,12.47,13,57,13.57,70,1,18,207,573576
4217,2018,201802,201807,826,15,2,1,1,2,3,...,5,14.05,14,46,14.46,41,2,8,128,3890
4471,2018,201801,201803,268,16,1,1,1,2,2,...,13,12.13,13,21,13.21,68,2,19,157,161
5828,2021,202105,202110,428,22,1,1,2,4,4,...,47,11.47,12,50,12.5,63,1,28,380,4560


In [49]:
# all data except no answer idx
ans_df = X.drop(no_ans_idx)


In [54]:
# Compare v5 variables between the two dfs
print(no_ans_df["v63"].value_counts())

print(ans_df["v63"].value_counts())



v63
10    9
5     4
1     4
4     2
7     2
3     2
2     1
8     1
Name: count, dtype: int64
v63
 10    11731
 1      8390
 5      4786
 8      4785
 7      4087
 6      3196
 9      3046
 2      2464
 3      2425
 4      1820
-1       773
-2       472
Name: count, dtype: int64
