In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Features and target are stored in separate CSVs; the first column of each is the row index.
X_raw = pd.read_csv('../data/X_train.csv', index_col=0)
# Recode the -1 label as 0.
y = pd.read_csv('../data/y_train.csv', index_col=0).replace(-1, 0)

In [16]:
# Replace every object-dtype column with the integer codes of its category levels.
# The column list is kept in `categoricals` for later cells.
categoricals = X_raw.select_dtypes(include='object').columns
for name in categoricals:
    X_raw[name] = pd.Categorical(X_raw[name]).codes


In [17]:
# Drop the list of correlated features (names grouped by column prefix)
corr_fts = [
    'f46_IT', 'v20b',
    'v243_EISCED', 'v243_ISCED_1', 'v243_ISCED_2b', 'v243_ISCED_3',
    'v243_cs_GB2', 'v243_edulvlb_1', 'v243_edulvlb_2',
    'v252_EISCED', 'v252_ISCED97', 'v252_ISCED_2b', 'v252_ISCED_3',
    'v252_cs_GB2', 'v252_edulvlb_2',
    'v262_8cat', 'v262_EISCED', 'v262_ISCED_2b', 'v262_ISCED_3', 'v262_edulvlb_2',
    'v263_8cat', 'v263_EISCED', 'v263_ISCED_2b', 'v263_ISCED_3', 'v263_edulvlb_2',
    'v275c_N1', 'v278c_r', 'v279c_r',
]

# Returns a new frame; X_raw itself keeps all of its columns.
X_raw_dropped = X_raw.drop(columns=corr_fts)

In [18]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw_dropped,
    y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Wrap the training and hold-out splits in LightGBM Dataset objects.
d_train, d_test = (
    lgb.Dataset(features, label=target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [20]:
# Train Model
# Five-class gradient-boosted trees, evaluated with multiclass log-loss.
params = dict(
    num_leaves=10,
    metric="multi_logloss",
    boosting_type="gbdt",
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    learning_rate=0.01,
    objective="multiclass",
    boost_from_average=True,
    early_stopping_round=50,
    num_class=5,
)

# 10000 is only an upper bound on the number of rounds: training stops once the
# hold-out log-loss has not improved for 50 consecutive rounds.
model = lgb.train(
    params=params,
    train_set=d_train,
    num_boost_round=10000,
    valid_sets=d_test,
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8139
[LightGBM] [Info] Number of data points in the train set: 33600, number of used features: 400
[LightGBM] [Info] Start training from score -4.282397
[LightGBM] [Info] Start training from score -1.579677
[LightGBM] [Info] Start training from score -1.216351
[LightGBM] [Info] Start training from score -1.261657
[LightGBM] [Info] Start training from score -1.606169
[1]	valid_0's multi_logloss: 1.41899
Training until validation scores don't improve for 50 rounds
[2]	valid_0's multi_logloss: 1.41022
[3]	valid_0's multi_logloss: 1.40213
[4]	valid_0's multi_logloss: 1.39415
[5]	valid_0's multi_logloss: 1.38665
[6]	valid_0's multi_logloss: 1.37939
[7]	valid_0's multi_logloss: 1.37173
[8]	valid_0's multi_logloss: 1.36436
[9]	valid_0's multi_logloss: 1.35721
[10]	valid_0's multi_logloss: 1.35016
[11]	valid_0's multi_logloss: 1.34345
[12]	valid_0's multi_logloss: 1.3368
[13]	valid_0's multi_logloss: 1.33047

# Submission


In [22]:
# Train on whole dataset
# Same hyper-parameters as the validated model, but without early stopping
# (no hold-out set is used here); the number of trees is fixed instead.
params_whole = {
    "num_leaves": 10,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    "learning_rate": 0.01,
    "objective": "multiclass",
    "boost_from_average": True,
    # NOTE(review): presumably the best iteration found by the early-stopped run — confirm.
    'n_estimators': 2225,
    "num_class" : 5
}

final_model = lgb.LGBMClassifier(**params_whole)
# y is a single-column DataFrame; the scikit-learn API expects 1-D labels and
# emits a DataConversionWarning for a column vector, so flatten it explicitly.
final_model.fit(X_raw_dropped, y.values.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [23]:
# Load Test Dataset
X_submit = pd.read_csv('../data/X_test.csv', index_col=0)

# Convert categoricals to numericals.
# The integer codes must mean the same thing as in the training data. Encoding
# the test columns on their own would number the levels by whatever values
# happen to occur in the test file, so a missing or extra level would silently
# shift every code after it. X_raw no longer holds the original strings (it was
# overwritten with codes), so the training levels are re-read from disk.
train_categoricals = pd.read_csv('../data/X_train.csv', index_col=0)[categoricals]
for col in categoricals:
    train_levels = train_categoricals[col].astype('category').cat.categories
    # Values not seen in training (and missing values) get code -1.
    X_submit[col] = pd.Categorical(X_submit[col], categories=train_levels).codes

# Drop the list of correlated features
X_submit_dropped = X_submit.drop(corr_fts, axis=1)

In [28]:
# Class-probability matrix for the submission rows: one row per sample, one column per class.
predictions = final_model.predict_proba(X=X_submit_dropped)

In [30]:
# Label the probability columns and name the index 'id' for the submission file.
# NOTE(review): assumes the model's class order matches this label order — confirm.
preds_df = pd.DataFrame(
    predictions,
    columns=[
        'no answer',
        'very important',
        'quite important',
        'not important',
        'not at all important',
    ],
).rename_axis('id')
print(preds_df.shape)
preds_df.head()

(11438, 5)


Unnamed: 0_level_0,no answer,very important,quite important,not important,not at all important
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.001043,0.798278,0.119668,0.070115,0.010897
1,0.006289,0.086172,0.808093,0.093346,0.0061
2,0.008374,0.158345,0.51533,0.296341,0.02161
3,0.016722,0.093385,0.680334,0.170292,0.039267
4,0.004495,0.29844,0.625641,0.063032,0.008391


In [31]:
# Write the submission file. Create the output folder first so the cell also
# works on a fresh checkout where ../data/predictions does not exist yet.
output_path = '../data/predictions/predictions3.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
preds_df.to_csv(output_path)