In [1]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
train_set = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")
test_set = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")

train = train_set.copy()
test = test_set.copy()

train.drop("id",axis=1,inplace=True)
test.drop("id",axis=1,inplace=True)

train["sum"] = train.sum(axis=1)
test["sum"] = test.sum(axis=1)

X = np.array(train.drop("target",axis=1))
y = np.array(train["target"])

In [3]:
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

In [4]:
def objective(trial,X,y):
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=17)
    params = {
        'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1 , 100),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda' , 500 , 1000),
        'num_leaves' : trial.suggest_int('num_leaves' , 90 , 150), 
        'learning_rate' : trial.suggest_float('learning_rate' , 0.01 , 0.5),
        'max_depth' : trial.suggest_int('max_depth' , 2 , 5),               
        'n_estimators' : trial.suggest_int('n_estimators' , 1 ,50000),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 10),
        'min_child_weight' : trial.suggest_loguniform('min_child_weight' , 1e-3 , 2),
        'subsample' : trial.suggest_float('subsample' , 0.008 , 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 0.01 , 0.3)
    }
    lgb = LGBMClassifier(**params)  
    lgb.fit(X_train,y_train,eval_set=[(X_val,y_val)],eval_metric='multi_logloss',early_stopping_rounds=50, verbose=False)
        
    y_pred = lgb.predict_proba(X_val)
 
    log_loss_ = log_loss(y_val, y_pred)
    
    return log_loss_

In [5]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X,y), n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-05-12 21:35:54,990][0m A new study created in memory with name: no-name-ceeabfd0-f61c-4108-a491-38392bad3438[0m
[32m[I 2021-05-12 21:36:06,768][0m Trial 0 finished with value: 1.090273427219918 and parameters: {'reg_alpha': 6.245349642756345, 'reg_lambda': 713.589850670399, 'num_leaves': 146, 'learning_rate': 0.1916080179774014, 'max_depth': 4, 'n_estimators': 44545, 'min_child_samples': 3, 'min_child_weight': 0.33289209305073425, 'subsample': 0.5626551307256296, 'colsample_bytree': 0.24794875848531597}. Best is trial 0 with value: 1.090273427219918.[0m
[32m[I 2021-05-12 21:36:16,272][0m Trial 1 finished with value: 1.0897621424210153 and parameters: {'reg_alpha': 4.285653574168257, 'reg_lambda': 662.9164028707924, 'num_leaves': 92, 'learning_rate': 0.2321229218036525, 'max_depth': 4, 'n_estimators': 17800, 'min_child_samples': 7, 'min_child_weight': 0.2913343674549357, 'subsample': 0.38013668674821327, 'colsample_bytree': 0.14545385614962242}. Best is trial 1 with 

Number of finished trials: 30
Best trial: {'reg_alpha': 13.01086666100246, 'reg_lambda': 571.7035989396395, 'num_leaves': 99, 'learning_rate': 0.056298045256971074, 'max_depth': 2, 'n_estimators': 21144, 'min_child_samples': 4, 'min_child_weight': 0.006073248472791031, 'subsample': 0.751670734417424, 'colsample_bytree': 0.0771162423618313}


In [6]:
best_params = study.best_trial.params
best_params

{'reg_alpha': 13.01086666100246,
 'reg_lambda': 571.7035989396395,
 'num_leaves': 99,
 'learning_rate': 0.056298045256971074,
 'max_depth': 2,
 'n_estimators': 21144,
 'min_child_samples': 4,
 'min_child_weight': 0.006073248472791031,
 'subsample': 0.751670734417424,
 'colsample_bytree': 0.0771162423618313}

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=21)

In [8]:
lgb = LGBMClassifier(**best_params)
lgb.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.0771162423618313,
               learning_rate=0.056298045256971074, max_depth=2,
               min_child_samples=4, min_child_weight=0.006073248472791031,
               n_estimators=21144, num_leaves=99, reg_alpha=13.01086666100246,
               reg_lambda=571.7035989396395, subsample=0.751670734417424)

In [9]:
y_pred = lgb.predict_proba(X_val)
loss = log_loss(y_val, y_pred)
loss

1.090413456455932

In [10]:
y_test = lgb.predict_proba(test)
y_test

array([[0.09247862, 0.61992633, 0.1632069 , 0.12438814],
       [0.08322425, 0.70464945, 0.12515889, 0.0869674 ],
       [0.08177066, 0.67585777, 0.17069155, 0.07168002],
       ...,
       [0.08092375, 0.52690999, 0.25121818, 0.14094808],
       [0.08722782, 0.61477194, 0.16006077, 0.13793947],
       [0.08421565, 0.57416195, 0.20019633, 0.14142607]])

In [11]:
result = pd.DataFrame(index=test_set["id"], data={"Class_1": y_test[:,0],"Class_2": y_test[:,1],"Class_3": y_test[:,2],"Class_4": y_test[:,3]})
result.head(10)

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100000,0.092479,0.619926,0.163207,0.124388
100001,0.083224,0.704649,0.125159,0.086967
100002,0.081771,0.675858,0.170692,0.07168
100003,0.065461,0.5606,0.30577,0.068168
100004,0.066181,0.654118,0.16993,0.109771
100005,0.087108,0.392155,0.398381,0.122356
100006,0.095958,0.346757,0.433673,0.123612
100007,0.10308,0.729298,0.105451,0.062171
100008,0.092141,0.553606,0.232484,0.121769
100009,0.084497,0.564369,0.205189,0.145945


In [12]:
sub = result.to_csv("submission_.csv")