# Libraries

In [1]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
train_set = pd.read_csv("../input/tabular-playground-series-jun-2021/train.csv")
test_set = pd.read_csv("../input/tabular-playground-series-jun-2021/test.csv")

train = train_set.copy()
test = test_set.copy()

train.drop("id",axis=1,inplace=True)
test.drop("id",axis=1,inplace=True)

train["sum"] = train.sum(axis=1)
test["sum"] = test.sum(axis=1)

X = np.array(train.drop("target",axis=1))
y = np.array(train["target"])
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

# Defining Model

In [3]:
def objective(trial,X,y):
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=17)
    params = {
        'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1 , 100),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda' , 500 , 1000),
        'num_leaves' : trial.suggest_int('num_leaves' , 90 , 150), 
        'learning_rate' : trial.suggest_float('learning_rate' , 0.01 , 0.5),
        'max_depth' : trial.suggest_int('max_depth' , 2 , 5),               
        'n_estimators' : trial.suggest_int('n_estimators' , 1 ,50000),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 10),
        'min_child_weight' : trial.suggest_loguniform('min_child_weight' , 1e-3 , 2),
        'subsample' : trial.suggest_float('subsample' , 0.008 , 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 0.01 , 0.3)
    }
    lgb = LGBMClassifier(**params)  
    lgb.fit(X_train,y_train,eval_set=[(X_val,y_val)],eval_metric='multi_logloss',early_stopping_rounds=50, verbose=False)
        
    y_pred = lgb.predict_proba(X_val)
 
    log_loss_ = log_loss(y_val, y_pred)
    
    return log_loss_

# Optuna

In [4]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X,y), n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-06-02 18:34:58,849][0m A new study created in memory with name: no-name-42b317a3-1ee5-4a72-b90d-c0f8dc4e1e36[0m
[32m[I 2021-06-02 18:35:23,409][0m Trial 0 finished with value: 1.7430048147978034 and parameters: {'reg_alpha': 29.956194265766857, 'reg_lambda': 943.8232016182603, 'num_leaves': 142, 'learning_rate': 0.36479818492146915, 'max_depth': 5, 'n_estimators': 13856, 'min_child_samples': 10, 'min_child_weight': 0.5437237107924228, 'subsample': 0.5047597067332052, 'colsample_bytree': 0.15291587793848427}. Best is trial 0 with value: 1.7430048147978034.[0m
[32m[I 2021-06-02 18:35:58,215][0m Trial 1 finished with value: 1.7428133692169856 and parameters: {'reg_alpha': 3.059330902415478, 'reg_lambda': 597.5602343386747, 'num_leaves': 130, 'learning_rate': 0.20635023151564869, 'max_depth': 2, 'n_estimators': 23159, 'min_child_samples': 1, 'min_child_weight': 0.0296888506460925, 'subsample': 0.3250969033933623, 'colsample_bytree': 0.0624182435053279}. Best is trial 1 

Number of finished trials: 30
Best trial: {'reg_alpha': 14.066386297492933, 'reg_lambda': 782.6513660266065, 'num_leaves': 110, 'learning_rate': 0.08091591904311343, 'max_depth': 5, 'n_estimators': 22091, 'min_child_samples': 2, 'min_child_weight': 0.014167803972043104, 'subsample': 0.9981917429044114, 'colsample_bytree': 0.08909643303979979}


In [5]:
best_params = study.best_trial.params
best_params

{'reg_alpha': 14.066386297492933,
 'reg_lambda': 782.6513660266065,
 'num_leaves': 110,
 'learning_rate': 0.08091591904311343,
 'max_depth': 5,
 'n_estimators': 22091,
 'min_child_samples': 2,
 'min_child_weight': 0.014167803972043104,
 'subsample': 0.9981917429044114,
 'colsample_bytree': 0.08909643303979979}

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=17)

In [7]:
lgb = LGBMClassifier(**best_params)
lgb.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.08909643303979979,
               learning_rate=0.08091591904311343, max_depth=5,
               min_child_samples=2, min_child_weight=0.014167803972043104,
               n_estimators=22091, num_leaves=110, reg_alpha=14.066386297492933,
               reg_lambda=782.6513660266065, subsample=0.9981917429044114)

In [8]:
y_pred = lgb.predict_proba(X_val)
loss = log_loss(y_val, y_pred)
loss

1.7459418111813605

In [9]:
y_test = lgb.predict_proba(test)
y_test

array([[0.07170752, 0.40732525, 0.15684798, ..., 0.02112199, 0.03381467,
        0.11468468],
       [0.03918568, 0.08120554, 0.06402575, ..., 0.09727472, 0.34078973,
        0.11627944],
       [0.01963153, 0.03393188, 0.02818354, ..., 0.02797627, 0.09937416,
        0.06457668],
       ...,
       [0.08806375, 0.29096711, 0.1321415 , ..., 0.0367138 , 0.10852454,
        0.13111658],
       [0.04325971, 0.01918469, 0.01363984, ..., 0.07392011, 0.36354676,
        0.10702745],
       [0.05210888, 0.05773908, 0.06166756, ..., 0.09269002, 0.2630184 ,
        0.15562036]])

In [10]:
result = pd.DataFrame(index=test_set["id"], data={"Class_1": y_test[:,0],"Class_2": y_test[:,1],"Class_3": y_test[:,2],"Class_4": y_test[:,3],"Class_5": y_test[:,4],
                                                 "Class_6": y_test[:,5],"Class_7": y_test[:,6],"Class_8": y_test[:,7],"Class_9": y_test[:,8]})
result.head()

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
200000,0.071708,0.407325,0.156848,0.028704,0.01309,0.152704,0.021122,0.033815,0.114685
200001,0.039186,0.081206,0.064026,0.011419,0.01082,0.239,0.097275,0.34079,0.116279
200002,0.019632,0.033932,0.028184,0.010671,0.005926,0.709729,0.027976,0.099374,0.064577
200003,0.046524,0.135671,0.082438,0.056164,0.016266,0.190736,0.08279,0.205534,0.183877
200004,0.04619,0.10818,0.093111,0.032607,0.015512,0.217985,0.06199,0.282162,0.142263


In [11]:
sub = result.to_csv("sub.csv")