In [None]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
import dill
import optuna

In [None]:
import sys
sys.path.append("../..")

In [None]:
from src.utils.transforms import rename_columns

In [None]:
# Load the training data from the project's CSV export (path is relative to this notebook).
train = pd.read_csv('../../data/csv/train.csv')

In [None]:
# Load the test data. NOTE(review): apart from the column renaming below, `test` is not
# used in the cells visible here — confirm whether it is still needed.
test = pd.read_csv('../../data/csv/test.csv')

In [None]:
# Apply the project helper `rename_columns` (src.utils.transforms). Presumably it maps the raw
# CSV headers to the names used in `feats_cat` and `target` below — TODO confirm.
train = rename_columns(train)

In [None]:
# Same helper applied to the test frame.
test = rename_columns(test)

In [None]:
# Show the column names after renaming.
train.columns.tolist()

In [None]:
# Columns treated as categorical: each one is ordinal-encoded below and its position is
# passed to LightGBM as a categorical column inside base_cv.
feats_cat = [
    'service',
    'priority',
    'status',
    'group',
    'type_query',
    'criticality',
    'impact',
    'system',
    'place'
]

In [None]:
# Every model input is categorical here. NOTE: this binds the same list object as
# `feats_cat` (no copy), so mutating either name changes both.
train_features = feats_cat

In [None]:
# Write the feature lists to the project config file as indented JSON.
features = {
    "all_feat": train_features,
    "feats_numeric": [],
    "feats_cat": feats_cat
}
with open("../../configs/features.json", "w") as stream:
    json.dump(features, stream, indent=4)

In [None]:
# Name of the label column; base_cv uses it both to stratify the folds and as the training target.
target = "type_reclassification"

In [None]:
def metric(*args):
    """Macro-averaged F1: forwards its positional arguments to sklearn's ``f1_score``."""
    return f1_score(*args, average="macro")

In [None]:
# Turn the current index into an explicit "ID" column.
# NOTE: not idempotent — re-running this cell on the already-reset frame adds a second "ID" column.
train = train.reset_index().rename(columns={"index": "ID"})

#### Prepare features: ordinal-encode the categorical columns

In [None]:
# Fit one OrdinalEncoder per categorical column, overwrite the column in `train` with its
# codes, and persist the fitted encoder with dill.
# NOTE: this cell mutates `train` in place; re-running it would fit (and save) encoders on
# the already-encoded numbers, so restart from the load cells before running it again.
encoders_dir = "../../objects/encoders"
os.makedirs(encoders_dir, exist_ok=True)  # open(..., "wb") does not create missing directories
for feat in feats_cat:
    # Categories unseen during fit are encoded as NaN at transform time.
    le = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    train[feat] = le.fit_transform(train[feat].values.reshape(-1, 1))
    with open(f"{encoders_dir}/{feat}_enc.dill", "wb") as stream:
        dill.dump(le, stream)

### Baseline solution

In [None]:
# Hand-set LightGBM parameters for a baseline 3-class model evaluated with multi-class log-loss.
# NOTE(review): in the cells visible here `model_params` is rebound in the "Save model"
# section before base_cv is called with it, so this dict does not appear to be used — confirm.
model_params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_leaves": 32,
    "max_depth": 5,
    "learning_rate": 0.05,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "subsample_freq": 1,
    "min_child_samples": 300,
    "n_jobs": 10,
    "n_estimators": 1000,  # upper bound only; base_cv stops early on the validation fold
    "num_class": 3,
}

In [None]:
# Folder that holds one subfolder per saved model (base_cv joins it with the model name).
model_folder = "../../models"

In [None]:
# Subfolder name for the baseline model's fold files.
model_name = "baseline"

In [None]:
def base_cv(
    train,
    target,
    train_features,
    cat_features=None,
    random_state=42,
    n_folds=5,
    model_folder="models",
    model_name="model",
    model_params=None,
    verbose=True
):
    """Train one LightGBM model per stratified fold and score it on the held-out part.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str
        Name of the label column; also used to stratify the folds.
    train_features : list of str
        Column names fed to the model.
    cat_features : list of str, optional
        Categorical column names. Their positions inside ``train_features`` are passed to
        LightGBM as ``categorical_column``. ``None`` leaves that parameter untouched.
    random_state : int
        Seed for the shuffled ``StratifiedKFold`` split.
    n_folds : int
        Number of folds.
    model_folder, model_name : str
        Each fold's booster is saved to ``<model_folder>/<model_name>/fold_<i>.model``
        (the directory is created if missing). Pass ``model_name=None`` to skip saving.
    model_params : dict, optional
        Keyword arguments for ``lgb.LGBMModel``; ``None`` means LightGBM defaults.
        The dict is copied and never modified.
    verbose : bool
        Print fold index, best iteration and fold score.

    Returns
    -------
    (preds, scores) : tuple of lists
        One entry per fold: the ``model.predict`` output for the held-out rows, and the
        value of the notebook-level ``metric`` function on the arg-max labels.
    """
    # Copy so the caller's dict is not mutated (the original wrote "categorical_column" into it).
    params = dict(model_params) if model_params is not None else {}
    if cat_features is not None:
        params["categorical_column"] = [
            idx for idx, name in enumerate(train_features) if name in cat_features
        ]

    save_dir = None
    if model_name is not None:
        save_dir = os.path.join(model_folder, model_name)
        os.makedirs(save_dir, exist_ok=True)  # saving fails if the folder does not exist

    skf = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    X, y = train[train_features], train[target]
    preds = []
    scores = []
    for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        model = lgb.LGBMModel(**params)
        # Early-stopping patience scales inversely with the learning rate (5 / lr rounds).
        e_stop = round(5 / model.get_params()['learning_rate'])
        # NOTE: the held-out fold drives early stopping and is also the fold that is scored,
        # so the reported scores are somewhat optimistic.
        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of the
        # LightGBM 3.x scikit-learn API — confirm against the installed version.
        model.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            early_stopping_rounds=e_stop,
            eval_metric=params.get("metric"),
            verbose=False
        )
        if save_dir is not None:
            model.booster_.save_model(os.path.join(save_dir, f"fold_{i}.model"))
        fold_preds = model.predict(X_test)
        fold_labels = np.argmax(fold_preds, axis=1)
        fold_score = metric(y_test, fold_labels)
        preds.append(fold_preds)
        scores.append(fold_score)
        if verbose:
            print(
                i,
                "it:", model.best_iteration_,
                "score:", fold_score
            )
    return preds, scores

In [None]:
class CVModel():
    """Ensemble that averages the predictions of the LightGBM boosters stored in one folder.

    Every method imports what it needs locally, so none of them relies on names that
    happen to be defined at notebook level.
    """

    def __init__(self, models_folder, num_class=3):
        """Load every ``*.model`` file found directly inside ``models_folder``.

        Parameters
        ----------
        models_folder : str
            Directory containing the saved LightGBM model files.
        num_class : int
            Number of columns of the prediction matrix built in ``predict``.
        """
        import os
        import lightgbm

        # Match on the suffix (not a substring) and sort so the load order does not
        # depend on the directory listing order.
        model_files = sorted(f for f in os.listdir(models_folder) if f.endswith('.model'))

        self.models = [
            lightgbm.Booster(
                model_file=os.path.join(models_folder, model_file),
                params={'n_jobs': 1},
            )
            for model_file in model_files
        ]
        self.num_class = num_class

    def predict(self, Y):
        """Return the mean of all boosters' predictions as an ``(n_rows, num_class)`` array.

        ``Y`` may be a pandas DataFrame (columns are selected and ordered by each booster's
        ``feature_name()``) or any array-like the boosters accept directly.

        Raises
        ------
        ValueError
            If no models are loaded (averaging would otherwise divide by zero).
        """
        import numpy as np
        import pandas as pd

        if not self.models:
            raise ValueError("CVModel has no models loaded; cannot predict")

        prediction = np.zeros((Y.shape[0], self.num_class))
        is_frame = isinstance(Y, pd.DataFrame)
        for model in self.models:
            features = Y[model.feature_name()] if is_frame else Y
            prediction += model.predict(features)

        return prediction / len(self.models)

    def predict_labels(self, Y):
        """Return the arg-max class index of the averaged predictions for every row."""
        import numpy as np

        predictions = self.predict(Y)
        labels = np.argmax(predictions, axis=1)

        return labels

In [None]:
def objective(trial):
    """Optuna objective: cross-validate one sampled LightGBM configuration.

    Samples regularisation, tree-size and sub-sampling parameters from ``trial``, runs
    ``base_cv`` on the notebook-level ``train`` / ``target`` / ``train_features`` /
    ``feats_cat`` without saving models, and returns ``mean(scores) - std(scores)`` so
    that configurations with unstable fold scores are penalised.
    """
    model_params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "seed": 42,
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
        # the sampled ranges and scales are unchanged.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        "num_class": 3
    }
    preds, scores = base_cv(
        train,
        target,
        train_features,
        cat_features=feats_cat,
        model_name=None,  # do not write fold models during the search
        model_params=model_params,
        verbose=False
    )
    return np.mean(scores) - np.std(scores)

#### Hyperparameter search with Optuna

In [None]:
# Raise Optuna's log level to WARNING so per-trial INFO messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Seed the sampler so the sequence of suggested parameters is the same on every
# Restart & Run All (an unseeded sampler gives a different search each run).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [147]:
# Best objective value found by the study, i.e. mean minus std of the fold scores (see objective()).
study.best_value

0.7829261550269685

### Save model

In [146]:
# LightGBM parameters used for the saved model.
# NOTE(review): the values look like rounded results of the Optuna search above — confirm
# they match study.best_params.
# No n_estimators / learning_rate are given, so LGBMModel's defaults apply inside base_cv.
model_params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",                
        "seed": 42,
        'lambda_l1': 0,
        'lambda_l2': 0.002,
        'num_leaves': 154,
        'feature_fraction': 0.66,
        'bagging_fraction': 0.95,
        # NOTE(review): per the LightGBM parameter docs a bagging_freq of 0 disables bagging,
        # in which case bagging_fraction above has no effect — confirm this is intended.
        'bagging_freq': 0,
        'min_child_samples': 11,
        "num_class": 3,
    }

In [None]:
# Root folder for saved models (rebinds the earlier `model_folder`, now with a trailing slash).
model_folder = "../../models/"

In [None]:
# Subfolder (and dill file stem) under which the final model is stored.
model_name = "lightgbm_classifier"

In [148]:
# Run 5-fold CV with the final parameters; each fold's booster is written to
# <model_folder>/<model_name>/fold_<i>.model and the per-fold scores are printed.
preds, scores = base_cv(
    train,
    target,
    train_features,
    cat_features=feats_cat,
    model_folder=model_folder,
    model_name=model_name,
    model_params=model_params,
    n_folds=5
)



0 it: 53 score: 0.878370155485197




1 it: 64 score: 0.832846924720776




2 it: 36 score: 0.78743789290907




3 it: 54 score: 0.8733058249128277




4 it: 35 score: 0.7515519612516507


In [149]:
# Mean macro-F1 over the folds.
np.mean(scores)

0.8247025518559041

In [150]:
# Fold-to-fold standard deviation of the macro-F1 scores.
np.std(scores)

0.04908866336068601

In [151]:
# Load the fold boosters saved by base_cv into an averaging ensemble.
# NOTE: the variable name is misspelled ("ligthgbm"); it is kept because later cells reference it.
ligthgbm_classifier = CVModel(os.path.join(model_folder, model_name))

In [153]:
# Serialize the ensemble object with dill into the same folder as the fold model files.
with open(os.path.join(model_folder, model_name, f"{model_name}.dill"), "wb") as stream:
    dill.dump(ligthgbm_classifier, stream)

In [None]:
# Inspect the boosters held by the ensemble built above. The original referenced
# `baseline_model`, which is never defined in this notebook and raised NameError.
ligthgbm_classifier.models