In [12]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
import dill

In [16]:
# Load the preprocessed training split.
train = pd.read_csv(filepath_or_buffer='../../data/train_prep.csv')

In [17]:
# Load the preprocessed test split.
test = pd.read_csv(filepath_or_buffer='../../data/test_prep.csv')

In [18]:
# Show the column names available in the training frame.
list(train.columns)

['content',
 'service',
 'priority',
 'status',
 'group',
 'dt_deadline',
 'dt_query',
 'type_query',
 'type_final',
 'solution',
 'type_reclassification',
 'dt_recovery',
 'dt_close',
 'criticality',
 'impact',
 'system',
 'place']

In [19]:
# Categorical input columns: each is ordinal-encoded below and declared to
# LightGBM as a categorical feature.
feats_cat = [
    'service', 'priority', 'status',
    'group', 'type_query', 'criticality',
    'impact', 'system', 'place',
]

In [20]:
# Columns fed to the model: currently only the categorical ones.
# NOTE: this binds the same list object as `feats_cat` (no copy is made), so
# mutating one of the two names also changes the other.
train_features = feats_cat

In [21]:
# Dump the feature lists to configs/features.json (no numeric features yet).
features = {
    "all_feat": train_features,
    "feats_numeric": [],
    "feats_cat": feats_cat
}
with open("../../configs/features.json", "w") as stream:
    json.dump(features, stream, indent=4)

In [22]:
# Name of the label column the model is trained to predict.
target = "type_reclassification"

In [23]:
def metric(*args):
    """Macro-averaged F1; positional arguments are forwarded to `f1_score`."""
    return f1_score(*args, average="macro")

In [24]:
# Expose the row index as an explicit "ID" column. The guard keeps the cell
# idempotent: re-running it would otherwise add a second "ID" column.
if "ID" not in train.columns:
    train = train.reset_index().rename(columns={"index": "ID"})

#### prepare features

In [31]:
# Fit one OrdinalEncoder per categorical column, replace the column with its
# codes and persist the fitted encoder with dill.
# NOTE: the raw values in `train` are overwritten; re-running this cell without
# reloading `train` would refit (and re-save) the encoders on already-encoded
# values.
encoders_dir = "../../objects/encoders"
os.makedirs(encoders_dir, exist_ok=True)  # open() below fails if the folder is missing
for feat in feats_cat:
    # Categories not seen during fit are encoded as NaN at transform time.
    le = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    # fit_transform returns an (n_samples, 1) array; flatten it before assigning the column.
    train[feat] = le.fit_transform(train[feat].values.reshape(-1, 1)).ravel()
    with open(f"{encoders_dir}/{feat}_enc.dill", "wb") as stream:
        dill.dump(le, stream)

### baseline solution

In [32]:
# LightGBM settings for the 3-class baseline model.
model_params = dict(
    boosting_type="gbdt",
    objective="multiclass",
    metric="multi_logloss",
    num_leaves=32,
    max_depth=5,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    min_child_samples=300,
    n_jobs=10,
    n_estimators=1000,
    num_class=3,
)

In [35]:
# Root folder under which the trained model files are written.
model_folder = "../../models"

In [36]:
# Sub-folder of `model_folder` that holds this experiment's model files.
model_name = "baseline"

In [41]:
def base_cv(
    train,
    target,
    train_features,
    cat_features=None,
    random_state=42,
    n_folds=5,
    model_folder="models",
    model_name="model",
    model_params=None
):
    """Cross-validate a LightGBM model with shuffled stratified K-fold splits.

    For every fold a fresh ``lgb.LGBMModel`` is fitted on the training part,
    early-stopped on the held-out part, saved to
    ``<model_folder>/<model_name>/fold_<i>.model`` and scored with the
    notebook-level ``metric`` callable on the arg-max of its predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    target : str
        Name of the label column; also used to stratify the folds.
    train_features : list of str
        Columns fed to the model, in this order.
    cat_features : list of str, optional
        Members of ``train_features`` to declare as categorical.
        ``None`` means no categorical features.
    random_state : int
        Seed of the ``StratifiedKFold`` shuffling.
    n_folds : int
        Number of folds.
    model_folder, model_name : str
        Fold models are saved under ``<model_folder>/<model_name>``; the
        folder is created when missing.
    model_params : dict, optional
        Keyword arguments for ``lgb.LGBMModel``. The dict is copied, so the
        caller's object is left untouched. Because predictions are arg-maxed
        over axis 1, a multiclass objective is expected.

    Returns
    -------
    preds : list
        ``model.predict`` output for the held-out rows of each fold, in fold
        order.
    scores : list
        ``metric`` value of each fold, in fold order.
    """
    cat_features = [] if cat_features is None else cat_features
    # Copy so that adding the categorical indices does not leak into the caller's dict.
    params = dict(model_params) if model_params is not None else {}
    cat_feats_ind = [idx for idx, name in enumerate(train_features) if name in cat_features]
    if cat_feats_ind:
        params["categorical_column"] = cat_feats_ind

    # save_model() does not create missing parent folders.
    fold_dir = os.path.join(model_folder, model_name)
    os.makedirs(fold_dir, exist_ok=True)

    # Loop-invariant selections, hoisted out of the fold loop.
    X, y = train[train_features], train[target]
    skf = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    preds = []
    scores = []
    for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        model = lgb.LGBMModel(**params)
        # Early-stopping patience scales inversely with the learning rate
        # (100 rounds at a learning rate of 0.05).
        e_stop = round(5 / model.get_params()['learning_rate'])
        # NOTE: `early_stopping_rounds` and `verbose` are fit() keyword arguments
        # of LightGBM < 4.0; newer releases expect callbacks instead.
        model.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            early_stopping_rounds=e_stop,
            # Same value the model was configured with; None when no metric was given.
            eval_metric=params.get("metric"),
            verbose=False
        )
        model.booster_.save_model(os.path.join(fold_dir, f"fold_{i}.model"))
        fold_preds = model.predict(X_test)
        fold_labels = np.argmax(fold_preds, axis=1)
        fold_score = metric(y_test, fold_labels)
        preds.append(fold_preds)
        scores.append(fold_score)
        print(
            i,
            "it:", model.best_iteration_,
            "score:", fold_score
        )
    return preds, scores

In [56]:
class CVModel():
    """Ensemble that averages the predictions of the LightGBM boosters saved in one folder.

    Every module a method needs is imported inside that method, so no method
    depends on names that happen to exist in the defining notebook's globals.
    """

    def __init__(self, models_folder, num_class=3):
        """Load every ``*.model`` file found in ``models_folder``.

        Parameters
        ----------
        models_folder : str
            Folder containing the saved booster files.
        num_class : int
            Number of columns of the averaged prediction matrix.
        """
        import os
        import lightgbm
        # Match on the extension only and sort, so the set and order of loaded
        # models do not depend on stray files or on the listing order.
        model_files = sorted(f for f in os.listdir(models_folder) if f.endswith('.model'))

        self.models = [
            lightgbm.Booster(model_file=os.path.join(models_folder, model_file), params={'n_jobs': 1})
            for model_file in model_files
        ]
        self.num_class = num_class

    def predict(self, Y):
        """Return the mean of all boosters' ``predict`` outputs for ``Y``.

        Parameters
        ----------
        Y : pandas.DataFrame or array-like with a ``shape``
            Input rows. A DataFrame is re-indexed to each booster's own
            feature names (and order); anything else is passed through as is.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(Y.shape[0], num_class)``.

        Raises
        ------
        ValueError
            If no booster is loaded (the average would otherwise be NaN).
        """
        import numpy as np
        import pandas as pd
        if not self.models:
            raise ValueError("CVModel has no models to average predictions over")
        prediction = np.zeros((Y.shape[0], self.num_class))
        is_frame = isinstance(Y, pd.DataFrame)
        for model in self.models:
            features = Y[model.feature_name()] if is_frame else Y
            prediction += model.predict(features)

        return prediction / len(self.models)

    def predict_labels(self, Y):
        """Return, for each row of ``Y``, the index of the largest averaged prediction."""
        import numpy as np
        predictions = self.predict(Y)
        labels = np.argmax(predictions, axis=1)

        return labels

In [43]:
# Run the 5-fold baseline CV. Fold models are written under
# <model_folder>/<model_name>; passing the `model_name` variable (instead of
# repeating the literal) keeps this in sync with the folder CVModel loads from.
preds, scores = base_cv(
    train,
    target,
    train_features,
    cat_features=feats_cat,
    model_folder=model_folder,
    model_name=model_name,
    model_params=model_params
)



0 it: 182 score: 0.8523727707045516




1 it: 168 score: 0.815735984962676




2 it: 91 score: 0.7887840188254555




3 it: 171 score: 0.8623136861416597




4 it: 140 score: 0.7405696336881032


In [44]:
# Mean macro-F1 over the CV folds.
np.asarray(scores).mean()

0.8119552188644892

In [57]:
# Build the ensemble from the fold models saved under <model_folder>/<model_name>.
baseline_model = CVModel(models_folder=os.path.join(model_folder, model_name))

In [58]:
# Display the folder the fold models are loaded from.
os.path.join(model_folder, model_name)

'../../models/baseline'

In [59]:
# Inspect the loaded boosters (one per `.model` file found in the folder).
baseline_model.models

[<lightgbm.basic.Booster at 0x7fff9d1cae80>,
 <lightgbm.basic.Booster at 0x7fff9d186c40>,
 <lightgbm.basic.Booster at 0x7fff9d733cd0>,
 <lightgbm.basic.Booster at 0x7fff982ca7c0>,
 <lightgbm.basic.Booster at 0x7fff9d1860d0>]

In [62]:
# Serialize the whole ensemble with dill, next to the fold models it was built
# from. The path is derived from `model_folder` / `model_name` so it cannot
# drift from the folder CVModel was loaded from.
baseline_dump_path = os.path.join(model_folder, model_name, "baseline_model.dill")
with open(baseline_dump_path, "wb") as stream:
    dill.dump(baseline_model, stream)