# Base stuff

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, hamming_loss, f1_score, ConfusionMatrixDisplay

import time
from tqdm.notebook import tqdm

# RecImpute Stuff

In [None]:
# Imports
%cd ~/recimpute
%load_ext autoreload
%autoreload 2

In [None]:
from Clustering.AbstractClustering import AbstractClustering
from Clustering.ShapeBasedClustering import ShapeBasedClustering
from Datasets.Dataset import Dataset
from Datasets.TrainingSet import TrainingSet
from Labeling.ImputationTechniques.ImputeBenchLabeler import ImputeBenchLabeler
from Labeling import AbstractLabeler
from FeaturesExtraction.TSFreshFeaturesExtractor import TSFreshFeaturesExtractor
from FeaturesExtraction.KiviatFeaturesExtractor import KiviatFeaturesExtractor
from FeaturesExtraction.KatsFeaturesExtractor import KatsFeaturesExtractor
from FeaturesExtraction.Catch22FeaturesExtractor import Catch22FeaturesExtractor
from FeaturesExtraction.TopologicalFeaturesExtractor import TopologicalFeaturesExtractor
from Utils.Utils import Utils

In [None]:
# Init obj
# Build the collaborators needed by TrainingSet: a clusterer, the datasets
# (instantiated with that clusterer), a labeler with its default properties,
# and one instance of each selected feature extractor.
CLUSTERER = ShapeBasedClustering()
DATASETS = Dataset.instantiate_from_dir(CLUSTERER)
LABELER = ImputeBenchLabeler.get_instance()
LABELER_PROPERTIES = LABELER.get_default_properties()
# Only the extractors listed here are used; the other imported extractors
# (TSFresh, Kiviat, Kats) are not part of this run.
FEATURES_EXTRACTORS_MAP = {'Topological': TopologicalFeaturesExtractor, 'Catch22': Catch22FeaturesExtractor}

FEATURES_EXTRACTORS = [fe.get_instance() for fe in FEATURES_EXTRACTORS_MAP.values()]
# Empty, so no extra keyword arguments are forwarded to TrainingSet below.
TRUE_LABELER_INFO = {}

# NOTE(review): force_generation=False presumably reuses a previously
# generated training set instead of rebuilding it -- confirm in TrainingSet.
training_set = TrainingSet(
    DATASETS, 
    CLUSTERER, 
    FEATURES_EXTRACTORS, 
    LABELER, LABELER_PROPERTIES,
    **TRUE_LABELER_INFO,
    force_generation=False,
)

In [None]:
# Load train and test data
# `_load` is an underscore-prefixed (non-public by convention) method of
# TrainingSet; each call returns a pair. The first element is used below as a
# DataFrame holding feature columns plus 'Data Set Name', 'Cluster ID' and
# 'Label'. The second element of the 'train' call is kept as `labels_set`
# (used later as a list of label names); for 'test' it is discarded.
all_train_info, labels_set = training_set._load(data_to_load='train')
all_test_info, _ = training_set._load(data_to_load='test')

In [None]:
# Strip the non-feature columns from the training frame (the test frame is
# handled in the next cell).
#   train_features_df -> features only (identifiers and 'Label' removed)
#   train_set_df      -> features + 'Label' (identifiers removed)
# The column count of train_features_df should therefore be the loaded
# frame's column count minus 3.
train_features_df = all_train_info.loc[
    :,
    ~all_train_info.columns.isin(['Data Set Name', 'Cluster ID', 'Label']),
]
train_set_df = all_train_info.loc[
    :,
    ~all_train_info.columns.isin(['Data Set Name', 'Cluster ID']),
]

In [None]:
# Strip the non-feature columns from the test frame.
#   test_features_df -> features only (identifiers and 'Label' removed)
#   test_set_df      -> features + 'Label' (identifiers removed)
# The boolean mask is built from the test frame's own columns: a mask derived
# from the training frame is only correct when both frames happen to share
# the exact same column order and width.
test_features_df = all_test_info.iloc[:, ~all_test_info.columns.isin(['Data Set Name', 'Cluster ID', 'Label'])]
test_set_df = all_test_info.iloc[:, ~all_test_info.columns.isin(['Data Set Name', 'Cluster ID'])]

# Construct train/test

In [None]:
# 'trmf' is missing from the label list returned by the loader, so add it.
# The membership check keeps this cell idempotent: re-running it must not
# append a duplicate, which would inflate len(labels_set) (used as num_class)
# and shift the index assigned to 'trmf'.
if 'trmf' not in labels_set:
    labels_set.append('trmf')

# Some classifiers can't handle strings as class names, so keep a two-way
# mapping between label names and their integer position in labels_set.
class_to_index = {label: idx for idx, label in enumerate(labels_set)}
index_to_class = dict(enumerate(labels_set))
def list_to_index(y_something):
    """Translate an iterable of label names into a NumPy array of indices.

    Each label is looked up in the module-level ``class_to_index`` mapping;
    a label that is not a key of that mapping raises ``KeyError``.
    """
    lookup = class_to_index.__getitem__
    return np.array(list(map(lookup, y_something)))

def list_to_class(y_something):
    """Translate an iterable of integer indices into a NumPy array of labels.

    Each index is used to subscript the module-level ``labels_set`` sequence,
    i.e. this is the inverse of ``list_to_index``.
    """
    names = []
    for position in y_something:
        names.append(labels_set[position])
    return np.array(names)

In [None]:
# Feature matrices: every column of the train/test frames except 'Label'.
x_train = train_set_df.loc[:, train_set_df.columns != 'Label'].to_numpy()
x_test = test_set_df.loc[:, test_set_df.columns != 'Label'].to_numpy()

# Targets come straight from the loaded frames, so they keep whatever label
# values those frames hold.
y_train = all_train_info['Label'].to_numpy()
y_test = all_test_info['Label'].to_numpy()

# Model selection

In [None]:
import ray
from ray import tune
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.tune.schedulers import ASHAScheduler
from ray.tune.tuner import Tuner, TuneConfig

import xgboost as xgb

In [None]:
# Extra pre-process
# Replace each value of the 'Label' column of train_set_df with its integer
# index from class_to_index (this frame is what is handed to Ray below).
# Values that are not keys of class_to_index become NaN.
# NOTE(review): this makes the cell non-idempotent -- running it a second time
# would look up the already-converted integers in class_to_index; presumably
# the keys are strings, so the column would turn into NaN. Confirm / re-run
# the cell that builds train_set_df first.
train_set_df.loc[:, 'Label'] = train_set_df.loc[:, 'Label'].map(class_to_index)
# Bare expression: displays the converted column as the cell output.
train_set_df['Label']

In [None]:
# Fixed XGBoost settings shared by the base trainer and by every tuning trial.
base_xgb_params = {
    "objective": "multi:softmax",
    "num_class": len(labels_set),
    "eval_metric": ["mlogloss"],
}

# Configure base class
# NOTE(review): only a "train" dataset is registered, so the metric optimised
# below ("train-mlogloss") is the training loss; consider adding a validation
# split so the search does not favour over-fitted configurations.
trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(
        num_workers=2,
        use_gpu=False,
    ),
    label_column='Label',
    num_boost_round=20,
    params=dict(base_xgb_params),
    datasets={"train": ray.data.from_pandas(train_set_df)},
)

# Configure Tune
# Search space: tree depth and learning rate are sampled, the rest is fixed.
param_space = {
    "params": {
        **base_xgb_params,
        "max_depth": tune.randint(4, 9),
        "learning_rate": tune.uniform(0.1, 0.9),
    }
}
metric = "train-mlogloss"
# Used as the number of sampled trials (original note: "leave 20%").
server_cpu = 9

# Early-stopping scheduler for the search.
# NOTE(review): max_t=100 exceeds num_boost_round=20 above, so presumably
# only the first rung (grace_period=10) can ever trigger -- confirm.
asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

# The scheduler is passed to TuneConfig explicitly; a scheduler that is only
# constructed has no effect on the search. metric/mode are set once here and
# intentionally not repeated on the scheduler.
tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(
        num_samples=server_cpu,
        metric=metric,
        mode="min",
        scheduler=asha_scheduler,
    ),
)

In [None]:
# Run the hyper-parameter search and keep the best trial. get_best_result()
# is called without arguments, so it relies on the metric/mode configured on
# the tuner's TuneConfig.
result_grid = tuner.fit()
best_result = result_grid.get_best_result()

In [None]:
# result
# Print the best trial, then display its configuration as the cell output
# (the tuned values are read from best_result.config['params'] further down).
print("Best result:", best_result)
best_result.config

In [None]:
# Final model: retrain with plain xgboost using the tuned hyper-parameters.
# A new dict is built instead of aliasing param_space['params'], so the
# search space (the tune.randint / tune.uniform samplers) is not overwritten
# in place and the tuning cells stay re-runnable.
tuned_params = best_result.config['params']
xgb_params = {
    **param_space['params'],
    'max_depth': tuned_params['max_depth'],
    'learning_rate': tuned_params['learning_rate'],
}

# Training labels are converted to integer class indices for xgboost.
dtrain = xgb.DMatrix(x_train, label=list_to_index(y_train))
# NOTE(review): `missing=np.inf` is set for the test matrix only, while the
# training matrix uses the default missing marker -- confirm this asymmetry
# is intended.
dtest = xgb.DMatrix(x_test, missing=np.inf)
# 20 boosting rounds, the same number the tuning trainer was configured with.
bst = xgb.train(xgb_params, dtrain, 20)
print('done')

In [None]:
# Predict on the test matrix, cast the predictions to int32 so they can be
# used as indices, and map them back to label names with list_to_class.
y_pred = list_to_class(bst.predict(dtest).astype('int32'))
# Bare expression: shows both shapes side by side as a quick sanity check
# that there is one prediction per test label.
[y_test.shape, y_pred.shape]


# Evaluation

In [None]:
def eval_data(y_test_some, y_pred_some):
    """Score predictions against the ground truth.

    Returns a list ``[accuracy, precision, recall, f1]``. Precision, recall
    and F1 use weighted averaging, and undefined cases are reported as 0
    (``zero_division=0``).
    """
    weighted = {'average': 'weighted', 'zero_division': 0}
    return [
        accuracy_score(y_test_some, y_pred_some),
        precision_score(y_test_some, y_pred_some, **weighted),
        recall_score(y_test_some, y_pred_some, **weighted),
        f1_score(y_test_some, y_pred_some, **weighted),
    ]

In [None]:
# Per category
# Every category known to the dataset configuration is scored; only the ones
# listed in `categories` are printed.
all_categories = set(Dataset.CONF['CATEGORIES'].values())
categories = ['Climate', 'Spectrogram Data', 'Power consumption', 'Water discharge', 'Motion', 'Medical']

# Locate the dataset-name column by label instead of by positional arithmetic
# on the feature count, which silently breaks if the column layout changes.
# `index_dataset` is kept for any later cell that still reads it.
index_dataset = all_test_info.columns.get_loc('Data Set Name')

# Resolve the category of every test row once, instead of re-scanning the
# frame row by row for each category. A dataset name that is missing from
# the configuration still raises KeyError. dtype=object keeps the comparison
# below element-wise even when there are no rows.
test_row_categories = np.array(
    [Dataset.CONF['CATEGORIES'][ds_name] for ds_name in all_test_info['Data Set Name']],
    dtype=object,
)
y_test_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
all_scores = []

for category in all_categories:
    in_category = test_row_categories == category
    # Categories without any test row are skipped: there is nothing to score
    # and the metrics are undefined on empty inputs.
    if not in_category.any():
        continue
    cat_scores = eval_data(
        y_test_arr[in_category].tolist(),
        y_pred_arr[in_category].tolist(),
    )
    if category in categories:
        print((category, cat_scores))
    all_scores.append(cat_scores)

In [None]:
# Global
# Column-wise mean of the per-category score lists, i.e. the unweighted
# average over categories of [accuracy, precision, recall, f1]. Each category
# counts equally regardless of how many test rows it contains.
print(np.array(all_scores).mean(0))