# Base imports

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, hamming_loss, f1_score, ConfusionMatrixDisplay

import time
from tqdm.notebook import tqdm

# RecImpute setup and data loading

In [None]:
# Notebook setup (IPython magics, not plain Python): switch the working
# directory to ~/recimpute and enable the autoreload extension in mode 2.
# NOTE(review): presumably the cd is what makes the project imports in the
# next cell resolvable — confirm.
%cd ~/recimpute
%load_ext autoreload
%autoreload 2

In [None]:
from Clustering.AbstractClustering import AbstractClustering
from Clustering.ShapeBasedClustering import ShapeBasedClustering
from Datasets.Dataset import Dataset
from Datasets.TrainingSet import TrainingSet
from Labeling.ImputationTechniques.ImputeBenchLabeler import ImputeBenchLabeler
from Labeling import AbstractLabeler
from FeaturesExtraction.TSFreshFeaturesExtractor import TSFreshFeaturesExtractor
from FeaturesExtraction.KiviatFeaturesExtractor import KiviatFeaturesExtractor
from FeaturesExtraction.KatsFeaturesExtractor import KatsFeaturesExtractor
from FeaturesExtraction.Catch22FeaturesExtractor import Catch22FeaturesExtractor
from FeaturesExtraction.TopologicalFeaturesExtractor import TopologicalFeaturesExtractor
from Utils.Utils import Utils

In [None]:
# Init obj
# Build the shared pipeline objects (clusterer, datasets, labeler, feature
# extractors) and the TrainingSet that ties them together. Order matters:
# DATASETS needs CLUSTERER, and LABELER_PROPERTIES needs LABELER.
CLUSTERER = ShapeBasedClustering()
DATASETS = Dataset.instantiate_from_dir(CLUSTERER)
LABELER = ImputeBenchLabeler.get_instance()
LABELER_PROPERTIES = LABELER.get_default_properties()
# Feature extractor classes used in this run, keyed by a display name.
FEATURES_EXTRACTORS_MAP = {'Topological': TopologicalFeaturesExtractor, 'Catch22': Catch22FeaturesExtractor}
# Estimator names later passed as `estimator_list` to the AutoML fit call.
list_complete = ['lgbm', 'rf', 'catboost', 'xgb_limitdepth', 'lrl1', 'lrl2']
FEATURES_EXTRACTORS = [fe.get_instance() for fe in FEATURES_EXTRACTORS_MAP.values()]
# Empty dict: no extra keyword arguments are forwarded to TrainingSet below.
TRUE_LABELER_INFO = {}

training_set = TrainingSet(
    DATASETS, 
    CLUSTERER, 
    FEATURES_EXTRACTORS, 
    LABELER, LABELER_PROPERTIES,
    **TRUE_LABELER_INFO,
    force_generation=False,  # NOTE(review): presumably reuses previously generated data — confirm
)

In [None]:
# Load train and test data
# `_load` returns a pair: the second element is kept as `labels_set` for the
# train split and discarded for the test split.
# NOTE(review): `_load` is underscore-prefixed (non-public by convention) —
# confirm it is a supported entry point.
all_train_info, labels_set = training_set._load(data_to_load='train')
all_test_info, _ = training_set._load(data_to_load='test')

In [None]:
# Keep only the feature columns of the train set by masking out the three
# bookkeeping columns ('Data Set Name', 'Cluster ID', 'Label').
# Sanity check: the resulting column count should be that of all_train_info
# minus 3.
_train_meta_mask = all_train_info.columns.isin(['Data Set Name', 'Cluster ID', 'Label'])
train_features_df = all_train_info.loc[:, ~_train_meta_mask]

In [None]:
# Same isolation for the test set: drop the three bookkeeping columns
# ('Data Set Name', 'Cluster ID', 'Label') and keep only the features.
_test_meta_mask = all_test_info.columns.isin(['Data Set Name', 'Cluster ID', 'Label'])
test_features_df = all_test_info.loc[:, ~_test_meta_mask]

# Construct train/test

In [None]:
# 'trmf' is missing from the loaded label list, so add it. The membership
# guard keeps this cell idempotent: re-running it no longer appends a duplicate
# entry, which would otherwise shift the index assigned to 'trmf' between runs.
if 'trmf' not in labels_set:
    labels_set.append('trmf')

# Some classifiers can't handle strings as class names, so every label is
# mapped to its position in labels_set.
class_to_index = {label: idx for idx, label in enumerate(labels_set)}
def list_to_index(y_something):
    """Convert an iterable of label names into a NumPy array of class indices.

    Each label is looked up in the module-level ``class_to_index`` mapping;
    a label that is not a key of that mapping raises ``KeyError``.
    """
    indices = [class_to_index[name] for name in y_something]
    return np.array(indices)

def list_to_class(y_something):
    """Convert an iterable of class indices back into a NumPy array of labels.

    Each index is resolved through the module-level ``labels_set`` list, i.e.
    this is the inverse of ``list_to_index``.
    """
    names = [labels_set[idx] for idx in y_something]
    return np.array(names)

In [None]:
# Feature matrices as plain NumPy arrays.
x_train, x_test = train_features_df.to_numpy(), test_features_df.to_numpy()
# [!] Only y_train is converted to class indices; y_test keeps the original
# label names because predictions are mapped back to names before scoring.
y_train, y_test = list_to_index(all_train_info['Label'].to_numpy()), all_test_info['Label'].to_numpy()
# Replace NaN/+inf/-inf in the test features with 0.0 in one vectorized pass
# instead of a nested per-element Python loop. The explicit float cast keeps
# np.isfinite usable on object-typed input, and the 2-D shape is preserved even
# when the test set has no rows.
# NOTE(review): x_train is not sanitized the same way — confirm the training
# features cannot contain non-finite values.
x_test = np.asarray(x_test, dtype=float)
x_test = np.where(np.isfinite(x_test), x_test, 0.0)

# Model selection

In [None]:
# import
from flaml import AutoML

In [None]:
# Let FLAML search the estimators named in `list_complete` for the best
# classifier on the training data, within the given time budget.
automl = AutoML()
sufficient_time = 6025 #recommended budget
automl.fit(
    x_train,
    y_train,
    task="classification",
    estimator_list=list_complete,
    time_budget=sufficient_time,
)

In [None]:
# Take the model selected by the AutoML search above; the bare expression on
# the last line displays it in the notebook output.
classifier = automl.model
classifier

In [None]:
# Fit the selected classifier on the training data; the displayed estimator
# should match the one shown in the previous cell.
# NOTE(review): FLAML may already have retrained this model inside
# automl.fit — confirm this extra fit is needed.
classifier.fit(x_train, y_train)

In [None]:
# Predict class indices for the test set and translate them back to labels.
_raw_pred = classifier.predict(x_test).astype('int32')
# Entries that are not scalars (presumably estimators returning an (n, 1)
# column rather than a flat vector) are unwrapped to a plain Python int first.
_flat_pred = [entry if np.isscalar(entry) else entry.item() for entry in _raw_pred]
y_pred = list_to_class(_flat_pred)
# Displayed as a sanity check: both shapes should be identical.
[y_test.shape, y_pred.shape]

# Evaluation

In [None]:
def eval_data(y_test_some, y_pred_some):
    """Score predictions against the ground truth.

    Returns a list ``[accuracy, precision, recall, f1]``. Precision, recall
    and F1 use ``average='weighted'`` with ``zero_division=0``.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}
    return [
        accuracy_score(y_test_some, y_pred_some),
        precision_score(y_test_some, y_pred_some, **weighted),
        recall_score(y_test_some, y_pred_some, **weighted),
        f1_score(y_test_some, y_pred_some, **weighted),
    ]

In [None]:
# Per category
# Score the predictions separately for every dataset category, printing only
# the categories listed in `categories` but collecting all of them.
all_categories = set(Dataset.CONF['CATEGORIES'].values())
categories = ['Climate', 'Spectrogram Data', 'Power consumption', 'Discharge', 'Motion', 'Medical']
# Locate the dataset-name column by its name rather than by assuming it sits
# two positions after the last feature column.
index_dataset = all_test_info.columns.get_loc('Data Set Name')
all_scores = []

# Resolve the category of every test row once, instead of one scalar .iloc
# lookup per row for each category. dtype=object keeps `==` element-wise.
row_categories = np.array(
    [Dataset.CONF['CATEGORIES'][ds_name] for ds_name in all_test_info['Data Set Name']],
    dtype=object,
)

for category in all_categories:
    in_category = row_categories == category
    # Skip categories with no test rows: scoring empty inputs is undefined and
    # would corrupt the averaged scores.
    if not in_category.any():
        continue
    cat_scores = eval_data(y_test[in_category], y_pred[in_category])
    if category in categories:
        print((category, cat_scores))
    all_scores.append(cat_scores)

In [None]:
# Global: unweighted mean of each metric over the per-category score rows
# collected in all_scores.
print(np.mean(np.asarray(all_scores), axis=0))