In [None]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import gc

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from General import *
from ReadingTheDataUtils import *
from Classifiers import *
from PreproccesUtils import *
from PerformanceEvalutionUtils import *

# Make the notebook automatically reload external python modules
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

# Project locations, all relative to the directory the notebook is started from.
ROOT_PATH = Path('..')
SRC_PATH = Path('.')
DATA_PATH = ROOT_PATH / 'data'
# ExtraSensory per-uuid features/labels location and the suffix of its CSV files
CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_features_labels'
CSV_SUFFIX = '.features_labels.csv'
# ExtraSensory per-uuid original-labels location and the suffix of its CSV files
# (the "LABLES" spelling is kept because other code may reference these names)
ORIGINAL_LABLES_CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_original_labels'
ORIGINAL_LABLES_CSV_SUFFIX = '.original_labels.csv'
# Location of the cross-validation fold definitions, passed to get_folds_list below
FOLD_PATH = DATA_PATH / 'cv_5_folds'

# Data Exploration

In [None]:
%%time
data = pd.read_csv(DATA_PATH / 'dataset.csv', index_col='uuid')

In [None]:
data.label_name.unique()

In [None]:
data.info(verbose=True, null_counts=True)

In [None]:
# The label and every feature whose name starts with 'discrete' are categorical;
# cast them so that later steps can tell them apart from the numeric features.
categorical_columns = ['label']
categorical_columns.extend(name for name in data.columns if name.startswith('discrete'))

for name in categorical_columns:
    data[name] = data[name].astype('category')

In [None]:
data.info(verbose=True, null_counts=True)

# Model 

### Learn all classifiers by folds 

In [None]:
train_folds_list, test_folds_list = get_folds_list(FOLD_PATH)

In [None]:
train_folds_list, test_folds_list

In [None]:
def get_cols_to_scale(i_X_fold_train):
    """Split the column names of a feature frame by dtype.

    Args:
        i_X_fold_train: DataFrame holding the features of a training fold.

    Returns:
        A tuple ``(cols_to_scale, categorial_cols)`` of pandas Index objects:
        the columns that are NOT of 'category' dtype, followed by the columns
        that are.
    """
    category_dtype = ['category']
    categorical_columns = i_X_fold_train.select_dtypes(include=category_dtype).columns
    non_categorical_columns = i_X_fold_train.select_dtypes(exclude=category_dtype).columns

    return non_categorical_columns, categorical_columns

def get_X_data_dummies(i_X_fold_train, i_X_fold_test):
    """One-hot encode the train and the test features independently.

    Missing values do not get an indicator column of their own
    (``dummy_na=False``) and both results are returned with a fresh
    0..n-1 index.

    Args:
        i_X_fold_train: DataFrame with the training features to encode.
        i_X_fold_test: DataFrame with the test features to encode.

    Returns:
        A tuple ``(X_train, X_test)`` with the encoded frames.
    """
    def _one_hot(i_frame):
        # reset_index returns a new frame; the caller's frame is left untouched
        encoded = pd.get_dummies(i_frame, dummy_na=False)
        return encoded.reset_index(drop=True)

    return _one_hot(i_X_fold_train), _one_hot(i_X_fold_test)

def data_preproccessing(i_data, i_train_folds_list, i_test_folds_list):
    """Build the model-ready train/test matrices for one cross-validation fold.

    Non-categorical columns go through ``standard_data_scaling`` and
    categorical columns are one-hot encoded; the two parts are then
    concatenated column-wise.

    Args:
        i_data: The full dataset.
        i_train_folds_list: Fold description selecting the training rows
            (passed through to ``get_folds_train_and_test``).
        i_test_folds_list: Fold description selecting the test rows.

    Returns:
        A tuple ``(X_train, X_test, y_fold_train, y_fold_test)``.
    """
    # Materialise the fold and separate the features from the labels
    train_df, test_df = get_folds_train_and_test(i_data, i_train_folds_list, i_test_folds_list)
    features_train, features_test, y_fold_train, y_fold_test = \
        split_fold_data_to_features_and_labels(train_df, test_df)

    # The column split is decided on the training features and applied to both sets
    numeric_cols, category_cols = get_cols_to_scale(features_train)

    scaled_train, scaled_test = standard_data_scaling(features_train[numeric_cols],
                                                      features_test[numeric_cols])
    onehot_train, onehot_test = get_X_data_dummies(features_train[category_cols],
                                                   features_test[category_cols])

    # NOTE(review): the one-hot frames carry a fresh 0..n-1 index; the concat below
    # aligns on the index, so the scaled frames presumably use the same index - confirm.
    return (
        pd.concat([scaled_train, onehot_train], axis=1),
        pd.concat([scaled_test, onehot_test], axis=1),
        y_fold_train,
        y_fold_test,
    )

In [None]:
# %%time
# classifiers = dict()

# for train_fold_lst, test_fold_lst in zip(train_folds_list, test_folds_list):
#     gc.collect()
    
#     # Preprocess the data
#     standard_X_train, standard_X_test, y_fold_train, y_fold_test = \
#                 data_preproccessing(data, train_fold_lst, test_fold_lst)
    
# #     handle_nulls_in_X(standard_X_train, standard_X_test)
    
#     # Learn all models sync
# #     models_tuple = learn_all_models_sync(standard_X_train, y_fold_train)

#     # Learn all model async
#     models_tuple = learn_all_models_async(standard_X_train, y_fold_train, i_c_score_grid_search=True)
    
#     # Put each fold result in the classifiers dict
#     classifiers.setdefault('single_sensor_classifier', []).append(models_tuple[0])
#     classifiers.setdefault('early_fusion_classifier', []).append(models_tuple[1])

In [None]:
# classifiers

In [None]:
import pickle

filename = 'classifiers_dict'

In [None]:
# # Write the model to a file
# with open(filename, 'wb') as outfile:
#     pickle.dump(classifiers, outfile)

# Performance evaluation

In [None]:
# Read the model from a file
# NOTE(review): unpickling executes arbitrary code stored in the file, so only
# load a 'classifiers_dict' file that was produced by this project.
with open(filename, 'rb') as infile:
    classifiers_dict = pickle.load(infile)

In [None]:
classifiers_dict

In [None]:
from Classifiers import NUM_OF_LABELS

def get_class_weights_in_data(i_y):
    """Count how many samples of each class appear in ``i_y``.

    Args:
        i_y: Array-like of class labels.

    Returns:
        numpy array with one count per distinct label, ordered by sorted label value.

    Raises:
        Exception: If the number of distinct labels found in ``i_y`` is not
            ``NUM_OF_LABELS`` (e.g. a class is missing from the data).
    """
    _, class_counts = np.unique(np.array(i_y), return_counts=True)

    if len(class_counts) == NUM_OF_LABELS:
        return class_counts

    raise Exception(f"class_counts length is diffrent from {NUM_OF_LABELS}")

def get_test_weights(i_y_test):
    """Return the per-class sample counts of a test fold's labels.

    Thin wrapper around ``get_class_weights_in_data``; raises the same
    exception when a class is missing from ``i_y_test``.
    """
    test_class_counts = get_class_weights_in_data(i_y_test)
    return test_class_counts
    

# Per-class sample counts of the test folds, accumulated over all folds and
# normalised to fractions after the loop.
test_class_weights = np.zeros((NUM_OF_LABELS, ), dtype='int')


# For every fold: rebuild the train/test matrices, then add the fold's results
# of the stored single-sensor models and of the two late-fusion variants
# (LFA, LFL) to the running state arrays.
for i, (train_fold_lst, test_fold_lst) in enumerate(zip(train_folds_list, test_folds_list)):
    gc.collect()
    
    is_first_iteration = i == 0

    
    # Preprocess the data
    standard_X_train, standard_X_test, y_fold_train, y_fold_test = \
                data_preproccessing(data, train_fold_lst, test_fold_lst)
    
    # The counts are taken BEFORE any test rows are dropped further down, so they
    # describe the complete test fold rather than the rows that are evaluated.
    test_class_weights = test_class_weights + get_test_weights(y_fold_test)
    print(test_class_weights)
    
    # The state containers are created once and then accumulated across folds.
    # NOTE(review): EF_states is created here but never updated in this cell.
    if is_first_iteration:
        single_sensors_states_dict, EF_states, LFA_states, LFL_states = get_states_arrays(data)
    
    # Get rows with all sensors data
    feature_names = get_feature_names(standard_X_train, ['label'])  # In this case we use the data with our label!
    sensor_names = get_sensor_names(feature_names)
    
    # Give the labels a fresh 0..n-1 index so they can be dropped by the index of
    # standard_X_test below - assumes standard_X_test also has a 0..n-1 index (TODO confirm).
    y_fold_test.reset_index(drop=True, inplace=True)

    # Drop every test row in which ALL columns of some sensor are null,
    # together with the matching label.
    for _, sensor_cols_name_in_data in sensor_names.items():
        mask = standard_X_test[sensor_cols_name_in_data].isnull().all(1)
        idx_to_drop = standard_X_test[mask].index
        
        standard_X_test.drop(idx_to_drop, axis=0, inplace=True)
        y_fold_test.drop(idx_to_drop, axis=0, inplace=True)

    # Return value is not used - presumably fills the remaining nulls in place (confirm in the helper).
    handle_nulls_in_X(standard_X_train, standard_X_test)
    
    # Models that were trained for this fold and loaded from the pickle file
    single_sensor_models = classifiers_dict['single_sensor_classifier'][i]
    
    # Each single sensor model
    # Return value is not used - presumably updates single_sensors_states_dict in place.
    get_single_sensor_state(single_sensors_states_dict, standard_X_test, y_fold_test, single_sensor_models)

    # LFA
    LFA_states = LFA_states + get_LFA_state(standard_X_test, y_fold_test,  single_sensor_models)
        
    # LFL
    LFL_states = LFL_states + get_LFL_state(standard_X_train, y_fold_train, standard_X_test, y_fold_test, single_sensor_models)


# Turn the accumulated counts into class fractions that sum to 1
test_class_weights = (test_class_weights / test_class_weights.sum())

In [None]:
LFL_states

In [None]:
test_class_weights

### Micro averaging

In [None]:
# Micro averaging: for every model the TP/TN/FP/FN counts of all classes are
# pooled first and the metrics are computed once from the pooled counts.
evalutions_dict = dict()

# Sensor names are derived from the feature columns of the last processed test fold
feature_names = get_feature_names(standard_X_test, ['label'])  # In this case we are using the data with our label!
sensor_names = get_sensor_names(feature_names)

# Every evaluated model with its accumulated state array: the single-sensor
# models first, followed by the two late-fusion variants.
model_states = [(sensor_name, single_sensors_states_dict[sensor_name]) for sensor_name in sensor_names]
model_states += [("LFA", LFA_states), ("LFL", LFL_states)]

for model_name, model_state in model_states:
    # Rows 0..3 of a state array hold TP, TN, FP and FN respectively
    TP, TN, FP, FN = (model_state[row].sum() for row in range(4))
    sensitivity, specifisity, accuracy, precision, BA, F1 = \
        get_evaluations_metric_scores(TP, TN, FP, FN)
    insert_values_to_evaluations_dict(evalutions_dict, model_name,
                                      sensitivity, specifisity, accuracy, precision, BA, F1)

pd.DataFrame.from_dict(evalutions_dict)

### Macro averaging

In [None]:
# Macro averaging: the metrics are computed per class and then averaged with
# equal weight for every class.
evalutions_dict = dict()

# Sensor names are derived from the feature columns of the last processed test fold
feature_names = get_feature_names(standard_X_test, ['label'])  # In this case we are using the data with our label!
sensor_names = get_sensor_names(feature_names)

# Every evaluated model with its accumulated state array: the single-sensor
# models first, followed by the two late-fusion variants.
model_states = [(sensor_name, single_sensors_states_dict[sensor_name]) for sensor_name in sensor_names]
model_states += [("LFA", LFA_states), ("LFL", LFL_states)]

for model_name, model_state in model_states:
    # Running sum of the six metric scores; float, because the scores are fractions
    scores_arr = np.zeros((6, ))

    for c in range(NUM_OF_LABELS):
        # Column c of a state array holds the TP, TN, FP, FN counts of class c
        class_state = model_state[:, c]
        TP, TN, FP, FN = class_state[0], class_state[1], class_state[2], class_state[3]
        scores_arr = scores_arr + get_evaluations_metric_scores(TP, TN, FP, FN)

    scores_arr = scores_arr / NUM_OF_LABELS

    sensitivity, specifisity, accuracy, precision, BA, F1 = scores_arr
    insert_values_to_evaluations_dict(evalutions_dict, model_name,
                                      sensitivity, specifisity, accuracy, precision, BA, F1)

pd.DataFrame.from_dict(evalutions_dict)

### Weighted averaging

In [None]:
# Weighted averaging: the metrics are computed per class and then averaged
# using the class fractions of the test folds (test_class_weights) as weights.
evalutions_dict = dict()

# Sensor names are derived from the feature columns of the last processed test fold
feature_names = get_feature_names(standard_X_test, ['label'])  # In this case we are using the data with our label!
sensor_names = get_sensor_names(feature_names)

# Every evaluated model with its accumulated state array: the single-sensor
# models first, followed by the two late-fusion variants.
model_states = [(sensor_name, single_sensors_states_dict[sensor_name]) for sensor_name in sensor_names]
model_states += [("LFA", LFA_states), ("LFL", LFL_states)]

for model_name, model_state in model_states:
    # One row of six metric scores per class
    scores_arr = []

    for c in range(NUM_OF_LABELS):
        # Column c of a state array holds the TP, TN, FP, FN counts of class c
        class_state = model_state[:, c]
        TP, TN, FP, FN = class_state[0], class_state[1], class_state[2], class_state[3]

        scores_arr.append(get_evaluations_metric_scores(TP, TN, FP, FN))

    # (NUM_OF_LABELS,) . (NUM_OF_LABELS, 6) -> the six class-weighted scores
    scores_arr = np.dot(test_class_weights, np.array(scores_arr))

    sensitivity, specifisity, accuracy, precision, BA, F1 = scores_arr
    insert_values_to_evaluations_dict(evalutions_dict, model_name,
                                      sensitivity, specifisity, accuracy, precision, BA, F1)

pd.DataFrame.from_dict(evalutions_dict)

# =====================================

# EF training using the pipelines!

### Train

In [None]:
def get_single_pre_pipe(X):
    """Build the (unfitted) preprocessing pipeline for the early-fusion models.

    Columns of 'category' dtype are imputed with their most frequent value and
    then one-hot encoded; all other columns are standardised and then have
    their missing values replaced by the column mean. Both branches live in a
    single ColumnTransformer, so each step sees exactly the columns it is
    meant for. In the transformed matrix the one-hot columns come first,
    followed by the numeric columns.

    Args:
        X: DataFrame whose columns and dtypes define the layout the pipeline
            will be fitted on. Only the column positions are read from it.

    Returns:
        An unfitted sklearn Pipeline wrapping the ColumnTransformer.
    """
    # Positions are taken from X itself, not from a notebook-level variable
    cat_cols = X.select_dtypes(include=["category"]).columns
    cat_cols_indices = [X.columns.get_loc(c) for c in cat_cols]
    num_cols = X.select_dtypes(exclude=["category"]).columns
    num_cols_indices = [X.columns.get_loc(c) for c in num_cols]

    # Impute before encoding, so that a missing value never becomes a category
    categorical_steps = Pipeline(
        [
            ('SimpleImputerCat', SimpleImputer(strategy="most_frequent")),
            ('ohe', OneHotEncoder(sparse=False))
        ]
    )
    # StandardScaler ignores NaNs when fitting and keeps them when transforming,
    # so imputing the mean afterwards fills the gaps of the scaled column
    numeric_steps = Pipeline(
        [
            ('scaler', StandardScaler()),
            ('SimpleImputerNum', SimpleImputer(strategy='mean'))
        ]
    )
    column_transformer = ColumnTransformer(
        [
            ('categorical', categorical_steps, cat_cols_indices),
            ('numeric', numeric_steps, num_cols_indices)
        ]
    )
    pipe = Pipeline(
        [
            ('column_transformer', column_transformer)
        ], verbose=True
    )

    return pipe


In [None]:
# Per-fold early-fusion artefacts; position k of every list belongs to fold k.
ef_models = {
    'pipe': [],  # preprocessing pipelines
    'lr': [],    # 'lr' models
    'rf': [],    # 'rf' models
    'gnb': []    # 'gnb' models
}

### For each fold learn classifier

In [None]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import check_X_y


# Fit a preprocessing pipeline and the early-fusion models on the training part
# of every fold and store them in ef_models.
# NOTE(review): in its current state this cell only fits and stores the pipeline
# and a GaussianNB for the FIRST fold (see the `break` at the end). Cells that
# index ef_models['rf'], ef_models['lr'] or folds other than 0 cannot run until
# the disabled parts below are restored.
for train_fold_lst, test_fold_lst in zip(train_folds_list, test_folds_list):
    gc.collect()

    train_fold_df, test_fold_df = get_folds_train_and_test(data, train_fold_lst, test_fold_lst)
    
    X_fold_train, _, y_fold_train, _ = \
            split_fold_data_to_features_and_labels(train_fold_df, test_fold_df)
    
    # The pipeline is fitted on the training features only
    pre_pipe = get_single_pre_pipe(X_fold_train)
    X = pre_pipe.fit_transform(X_fold_train)
    X, y = check_X_y(X, y_fold_train)
    
    
    # Random Forest
    # NOTE(review): the grid search is constructed but never fitted nor stored.
    rf_param_grid = {'max_depth': [8, 16, 32],
                     'class_weight': ['balanced'],
                     'n_estimators': [8, 16, 32]}
    rf_grid = GridSearchCV(RandomForestClassifier(),
                           param_grid=rf_param_grid,
                           verbose=3,
                           n_jobs=-1,
                           scoring='f1_weighted',
                           cv=3)
#     rf_grid = RandomForestClassifier(n_jobs=-1, verbose=3)
#     rf_grid.fit(X, y)

#     # Logistic Regression
#     lr_param_grid = {'C': [0.01, 0.1, 1, 10],
#                      'penalty' : ['l1', 'l2'],
#                      'max_iter': [100],
#                      'class_weight': ['balanced']}
#     lr_grid = GridSearchCV(LogisticRegression(), param_grid=lr_param_grid, verbose=5, n_jobs=1, scoring='f1_weighted', cv=None)
#     lr_grid = LogisticRegression(n_jobs=-1, verbose=3)
#     lr_grid.fit(X, y)
    
#     # Gaussian Naive Bayes
#     gnb_param_grid = {}
#     gnb_grid = GridSearchCV(GaussianNB(), param_grid=gnb_param_grid, verbose=5, n_jobs=1, scoring='f1_weighted', cv=None)
    # Despite the name this is a plain GaussianNB, not a grid search
    gnb_grid = GaussianNB()
    gnb_grid.fit(X, y)
    
    ef_models['pipe'].append(pre_pipe)
#     ef_models['lr'].append(lr_grid)
#     ef_models['rf'].append(rf_grid)
    ef_models['gnb'].append(gnb_grid)
    
    # Stops after the first fold
    break

In [None]:
# Inspect the fitted parameters of the first fold's GaussianNB. Every line is
# displayed because ast_node_interactivity is set to "all" at the top.
# NOTE(review): newer scikit-learn releases expose `sigma_` as `var_` - confirm
# against the installed version.
ef_models['gnb'][0].class_count_
ef_models['gnb'][0].class_prior_
ef_models['gnb'][0].classes_
ef_models['gnb'][0].epsilon_
ef_models['gnb'][0].sigma_
ef_models['gnb'][0].theta_

In [None]:
# Print the best_params_ of every stored 'rf' entry.
# Prints nothing while ef_models['rf'] is empty.
for grid in ef_models['rf']:
    print(grid.best_params_)

### For each fold get model scores

### Score on the train

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Per-model list of scores on the TRAINING part of every fold
model_f1_scores = {
    'lr': [],
    'rf': [],
    'gnb': []
}

model_accuracy_score = {
    'lr': [],
    'rf': [],
    'gnb': []
}

for idx, (train_fold_lst, test_fold_lst) in enumerate(zip(train_folds_list, test_folds_list)):
    gc.collect()

    # Split to X, y
    train_fold_df, test_fold_df = get_folds_train_and_test(data, train_fold_lst, test_fold_lst)

    X_fold_train, _, y_fold_train, _ = \
            split_fold_data_to_features_and_labels(train_fold_df, test_fold_df)

    # Reuse the pipeline that was fitted on this fold; do not refit it here
    X = ef_models['pipe'][idx].transform(X_fold_train)
    X, y = check_X_y(X, y_fold_train)

    # predict
#     lr_pred = ef_models['lr'][idx].predict(X)
    rf_pred = ef_models['rf'][idx].predict(X)
#     gnb_pred = ef_models['gnb'][idx].predict(X)

    # Save
    # No sample_weight: passing the labels as weights would scale every sample
    # by its class value (and ignore samples whose label is 0).
#     model_f1_scores['lr'].append(f1_score(y, lr_pred, average='weighted'))
    model_f1_scores['rf'].append(f1_score(y, rf_pred, average='weighted'))
#     model_f1_scores['gnb'].append(f1_score(y, gnb_pred, average='weighted'))

    # The accuracy dict holds real accuracy scores, not a second copy of F1
#     model_accuracy_score['lr'].append(accuracy_score(y, lr_pred))
    model_accuracy_score['rf'].append(accuracy_score(y, rf_pred))
#     model_accuracy_score['gnb'].append(accuracy_score(y, gnb_pred))


# Mean weighted F1 over the folds, only for the models that were actually scored
for model_name, model_scores in model_f1_scores.items():
    if not model_scores:
        continue

    model_score_mean = np.mean(model_scores)

    print(f'{model_name}: {model_score_mean}')

### Score on the test

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Per-model list of scores on the TEST part of every fold
model_f1_scores = {
    'lr': [],
    'rf': [],
    'gnb': []
}

model_accuracy_score = {
    'lr': [],
    'rf': [],
    'gnb': []
}

for idx, (train_fold_lst, test_fold_lst) in enumerate(zip(train_folds_list, test_folds_list)):
    gc.collect()

    # split to X, y
    train_fold_df, test_fold_df = get_folds_train_and_test(data, train_fold_lst, test_fold_lst)

    _, X_fold_test, _, y_fold_test = \
            split_fold_data_to_features_and_labels(train_fold_df, test_fold_df)

    # Apply the pipeline fitted on this fold's training data; do not refit it here
    X = ef_models['pipe'][idx].transform(X_fold_test)
    X, y = check_X_y(X, y_fold_test)

    # predict
#     lr_pred = ef_models['lr'][idx].predict(X)
    rf_pred = ef_models['rf'][idx].predict(X)
#     gnb_pred = ef_models['gnb'][idx].predict(X)

    # Save
    # No sample_weight: passing the labels as weights would scale every sample
    # by its class value (and ignore samples whose label is 0).
#     model_f1_scores['lr'].append(f1_score(y, lr_pred, average='weighted'))
    model_f1_scores['rf'].append(f1_score(y, rf_pred, average='weighted'))
#     model_f1_scores['gnb'].append(f1_score(y, gnb_pred, average='weighted'))

    # The accuracy dict holds real accuracy scores, not a second copy of F1
#     model_accuracy_score['lr'].append(accuracy_score(y, lr_pred))
    model_accuracy_score['rf'].append(accuracy_score(y, rf_pred))
#     model_accuracy_score['gnb'].append(accuracy_score(y, gnb_pred))


# Mean weighted F1 over the folds, only for the models that were actually scored
for model_name, model_scores in model_f1_scores.items():
    if not model_scores:
        continue

    model_score_mean = np.mean(model_scores)

    print(f'{model_name}: {model_score_mean}')

### Compare mean of each label % in the predictions

In [None]:
import matplotlib.pyplot as plt


def _label_fractions(i_labels, i_class_labels):
    """Return, for every class in i_class_labels, the fraction of i_labels equal to it."""
    labels = np.asarray(i_labels)
    return np.array([(labels == class_label).mean() for class_label in i_class_labels])


# Per fold: (true labels, lr predictions, rf predictions, gnb predictions)
fold_labels = []

for idx, (train_fold_lst, test_fold_lst) in enumerate(zip(train_folds_list, test_folds_list)):
    gc.collect()

    train_fold_df, test_fold_df = get_folds_train_and_test(data, train_fold_lst, test_fold_lst)

    _, X_fold_test, _, y_fold_test = \
            split_fold_data_to_features_and_labels(train_fold_df, test_fold_df)

    # TODO: send the right params!
    X = ef_models['pipe'][idx].transform(X_fold_test)
    X, y = check_X_y(X, y_fold_test)

    # pred
    lr_pred = ef_models['lr'][idx].predict(X)
    rf_pred = ef_models['rf'][idx].predict(X)
    gnb_pred = ef_models['gnb'][idx].predict(X)

    fold_labels.append((y, lr_pred, rf_pred, gnb_pred))

# One fixed, sorted list of classes for every fold and every model. Counting with
# np.unique per array would silently skip classes that a model never predicts,
# giving arrays of different lengths whose positions no longer mean the same class.
class_labels = np.unique(np.concatenate([labels for fold in fold_labels for labels in fold]))
num_of_folds = len(fold_labels)

# Mean share of every class over the folds (the shares of each result sum to 1)
y_s, lr_s, rf_s, gnb_s = (
    sum(_label_fractions(fold[position], class_labels) for fold in fold_labels) / num_of_folds
    for position in range(4)
)

In [None]:
# Grouped bar chart of the mean share of every label: one group per label
# position, one bar each for the lr, rf and gnb predictions and for the true
# labels (y).
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

# One x position per entry of y_s; the four bars of a group are shifted around it
X = np.arange(y_s.shape[0])

ax.bar(X - 0.20, lr_s, width=0.20)
ax.bar(X + 0.00, rf_s, width=0.20)
ax.bar(X + 0.20, gnb_s, width=0.20)
ax.bar(X + 0.40, y_s, width=0.20)

# Legend entries follow the order in which the bars were drawn above
ax.legend(['lr', 'rf', 'gnb', 'y'])

plt.show()

# ========================================