In [10]:
import pickle

from ExtraSensoryModels.Models.early_fusion import EarlyFusion
from ExtraSensoryModels.Interfaces.ExtraSensoryAbstractModel import ExtraSensoryAbstractModel 
from sklearn.metrics import classification_report
from utils.TransformerUtils import get_X_y
from utils.ReadingTheDataUtils import get_dataframe

test_df = r"data/folds/train_2.csv"
logistic_path = r"data/models/best_models_per_fold/early_fusion_02-03-2020_10-58-43_fold_2"
#random_forest_path = r"data/models/best_models_per_fold/early_fusion_02-03-2020_10-58-43_fold_3"

with open(logistic_path, 'rb') as model:
    early_fusion_logistic = pickle.load(model)
    
# with open(random_forest_path, 'rb') as model:
#     early_fusion_random_forest = pickle.load(model)
test_df = get_dataframe(test_df)
X, y = get_X_y(test_df, early_fusion_logistic.get_pipe(), is_fitted=True)
logistic_predictions = early_fusion_logistic.predict(X)
print(classification_report(y, logistic_predictions))

              precision    recall  f1-score   support

           0       0.52      0.40      0.45     73246
           1       0.19      0.32      0.24     11080
           2       0.11      0.37      0.17      4553
           3       0.09      0.61      0.16      2267
           4       0.66      0.59      0.62    118886
           5       0.61      0.61      0.61     71905

    accuracy                           0.53    281937
   macro avg       0.36      0.48      0.38    281937
weighted avg       0.58      0.53      0.55    281937



In [None]:
from utils.ReadingTheDataUtils import get_dataframe
from utils.TransformerUtils import get_X_y

#test_df_path = r"C:\Users\itama\Desktop\courses\Project\Extra-Sensory-Yarden\src\Extra-Sensory\data\folds\test.csv"

test_df = get_dataframe(test_df)
X, y = get_X_y(test_df, early_fusion_logistic.get_pipe(), is_fitted=True)
logistic_predictions = early_fusion_logistic.predict(X)
#random_forest_predictions = early_fusion_random_forest.predict(X)



In [None]:
mask = logistic_predictions != random_forest_predictions
print(X.shape)
model_disagreament = X[mask]
model_disagreament_labels = y[mask]
random_forest_disagrement = random_forest_predictions[mask]
loggistic_regression_disagrement = logistic_predictions[mask]

In [None]:
#plot

import matplotlib.pyplot as plt
import numpy as np

y_counts = np.bincount(model_disagreament_labels)
random_counts = np.bincount(random_forest_disagrement)
logistic_count = np.bincount(loggistic_regression_disagrement)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

X = np.arange(y_counts.shape[0])

ax.bar(X - 0.25, random_counts, width=0.25)
ax.bar(X + 0.00, logistic_count, width=0.25)
ax.bar(X + 0.25, y_counts, width=0.25)

ax.legend(['random forest', 'logistic', 'y'])

plt.show()


In [None]:
logistic_mask = y == logistic_predictions
random_mask = y == random_forest_predictions

logistic_correctness_vector = logistic_mask + 0
random_correctness_vector = random_mask + 0

In [None]:
import pandas as pd
confusion_matrix = np.zeros((2,2), dtype=int)
np.add.at(confusion_matrix, [logistic_correctness_vector, random_correctness_vector], 1)
confusion_matrix = confusion_matrix / np.sum(confusion_matrix)
df = pd.DataFrame(confusion_matrix)

In [None]:
import seaborn as sn

sn.heatmap(df, annot=True, cmap="Blues")
plt.ylabel('Logistic regression')
plt.xlabel('Random forest')
plt.show()

In [None]:
for label in range(6):
    label_mask = y == label
    logistic = logistic_correctness_vector[label_mask]
    random = random_correctness_vector[label_mask]
    confusion_matrix = np.zeros((2,2), dtype=int)
    np.add.at(confusion_matrix, [logistic, random], 1)
    confusion_matrix = confusion_matrix / np.sum(confusion_matrix)
    df = pd.DataFrame(confusion_matrix)
    sn.heatmap(df, annot=True, cmap="Blues")
    plt.ylabel('Logistic regression')
    plt.xlabel('Random forest')
    print("Confution matrix for label", label)
    plt.show()

In [1]:
from utils.ReadingTheDataUtils import get_dataframe
from utils.TransformerUtils import get_X_y
import pandas as pd
import numpy as np

test_df_path = r"C:\Users\itama\Desktop\courses\Project\Extra-Sensory-Yarden\src\Extra-Sensory\data\folds\test.csv"
train_df = r"C:\Users\itama\Desktop\courses\Project\Extra-Sensory-Yarden\src\Extra-Sensory\data\folds\train.csv"

X_train = get_dataframe(train_df)
y_train = X_train['label']
X_train.drop(['label'], axis=1, inplace=True)

X_test = get_dataframe(test_df_path)
y_test = X_test['label']
X_test.drop(['label'], axis=1, inplace=True)

X_train = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
y = np.array(y).astype('uint8')                            

#data = pd.concat([train_df, test_df])

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
standar_scaler = StandardScaler()

num_cols = X_train.select_dtypes(exclude=['category']).columns
cat_cols = X_train.select_dtypes(include=['category']).columns
X_train_numeric = pd.DataFrame(standar_scaler.fit_transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
X_train_cat = X_train[cat_cols]

X_train = pd.concat([X_train_numeric, X_train_cat], axis=1)
X_train.fillna(0, inplace=True)
X = X_train

In [3]:
import itertools

from utils.GeneralUtils import ConfigManager
from utils.TransformerUtils import get_X_y
from sklearn.model_selection import GridSearchCV, GroupKFold
from ExtraSensoryModels.Interfaces.ExtraSensoryAbstractModel import ExtraSensoryAbstractModel
import ExtraSensoryModels.HyperParameterLearner 
from sklearn.linear_model import LogisticRegression

Cs = [0.001, 0.01, 0.1, 1, 10]
class_weight = ['balanced']
solver = ["lbfgs"]
warm_start = [True]
max_iter = [1000]
multi_class = ["multinomial"]


groups = pd.Series(X_train.index).astype('category').cat.codes
cv = GroupKFold(n_splits=5)

param_grid={'C': Cs,
            'solver': solver,
            'class_weight': class_weight,
            'max_iter': max_iter
           }

for split in [7,10]:
    k_folds_groups = GroupKFold(n_splits=split)
    best_estimator = GridSearchCV(estimator=LogisticRegression(),
                                  param_grid=param_grid,
                                  cv=k_folds_groups,
                                  refit=False,
                                  scoring='f1_macro',
                                  n_jobs=-1,
                                  verbose=10
                                  )
    best_estimator.fit(X, y, groups=groups)
    pd.DataFrame(best_estimator.cv_results_).to_csv(f'results_groups_{split}.csv')

Fitting 7 folds for each of 5 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 30.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 62.0min
[Parallel(n_jobs=-1)]: Done  24 out of  35 | elapsed: 111.1min remaining: 50.9min
[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed: 119.1min remaining: 29.8min
[Parallel(n_jobs=-1)]: Done  32 out of  35 | elapsed: 147.7min remaining: 13.8min
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 149.8min finished


Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 20.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 78.8min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 133.8min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 185.0min remaining: 40.6min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 222.8min remaining: 14.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 226.4min finished


In [6]:
pd.read_csv(f'results_groups_7.csv')

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_max_iter,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,mean_test_score,std_test_score,rank_test_score
0,0,811.759853,57.127577,5.912037,0.988736,0.001,balanced,1000,lbfgs,"{'C': 0.001, 'class_weight': 'balanced', 'max_...",0.136069,0.156294,0.192862,0.10657,0.115504,0.205387,0.196732,0.158324,0.037486,1
1,1,1123.306167,106.808928,4.222789,0.630612,0.01,balanced,1000,lbfgs,"{'C': 0.01, 'class_weight': 'balanced', 'max_i...",0.133208,0.154287,0.186392,0.111484,0.1125,0.19822,0.197184,0.155992,0.035364,5
2,2,2208.103003,179.974628,4.549732,0.442204,0.1,balanced,1000,lbfgs,"{'C': 0.1, 'class_weight': 'balanced', 'max_it...",0.134978,0.151105,0.185718,0.113036,0.113203,0.201608,0.197524,0.156532,0.035411,4
3,3,2875.444809,81.644406,3.713355,0.104521,1.0,balanced,1000,lbfgs,"{'C': 1, 'class_weight': 'balanced', 'max_iter...",0.138195,0.149974,0.185554,0.112484,0.113503,0.202963,0.197394,0.156946,0.035422,3
4,4,2422.051271,438.316854,2.024292,0.687706,10.0,balanced,1000,lbfgs,"{'C': 10, 'class_weight': 'balanced', 'max_ite...",0.140611,0.149745,0.185296,0.112038,0.113439,0.203012,0.197336,0.157151,0.035315,2


In [7]:
pd.read_csv(f'results_groups_10.csv')

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_max_iter,param_solver,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0,707.574773,116.753963,3.853309,0.33767,0.001,balanced,1000,lbfgs,"{'C': 0.001, 'class_weight': 'balanced', 'max_...",...,0.13362,0.146588,0.147988,0.173164,0.079776,0.134261,0.131152,0.134322,0.024943,5
1,1,1268.845062,111.10385,3.512181,0.350783,0.01,balanced,1000,lbfgs,"{'C': 0.01, 'class_weight': 'balanced', 'max_i...",...,0.131235,0.141168,0.150494,0.175925,0.088771,0.135657,0.133791,0.135116,0.022804,4
2,2,2430.88856,93.054886,3.214911,0.630764,0.1,balanced,1000,lbfgs,"{'C': 0.1, 'class_weight': 'balanced', 'max_it...",...,0.129843,0.140009,0.151868,0.179627,0.089847,0.136829,0.13528,0.135405,0.02326,3
3,3,3121.065937,140.311149,3.008109,0.351626,1.0,balanced,1000,lbfgs,"{'C': 1, 'class_weight': 'balanced', 'max_iter...",...,0.129497,0.139197,0.1523,0.191766,0.089658,0.137209,0.135599,0.136439,0.025868,2
4,4,2859.687326,345.076724,1.897184,0.895183,10.0,balanced,1000,lbfgs,"{'C': 10, 'class_weight': 'balanced', 'max_ite...",...,0.129718,0.139278,0.152472,0.196578,0.089417,0.137259,0.13552,0.136789,0.027119,1
