# ETL 

In [1]:
from fitizens_libraries.load_and_process_training_data import load_training_data
from fitizens_libraries.load_timeseries import load_timeseries_data
from custom_libraries.merge_data import merge_data
from fitizens_libraries.plot_labeled_sequences import plot_labeled_sequence
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot
from fitizens_libraries.build_dataframe_from_list_of_signals import build_dataframe
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, auc, roc_auc_score
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from pycaret.classification import *

# Creating Auxiliary Functions

def create_custom_dataframe(series):
    df =  series[["linAccX", "linAccY", "linAccZ", "gyroX", "gyroY", "gyroZ", "magnX", "magnY", "magnZ"]]
    return df

In [2]:
def create_custom_dataframe(series):
    df =  series[["linAccX", "linAccY", "linAccZ", "gyroX", "gyroY", "gyroZ", "magnX", "magnY", "magnZ"]]
    return df

In [3]:
def create_training_data_stats(df, target):
    stats_dict = {}
    series = create_custom_dataframe(df)
    
    for column in series.columns:
        mean = series[column].mean()
        std = series[column].std()
        median = series[column].median()
        skewness = series[column].skew()
        kurtosis = series[column].kurtosis()
        min_val = series[column].min()
        max_val = series[column].max()
        range_val = max_val - min_val
        quartile_25 = np.percentile(series[column], 25)
        quartile_75 = np.percentile(series[column], 75)
        iqr = quartile_75 - quartile_25
    

        stats_dict[f"{column}_mean"] = mean
        stats_dict[f"{column}_std"] = std
        stats_dict[f"{column}_median"] = median
        stats_dict[f"{column}_skewness"] = skewness
        stats_dict[f"{column}_kurtosis"] = kurtosis
        stats_dict[f"{column}_min"] = min_val
        stats_dict[f"{column}_max"] = max_val
        stats_dict[f"{column}_range"] = range_val
        stats_dict[f"{column}_quartile_25"] = quartile_25
        stats_dict[f"{column}_quartile_75"] = quartile_75
        stats_dict[f"{column}_iqr"] = iqr
          
    stats_dict["target"] = target
    return stats_dict

## LOADING DATA

In [4]:
folder_path = "SIT_UP"
os.makedirs(folder_path, exist_ok=True)
file_names = [f"{folder_path}/{name}" for name in os.listdir(folder_path)]
signals = ['accX', 'accY', 'accZ', 'gyroX', 'gyroY', 'gyroZ', 'magnX', 'magnY', 'magnZ', 'linAccX', 'linAccY', 'linAccZ']

data, wk = load_training_data(filelist=file_names,
                         signals= signals,
                          target_exercise="SIT_UP", other_exercises=[],is_peak_minima=True)
#data[0]

## CREATING NEW DATA

In [5]:
data_info = [create_training_data_stats(info["series"], info["target"]) for info in data] #calling the two functions
data_custom = pd.DataFrame(data_info) #creating the DF
data_custom.head()

Unnamed: 0,linAccX_mean,linAccX_std,linAccX_median,linAccX_skewness,linAccX_kurtosis,linAccX_min,linAccX_max,linAccX_range,linAccX_quartile_25,linAccX_quartile_75,...,magnZ_median,magnZ_skewness,magnZ_kurtosis,magnZ_min,magnZ_max,magnZ_range,magnZ_quartile_25,magnZ_quartile_75,magnZ_iqr,target
0,0.388809,0.279572,0.366983,-0.042204,-1.106181,-0.123898,0.817209,0.941108,0.1933,0.647342,...,-133.774247,0.967084,-0.38436,-141.222322,-71.998178,69.224145,-140.462345,-109.304928,31.157417,SIT_UP
1,0.372222,0.279694,0.385858,-0.014031,-1.209217,-0.098039,0.830182,0.928221,0.124379,0.603874,...,-132.795818,0.790671,-0.854375,-143.417692,-67.077372,76.340321,-142.322815,-100.734702,41.588113,SIT_UP
2,0.504073,0.243515,0.544523,-0.406295,-1.021661,-0.028379,0.835814,0.864194,0.342034,0.718465,...,-133.472091,0.855847,-0.719118,-142.445329,-70.284936,72.160392,-141.375007,-103.942723,37.432284,SIT_UP
3,0.517968,0.328093,0.542358,-0.216895,-0.994067,-0.067893,1.084048,1.151941,0.297515,0.795397,...,-132.888973,0.866073,-0.70264,-141.558153,-64.709851,76.848303,-140.867072,-101.929367,38.937705,SIT_UP
4,0.597573,0.268827,0.677022,-0.881459,-0.419613,-0.040277,0.954099,0.994376,0.446828,0.775891,...,-129.410661,0.867999,-0.70875,-137.873103,-65.389778,72.483325,-136.585987,-99.882959,36.703028,SIT_UP


In [None]:
len(data_custom)

# ML WITH PYCARET

In [6]:
data_dev = data_custom.sample(frac=0.95, random_state=786)
data_prod = data_custom.drop(data_dev.index)

data_dev.reset_index(inplace=True, drop=True)
data_prod.reset_index(inplace=True, drop=True)

print('Data for Modeling: ' + str(data_dev.shape))
print('Simulated data For Production ' + str(data_prod.shape))

Data for Modeling: (505, 100)
Simulated data For Production (27, 100)


In [7]:
model = setup(
    # Basic options
    data = data_dev,
    target = "target",
    train_size = 0.8, 
    preprocess = True,
    
    # Dealing with multicollinearity
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.9,
        
    # Feature normalization with outliers
    normalize = True,
    normalize_method = 'robust',
        
    # Paralellization options
    n_jobs = - 1,
    use_gpu = False,
    
    # Imbalance Dataset
    fix_imbalance=True,

    remove_outliers= True, 
    outliers_threshold= 0.03,
    
    # Feature Importance
    feature_selection = True,
    n_features_to_select= 10
)

[LightGBM] [Info] Number of positive: 333, number of negative: 333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14030
[LightGBM] [Info] Number of data points in the train set: 666, number of used features: 63
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Description,Value
0,Session id,2444
1,Target,target
2,Target type,Binary
3,Target mapping,"NO_EXERCISE: 0, SIT_UP: 1"
4,Original data shape,"(505, 100)"
5,Transformed data shape,"(767, 11)"
6,Transformed train set shape,"(666, 11)"
7,Transformed test set shape,"(101, 11)"
8,Numeric features,99
9,Preprocess,True


In [8]:
models = compare_models(sort="F1", fold=2)
models

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9926,0.9989,0.9926,0.9927,0.9926,0.9702,0.9703,3.015
et,Extra Trees Classifier,0.9901,0.999,0.9901,0.9904,0.9902,0.9603,0.9607,0.505
catboost,CatBoost Classifier,0.9876,0.9993,0.9876,0.9881,0.9877,0.9507,0.9511,2.59
nb,Naive Bayes,0.9827,0.9981,0.9827,0.986,0.9834,0.936,0.9396,2.715
qda,Quadratic Discriminant Analysis,0.9827,0.9985,0.9827,0.9828,0.9823,0.9266,0.9284,2.635
xgboost,Extreme Gradient Boosting,0.9802,0.9969,0.9802,0.9824,0.9808,0.9243,0.9264,0.345
ada,Ada Boost Classifier,0.9802,0.9968,0.9802,0.9799,0.98,0.9177,0.9181,0.345
lr,Logistic Regression,0.9777,0.9982,0.9777,0.9799,0.9783,0.9139,0.916,3.105
knn,K Neighbors Classifier,0.9752,0.9836,0.9752,0.978,0.9759,0.9048,0.9076,2.62
lightgbm,Light Gradient Boosting Machine,0.9752,0.9976,0.9752,0.9763,0.9756,0.9022,0.9028,0.48


In [9]:
cnt_models_df = pull()
cnt_models_df

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9926,0.9989,0.9926,0.9927,0.9926,0.9702,0.9703,3.015
et,Extra Trees Classifier,0.9901,0.999,0.9901,0.9904,0.9902,0.9603,0.9607,0.505
catboost,CatBoost Classifier,0.9876,0.9993,0.9876,0.9881,0.9877,0.9507,0.9511,2.59
nb,Naive Bayes,0.9827,0.9981,0.9827,0.986,0.9834,0.936,0.9396,2.715
qda,Quadratic Discriminant Analysis,0.9827,0.9985,0.9827,0.9828,0.9823,0.9266,0.9284,2.635
xgboost,Extreme Gradient Boosting,0.9802,0.9969,0.9802,0.9824,0.9808,0.9243,0.9264,0.345
ada,Ada Boost Classifier,0.9802,0.9968,0.9802,0.9799,0.98,0.9177,0.9181,0.345
lr,Logistic Regression,0.9777,0.9982,0.9777,0.9799,0.9783,0.9139,0.916,3.105
knn,K Neighbors Classifier,0.9752,0.9836,0.9752,0.978,0.9759,0.9048,0.9076,2.62
lightgbm,Light Gradient Boosting Machine,0.9752,0.9976,0.9752,0.9763,0.9756,0.9022,0.9028,0.48


In [10]:
clf = create_model('et', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9901,0.999,0.9901,0.9901,0.9901,0.9597,0.9597
1,0.995,1.0,0.995,0.9952,0.9951,0.9802,0.9803
Mean,0.9926,0.9995,0.9926,0.9927,0.9926,0.9699,0.97
Std,0.0025,0.0005,0.0025,0.0026,0.0025,0.0102,0.0103


In [11]:
tuned_clf = tune_model(clf, optimize = 'F1', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9901,0.9972,0.9901,0.9901,0.9901,0.9597,0.9597
1,0.9901,0.999,0.9901,0.9907,0.9902,0.9609,0.9616
Mean,0.9901,0.9981,0.9901,0.9904,0.9902,0.9603,0.9607
Std,0.0,0.0009,0.0,0.0003,0.0001,0.0006,0.0009


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [12]:
print("Total of features: ", len(tuned_clf.feature_importances_))

Total of features:  10


In [13]:
evaluate_model(tuned_clf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [14]:
model_final = finalize_model(tuned_clf)

In [15]:
save_model(model_final, 'situp_upModel')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['linAccX_mean', 'linAccX_std',
                                              'linAccX_median',
                                              'linAccX_skewness',
                                              'linAccX_kurtosis', 'linAccX_min',
                                              'linAccX_max', 'linAccX_range',
                                              'linAccX_qua...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                             

# USE IN PRODUCTION

In [16]:
pipeline = load_model(model_name="situp_upModel")

Transformation Pipeline and Model Successfully Loaded


In [17]:
prediction = predict_model(pipeline, data_prod, raw_score=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.963,1.0,0.963,0.9753,0.9663,0.7805,0.8


In [None]:
data_custom['target']

In [None]:
data_custom['target'].count()

Mapping to transform it into a binqary problem ready to do the ML models:

In [None]:
mapping = {'SIT_UP': 1, 'NO_EXERCISE': 0}

# Mapping the values in the 'target' column:
data_custom['target'] = data_custom['target'].map(mapping)

In [None]:
data_custom.head()

In [None]:
data_custom['target']

In [None]:
data_custom['target'].count()

In [None]:
data_custom.info()

# EDA

In [None]:
fig = px.box(data_custom, y="linAccZ_mean", color="target",title = 'Distribution of linAccZ_mean vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccY_mean", color="target",title = 'Distribution of linAccY_mean vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccX_mean", color="target",title = 'Distribution of linAccX_mean vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccZ_std", color="target",title = 'Distribution of linAccZ_std vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccY_std", color="target",title = 'Distribution of linAccY_std vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccX_std", color="target",title = 'Distribution of linAccX_std vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccZ_skewness", color="target",title = 'Distribution of linAccZ_skewness vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccY_skewness", color="target",title = 'Distribution of linAccY_skewness vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(data_custom, y="linAccX_skewness", color="target",title = 'Distribution of linAccX_skewness vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

# Data prep for model

In [None]:
len(data_custom)

In [None]:
#target imbalanced classes
data_custom.groupby('target').size()

In [None]:
sns.countplot(x=data_custom['target'], label = "pushup")

In [None]:
#sns.pairplot(data_custom, hue= 'target',vars=["linAccX_mean", "linAccY_mean", "linAccZ_mean"])

## TREATING THE IMBALANCE 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


# Separate the minority and majority classes
minority_class = data_custom[data_custom['target'] == 1]  
majority_class = data_custom[data_custom['target'] == 0]  

# Downsample the majority class 
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)

# Concatenate BOTH
balanced_data = pd.concat([minority_class, majority_downsampled])

# SPLIT the data into features (X) and target variable (y)
X = balanced_data.drop('target', axis=1)  # Assuming 'target' is the column to be predicted
y = balanced_data['target']

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Display shapes of the split datasets
print("Shapes - X_train:", X_train.shape, "X_test:", X_test.shape, "y_train:", y_train.shape, "y_test:", y_test.shape)

## KBEST + SCALER

In [None]:
selected = SelectKBest(score_func=f_classif, k=9)
X_train_selected=selected.fit_transform(X_train, y_train)
X_test_selected = selected.transform(X_test) 
selected.get_feature_names_out()

In [None]:
# Scale Data
scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_selected) #ONLY USING SELECTED VARIABLES
joblib.dump(scaler,'scaler_situp.pkl')
X_test_full = scaler.transform(X_test_selected)

##SAVING MODELS

In [None]:
results_hard = {}
results_soft = {}

## SVM

In [None]:
import sklearn
sklearn.metrics.get_scorer_names()

In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
svm = SVC(probability=True)

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear']}


# Realizar la búsqueda aleatoria de hiperparámetros
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid,
    n_iter=5,  
    scoring="roc_auc",  
    n_jobs=7,
    return_train_score=True
)
random_search_svm.fit(X_train_full, y_train)

In [None]:
import joblib  
joblib.dump(random_search_svm, 'situp_model.pkl')

In [None]:
#Evaluate: TRAIN
proba_train = random_search_svm.predict_proba(X_train_full)
pred_train = random_search_svm.predict(X_train_full)
print(classification_report(y_train,pred_train))

In [None]:
#Evaluate: TEST
proba_test = random_search_svm.predict_proba(X_test_full)
pred_test = random_search_svm.predict(X_test_full)
print(classification_report(y_test,pred_test))

## ANOTHER APPROACH

In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

# Perform GridSearchCV
grid_search = GridSearchCV(svm, param_grid, scoring='f1', cv=6)  
grid_search.fit(X_train_full, y_train)  

# Get the best parameters and best F1 score
best_params = grid_search.best_params_
best_f1 = grid_search.best_score_

# Use the best model for prediction on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_full)

# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred)

print("Best Parameters:", best_params)
print("Best F1 Score (Cross-validation):", best_f1)
print("F1 Score on Test Set:", f1_test)


In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 
              'gamma': [1, 0.1, 0.01], 
              'kernel': ['linear']}

grid= GridSearchCV(SVC(), param_grid, refit=True, verbose= 10)

grid.fit(X_train_full, y_train)

In [None]:
grid_predictions= grid.predict(X_test_full)
print(classification_report(y_test, grid_predictions))

## LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Instantiate the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)

# Fit the model on the resampled training data
logistic_model.fit(X_train_full, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test_full)

# Evaluate the model
print(classification_report(y_test, y_pred))

## XGBOOST

In [None]:
import xgboost as xgb
xgb = xgb.XGBClassifier()

In [None]:
param_dist = {
    'n_estimators': range(1500, 1800, 2000), 
    'max_depth': range(10, 15, 20),
    'booster': ['gbtree', 'gblinear'],
    'min_child_weight': range(1, 2, 3),  
    'learning_rate': [0.1, 0.15, 0.20],
    'gamma': [0, 0.1, 0.2, 0.3]
}

#hyperparameters
random_search_XGB = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=2, #4
    scoring="roc_auc",  
    n_jobs=5, #7
    return_train_score=True
)

In [None]:
# w/o scaling 
random_search_XGB.fit(X_train_selected, y_train)

In [None]:
print(random_search_XGB.best_params_)

In [None]:
# train
proba_train_XGB = random_search_XGB.predict_proba(X_train_selected)
pred_train_XGB = random_search_XGB.predict(X_train_selected)
print(classification_report(y_train,pred_train_XGB))

In [None]:
# test
proba_test_XGB = random_search_XGB.predict_proba(X_test_selected)
pred_test_XGB = random_search_XGB.predict(X_test_selected)
print(classification_report(y_test,pred_test_XGB))

## NAIVE BAYES

In [None]:
from sklearn.naive_bayes import GaussianNB
gaus = GaussianNB()
param_dist_NB = {
    'priors': [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2],[0.3, 0.7],[0.7, 0.3]]
}


random_search_NB = RandomizedSearchCV(
    estimator=gaus,
    param_distributions=param_dist_NB,
    n_iter=5,  
    scoring="roc_auc",  
    n_jobs=7,
    return_train_score=True
)

In [None]:
# Entrenar el modelo con la búsqueda aleatoria de hiperparámetros
random_search_NB.fit(X_train_full, y_train)

In [None]:
print(random_search_NB.best_params_)

In [None]:
#First evaluate on train
proba_train_NB = random_search_NB.predict_proba(X_train_full)
pred_train_NB = random_search_NB.predict(X_train_full)
print(classification_report(y_train,pred_train_NB))

In [None]:
#now I will check with the test
proba_test_NB = random_search_NB.predict_proba(X_test_full)
pred_test_NB = random_search_NB.predict(X_test_full)
print(classification_report(y_test,pred_test_NB))

## Model evaluation

In [None]:
pred_SV = random_search_svm.predict(X_test_full)
proba_SV = random_search_svm.predict_proba(X_test_full)
results_hard["Support_Vector"] = pred_SV
results_soft["Support_Vector"] = proba_SV[:,1]

proba_logistic = logistic_model.predict_proba(X_test_full)
pred_logistic = logistic_model.predict(X_test_full)
results_hard["Logistic_Regr"] = pred_logistic
results_soft["Logistic_Regr"] = proba_logistic[:,1]

proba_XGB = random_search_XGB.predict_proba(X_test_selected)
pred_XGB = random_search_XGB.predict(X_test_selected)
results_hard["XGBOOST"] = pred_XGB
results_soft["XGBOOST"] = proba_XGB[:,1]

proba_NB = random_search_NB.predict_proba(X_test_full)
pred_NB = random_search_NB.predict(X_test_full)
results_hard["Naive_Bayes"] = pred_NB
results_soft["Naive_Bayes"] = proba_NB[:,1]

results_hard = pd.DataFrame(results_hard)
results_soft = pd.DataFrame(results_soft)

In [None]:
metrics = {}

metrics["Accuracy"] = {
    "Naive_Bayes": accuracy_score(y_test, results_hard.Naive_Bayes),
    "Support_Vector": accuracy_score(y_test, results_hard.Support_Vector),
    "XGBOOST": accuracy_score(y_test, results_hard.XGBOOST),
    "Logistic_Regr": accuracy_score(y_test, results_hard.Logistic_Regr)
}
metrics["Precision"] = {
    "Naive_Bayes": precision_score(y_test, results_hard.Naive_Bayes),
    "Support_Vector": precision_score(y_test, results_hard.Support_Vector),
    "XGBOOST": precision_score(y_test, results_hard.XGBOOST),
    "Logistic_Regr": precision_score(y_test, results_hard.Logistic_Regr)
}
metrics["Recall"] = {
    "Naive_Bayes": recall_score(y_test, results_hard.Naive_Bayes),
    "Support_Vector": recall_score(y_test, results_hard.Support_Vector),
    "XGBOOST": recall_score(y_test, results_hard.XGBOOST),
    "Logistic_Regr": recall_score(y_test, results_hard.Logistic_Regr)
}
metrics["F1"] = {
    "Naive_Bayes": f1_score(y_test, results_hard.Naive_Bayes),
    "Support_Vector": f1_score(y_test, results_hard.Support_Vector),
    "XGBOOST": f1_score(y_test, results_hard.XGBOOST),
    "Logistic_Regr": f1_score(y_test, results_hard.Logistic_Regr)
}

metrics = pd.DataFrame(metrics)
metrics

## ROC Curves

In [None]:
# Datos de FPR y TPR para los tres modelos 
fpr_Naive_Bayes,tpr_Naive_Bayes,_ = roc_curve(y_test, results_soft.Naive_Bayes)
fpr_Support_Vector,tpr_Support_Vector,_ = roc_curve(y_test, results_soft.Support_Vector)
fpr_XGBOOST,tpr_XGBOOST,_ = roc_curve(y_test, results_soft.XGBOOST)
fpr_Logistic_Reg,tpr_Logistic_Regr,_ = roc_curve(y_test, results_soft.Logistic_Regr)

# Calcular el área bajo la curva ROC (AUC) para cada modelo
auc_Naive_Bayes = auc(fpr_Naive_Bayes,tpr_Naive_Bayes)
auc_Support_Vector = auc(fpr_Support_Vector,tpr_Support_Vector)
auc_XGBOOST = auc(fpr_XGBOOST,tpr_XGBOOST)
auc_Logistic_Regr = auc(fpr_Logistic_Reg,tpr_Logistic_Regr)

In [None]:
# Crear la gráfica ROC
plt.figure(figsize=(8, 6))

# Graficar las curvas ROC para los tres modelos
plt.plot(fpr_Naive_Bayes,tpr_Naive_Bayes, label=f'Naive bayes (AUC = {auc_Naive_Bayes:.2f})')
plt.plot(fpr_XGBOOST,tpr_XGBOOST, label=f'Xgboost (AUC = {auc_XGBOOST:.2f})')
plt.plot(fpr_Support_Vector,tpr_Support_Vector, label=f'Support Vector (AUC = {auc_Support_Vector:.2f})')
plt.plot(fpr_Logistic_Reg,tpr_Logistic_Regr, label=f'Support Vector (AUC = {auc_Logistic_Regr:.2f})')


# Configurar la gráfica
plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Línea diagonal para referencia
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos (FPR)')
plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
plt.title('Curva ROC de Modelos')
plt.legend(loc="lower right")

# Mostrar la gráfica
plt.show()