# ETL 

Lo primero que hacemos es importar las funciones: `fitizens_libraries` es la carpeta en la que se encuentran los archivos .py con las funciones:

In [1]:
from fitizens_libraries.load_and_process_training_data import load_training_data
from fitizens_libraries.load_timeseries import load_timeseries_data
from custom_libraries.merge_data import merge_data
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from fitizens_libraries.plot_labeled_sequences import plot_labeled_sequence
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot
from fitizens_libraries.build_dataframe_from_list_of_signals import build_dataframe
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, auc, roc_auc_score
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

Para el proceso de ETL lo que haremos será utilizar la función de `load_training_data`:
1. Básicamente busca los archivos JSON en la carpeta zip en la que se encuentren y los carga.
2. Intenta buscar repeticiones falsas y verdaderas del ejercicio 
3. Se filtran las repeticiones falsas para que tengan una duración dentro del rango de las repeticiones reales.

Es importante tener en cuenta que la función exige unos parámetros obligatorios que tengo que indicar:

- signals : list of str
    List of signals to include in the dataframe. The signals must be present in the input data
- target_exercise : str
        Exercise to detect repetitions
- other_exercises : list of str
        List of exercises to use as negative examples

In [None]:
# Folder holding the zip archives with the JSON recordings. It is created when
# missing, so os.listdir below never raises (an absent folder simply yields an
# empty file list).
folder_path = "LABELED"
os.makedirs(folder_path, exist_ok=True)

# Build the "LABELED/<name>" path of every entry in the folder.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# a later cell slices this list (file_names[0:5]) and must select the same
# files on every run and on every machine.
file_names = [f"{folder_path}/{name}" for name in sorted(os.listdir(folder_path))]

# Sensor signals used as features.
signals = ["accX", "accY", "accZ", "gyroX", "gyroY", "gyroZ", "magnX", "magnY", "magnZ", "linAccX", "linAccY", "linAccZ"]
# Exercise whose repetitions we want to detect.
target_exercise = "SQUAT"

In [None]:
len(file_names)

In [None]:
# Load the first five files of `file_names`; no other exercises are passed as
# negative examples. Later cells read data[i]['series'] and data[i]['target'].
data, wk = load_training_data(
    filelist=file_names[:5],
    signals=signals,
    target_exercise=target_exercise,
    other_exercises=[],
    is_peak_minima=True,
)

In [None]:
#df2=build_dataframe(data)

In [None]:
#df2.head()

In [None]:
data[0]

In [None]:
#Revisamos
#data
len(data)

In [None]:
data[1]['target']

In [None]:
df = merge_data(data)
len(df)

In [None]:
df.head()

In [None]:
df.columns

Ahora voy a convertir esto en un problema de clasificación binaria; para ello, voy a crear una columna `squad` que, según la columna `exercising_periods`, indique si hay o no una sentadilla (squat). Si `exercising_periods` vale cero, no hay sentadilla; de lo contrario, sí la hay.

In [None]:
def nueva_columna(exercise):
    """Map an ``exercising_periods`` value to a binary label.

    Returns 'no exercise' when the value equals 0 and 'squad' for any
    other value.
    """
    return 'no exercise' if exercise == 0 else 'squad'
df['squad'] = df['exercising_periods'].apply(nueva_columna)

In [None]:
df.head()

In [None]:
df.info()

# EDA

In [None]:
#revisar porcentaje de valores nulos por columnas
((df.isnull().sum())/len(df))*100

In [None]:
sns.countplot(x=df['squad'], label = "squad")

In [None]:
df.index

In [None]:
fig = px.line(df, x=df.index, y='linAccZ', title='Time serie of exercise linAccZ',color='squad')
fig.show()

In [None]:
fig = px.box(df, y="linAccZ", color="squad",title = 'Distribution of linAccZ vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(df, y="linAccY", color="squad",title = 'Distribution of linAccY vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(df, y="linAccX", color="squad",title = 'Distribution of linAccX vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(df, y="accZ_mod", color="squad",title = 'Distribution of accZ_mod vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(df, y="accX_mod", color="squad",title = 'Distribution of accX_mod vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
fig = px.box(df, y="accY_mod", color="squad",title = 'Distribution of accY_mod vs target variable')
fig.update_traces(quartilemethod="exclusive") 
fig.show()

In [None]:
timeseries, labels_ranges = load_timeseries_data(filelist=file_names, signals=signals, is_peak_minima=True)

In [None]:
timeseries.columns

In [None]:
plot_labeled_sequence(timeseries, labels_ranges[0:5])

In [None]:
labels_ranges

In [None]:
y = df['exercising_periods']

num_lags=24 # to discuss

plt.plot(y)
plt.show()
plot_acf(y,lags=num_lags)
plt.show()
plot_pacf(y,lags=num_lags,method="ols")
plt.show()

In [None]:
# The bare `statsmodels` package does not import its subpackages, so
# `sm.tsa.stattools` only resolves when some other import has already loaded
# statsmodels.tsa. Import adfuller explicitly so this cell does not depend on
# that side effect.
import statsmodels as sm  # kept for the optional version check below
from statsmodels.tsa.stattools import adfuller

# H0: the series has a unit root (it is not stationary).
# H1: the series is stationary.
adf_test = adfuller(y, maxlag=10)
# adf_test[0] is the test statistic, adf_test[1] the p-value.
print("ADF test for the original series")
print("Statistic Value:" , adf_test[0])
print("p-value:" , adf_test[1])
# print(sm.__version__)

In [None]:
prim_1000 = df.head(1000)

In [None]:
fig = px.line(prim_1000, x=prim_1000.index, y='linAccZ', title='Time serie of exercise linAccZ')
fig.show()

In [None]:
# Rows 1000..2499 of df: skip the first 1000 rows, keep at most the next 1500.
# NOTE(review): despite its name, this variable holds up to 1500 rows, not 2000.
siguientes_2000 = df.iloc[1000:2500]

In [None]:
fig = px.line(siguientes_2000, x=siguientes_2000.index, y='linAccZ', title='Time serie of exercise linAccZ')
fig.show()

# Data prep for model

In [None]:
df = merge_data(data)
len(df)

In [None]:
serie = pd.concat([item['series'] for item in data])

In [None]:
serie.head()

In [None]:
len(serie)

In [None]:
# Collect, in the same order as `data`, every item's time series and its label.
frames = [item['series'] for item in data]
target = [item['target'] for item in data]

# Stack all series into a single frame; labels stay in the separate `target` list.
df = pd.concat(frames)
#df['target'] = target

In [None]:
len(frames)

In [None]:
len(target)

In [None]:
Counter(target)

In [None]:
# Build one row per item of `data` holding the per-column mean of its series.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; the rows are collected first and the frame is built once.
# The notebook-level `df` is no longer overwritten by the loop.
filas_promedio = [diccionario['series'].mean() for diccionario in data]
promedios_df = pd.DataFrame(filas_promedio).reset_index(drop=True)

# Attach the label of each item (same order as `data`).
promedios_df['target'] = [diccionario['target'] for diccionario in data]

In [None]:
promedios_df.head()

In [None]:
#verificamos la longitud del dataframe que coincida con los 3805
len(promedios_df)

In [None]:
#verificamos que el target siga desbalanceado y coincidan los numeros
promedios_df.groupby('target').size()

In [None]:
promedios_df.info()

In [None]:
promedios_df.head()

In [None]:
sns.countplot(x=promedios_df['target'], label = "squad")

In [None]:
fig = px.box(promedios_df, y="linAccZ", color="target",title = 'Distribution of linAccZ vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="linAccZ_orig", color="target",title = 'Distribution of linAccZ_orig vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="linAccY", color="target",title = 'Distribution of linAccY vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="linAccY_orig", color="target",title = 'Distribution of linAccY_orig vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="linAccX", color="target",title = 'Distribution of linAccX vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="linAccX_orig", color="target",title = 'Distribution of linAccX_orig vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="accZ_mod", color="target",title = 'Distribution of accZ_mod vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="accZ_orig", color="target",title = 'Distribution of accZ_orig vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="accX_mod", color="target",title = 'Distribution of accX_mod vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(promedios_df, y="accY_mod", color="target",title = 'Distribution of accY_mod vs target variable')
fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
sns.pairplot(promedios_df, hue= 'target',vars=["linAccX", "linAccY", "linAccZ"])

In [None]:
plt.figure(figsize=(30, 30))
sns.heatmap( promedios_df.corr(), annot = True, cmap ="coolwarm", linewidths = .5)

In [None]:
X = promedios_df.drop('target',axis=1)
y = promedios_df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99,stratify=y)
display(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

## SMOTE to balance the data

In [None]:
smote = SMOTE( random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
#check the new sample
import numpy as np
np.unique(y_train_resampled,return_counts=True)

## Feature selection

In [None]:
selected = SelectKBest(score_func=f_classif, k=9)
X_train_selected=selected.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = selected.transform(X_test) #I have to choose also for the test

In [None]:
selected.get_feature_names_out()

In [None]:
X_train_selected

In [None]:
# F-score (f_classif) of every candidate feature, highest first.
scores = pd.Series(selected.scores_, index=X.columns).sort_values(ascending=False)
# The dependent variable in this notebook is `target`; the previous title
# referred to "casual", which is not a variable of this dataset.
px.bar(scores, template="none", title="F-Score of features with target as dependent variable")

In [None]:
# Scale Data
scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_selected) #I only use the selected variables
# apply stanrdadization also to the test
X_test_full = scaler.transform(X_test_selected)

In [None]:
# Encode the target variable as integer class labels.
label_encoder = LabelEncoder()
# Fit the encoder on the resampled training labels and reuse it for the test labels.
y_train_full = label_encoder.fit_transform(y_train_resampled)
# NOTE(review): y_test is overwritten in place, so re-running this cell feeds
# already-encoded labels back into the encoder — confirm that is harmless for the label type in use.
y_test= label_encoder.transform(y_test)

# Models

In [None]:
results_hard = {}
results_soft = {}

## SVM

In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
svm = SVC(probability=True)

In [None]:
# Search space for the SVC (gamma is the kernel coefficient of the
# rbf/poly/sigmoid kernels).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}

# Randomized hyper-parameter search scored by ROC AUC.
# random_state pins which 5 candidates are drawn from the grid, so a kernel
# restart evaluates the same configurations.
# NOTE(review): X_train_full comes from the SMOTE-resampled, pre-selected and
# pre-scaled training set, so the internal CV folds share synthetic samples
# and fitted statistics; the CV scores are likely optimistic.
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42,
)
random_search_svm.fit(X_train_full, y_train_full)

In [None]:
print(random_search_svm.best_params_)

In [None]:
#First evaluate on train
proba_train = random_search_svm.predict_proba(X_train_full)
pred_train = random_search_svm.predict(X_train_full)
print(classification_report(y_train_full,pred_train))

In [None]:
#now I will check with the test
proba_test = random_search_svm.predict_proba(X_test_full)
pred_test = random_search_svm.predict(X_test_full)
print(classification_report(y_test,pred_test))

## XGBOOST

In [None]:
# Import the classifier class directly so the `xgb` name is bound only once
# (to the estimator) instead of first to the module and then to the instance.
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [None]:
# Search space for the XGBoost classifier.
param_dist = {
    'n_estimators': range(10, 90, 5),
    'max_depth': range(3, 40, 2),
    'min_child_weight': range(1, 10),
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Randomized hyper-parameter search scored by ROC AUC.
# random_state pins which 10 candidates are drawn, so a kernel restart
# evaluates the same configurations.
random_search_XGB = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=10,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42,
)

In [None]:
#para este modelo uso datos sin escalar 
random_search_XGB.fit(X_train_selected, y_train_full)

In [None]:
print(random_search_XGB.best_params_)

In [None]:
#First evaluate on train
proba_train_XGB = random_search_XGB.predict_proba(X_train_selected)
pred_train_XGB = random_search_XGB.predict(X_train_selected)
print(classification_report(y_train_full,pred_train_XGB))

In [None]:
#now I will check with the test
proba_test_XGB = random_search_XGB.predict_proba(X_test_selected)
pred_test_XGB = random_search_XGB.predict(X_test_selected)
print(classification_report(y_test,pred_test_XGB))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gaus = GaussianNB()
param_dist_NB = {
    'priors': [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2],[0.3, 0.7],[0.7, 0.3]]
}


random_search_NB = RandomizedSearchCV(
    estimator=gaus,
    param_distributions=param_dist_NB,
    n_iter=5,  
    scoring="roc_auc",  
    n_jobs=7,
    return_train_score=True
)

In [None]:
# Entrenar el modelo con la búsqueda aleatoria de hiperparámetros
random_search_NB.fit(X_train_full, y_train_full)

In [None]:
print(random_search_NB.best_params_)

In [None]:
#First evaluate on train
proba_train_NB = random_search_NB.predict_proba(X_train_full)
pred_train_NB = random_search_NB.predict(X_train_full)
print(classification_report(y_train_full,pred_train_NB))

In [None]:
#now I will check with the test
proba_test_NB = random_search_NB.predict_proba(X_test_full)
pred_test_NB = random_search_NB.predict(X_test_full)
print(classification_report(y_test,pred_test_NB))

## Model evaluation

In [None]:
# Collect the test-set predictions of every tuned model.
# results_hard stores the predicted class labels; results_soft stores column 1
# of predict_proba (presumably the positive class — verify against classes_).

# SVM: evaluated on the scaled test features.
pred_SV = random_search_svm.predict(X_test_full)
proba_SV = random_search_svm.predict_proba(X_test_full)
results_hard["Support_Vector"] = pred_SV
results_soft["Support_Vector"] = proba_SV[:,1]

# XGBoost: evaluated on the selected but unscaled test features.
proba_XGB = random_search_XGB.predict_proba(X_test_selected)
pred_XGB = random_search_XGB.predict(X_test_selected)
results_hard["XGBOOST"] = pred_XGB
results_soft["XGBOOST"] = proba_XGB[:,1]

# Naive Bayes: evaluated on the scaled test features.
proba_NB = random_search_NB.predict_proba(X_test_full)
pred_NB = random_search_NB.predict(X_test_full)
results_hard["Naive_Bayes"] = pred_NB
results_soft["Naive_Bayes"] = proba_NB[:,1]

# Turn both dicts into frames with one column per model.
results_hard = pd.DataFrame(results_hard)
results_soft = pd.DataFrame(results_soft)

In [None]:
# Hard-label test metrics: one row per model, one column per metric.
metrics = pd.DataFrame({
    metric_name: {
        model: scorer(y_test, results_hard[model])
        for model in ("Naive_Bayes", "Support_Vector", "XGBOOST")
    }
    for metric_name, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
})
metrics

## ROC Curves

In [None]:
# Datos de FPR y TPR para los tres modelos 
fpr_Naive_Bayes,tpr_Naive_Bayes,_ = roc_curve(y_test, results_soft.Naive_Bayes)
fpr_Support_Vector,tpr_Support_Vector,_ = roc_curve(y_test, results_soft.Support_Vector)
fpr_XGBOOST,tpr_XGBOOST,_ = roc_curve(y_test, results_soft.XGBOOST)

# Calcular el área bajo la curva ROC (AUC) para cada modelo
auc_Naive_Bayes = auc(fpr_Naive_Bayes,tpr_Naive_Bayes)
auc_Support_Vector = auc(fpr_Support_Vector,tpr_Support_Vector)
auc_XGBOOST = auc(fpr_XGBOOST,tpr_XGBOOST)

In [None]:
# Crear la gráfica ROC
plt.figure(figsize=(8, 6))

# Graficar las curvas ROC para los tres modelos
plt.plot(fpr_Naive_Bayes,tpr_Naive_Bayes, label=f'Naive bayes (AUC = {auc_Naive_Bayes:.2f})')
plt.plot(fpr_XGBOOST,tpr_XGBOOST, label=f'Xgboost (AUC = {auc_XGBOOST:.2f})')
plt.plot(fpr_Support_Vector,tpr_Support_Vector, label=f'Support Vector (AUC = {auc_Support_Vector:.2f})')


# Configurar la gráfica
plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Línea diagonal para referencia
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos (FPR)')
plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
plt.title('Curva ROC de Modelos')
plt.legend(loc="lower right")

# Mostrar la gráfica
plt.show()

# Try out models with other features

Para este caso solo voy a usar las variables entregadas por el sensor: 
- Raw acceleration.
- Angular Velocity.
- Magnetic Field.
- Linear Acceleration.

In [None]:
X = promedios_df.drop('target',axis=1)
y = promedios_df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99,stratify=y)
display(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

In [None]:
smote = SMOTE( random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
selected_columns = ['accX_orig', 'accY_orig', 'accZ_orig', 'gyroX_orig', 'gyroY_orig',
                    'gyroZ_orig', 'magnX_orig', 'magnY_orig', 'magnZ_orig', 
                    'linAccX_orig', 'linAccY_orig', 'linAccZ_orig', 'accX', 'accY',
                    'accZ', 'gyroX', 'gyroY', 'gyroZ', 'magnX', 'magnY', 'magnZ', 
                    'linAccX', 'linAccY', 'linAccZ', 'accX_mod', 'accY_mod', 'accZ_mod', 
                    'gyroX_mod', 'gyroY_mod', 'gyroZ_mod', 'magnX_mod', 'magnY_mod', 
                    'magnZ_mod', 'linAccX_mod', 'linAccY_mod', 'linAccZ_mod']

X_train2_ns = X_train_resampled[selected_columns].copy()
X_test2_ns = X_test[selected_columns].copy()

In [None]:
selected2 = SelectKBest(score_func=f_classif, k=20)
X_train2_selected=selected2.fit_transform(X_train2_ns, y_train_resampled)
X_test2_selected = selected2.transform(X_test2_ns) #I have to choose also for the test
selected2.get_feature_names_out()

In [None]:
# Scale Data
scaler2 = StandardScaler()
X_train2_full = scaler2.fit_transform(X_train2_selected) #I only use the selected variables
# apply stanrdadization also to the test
X_test2_full = scaler2.transform(X_test2_selected)

In [None]:
# ahora tengo que hacer la codificacion de la variable target 
label_encoder = LabelEncoder()
# Aplica la transformación a la variable y
y_train_full = label_encoder.fit_transform(y_train_resampled)
y_test= label_encoder.transform(y_test)

In [None]:
results_hard2 = {}
results_soft2 = {}

## SVM

In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# probability=True is needed for predict_proba in the evaluation cells.
svm = SVC(probability=True)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}

# Randomized hyper-parameter search scored by ROC AUC.
# random_state pins which 5 candidates are drawn from the grid, so a kernel
# restart evaluates the same configurations.
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42,
)
random_search_svm.fit(X_train2_full, y_train_full)

In [None]:
print(random_search_svm.best_params_)

In [None]:
#First evaluate on train
proba_train2 = random_search_svm.predict_proba(X_train2_full)
pred_train2 = random_search_svm.predict(X_train2_full)
print(classification_report(y_train_full,pred_train2))

In [None]:
#now I will check with the test
proba_test2 = random_search_svm.predict_proba(X_test2_full)
pred_test2 = random_search_svm.predict(X_test2_full)
print(classification_report(y_test,pred_test2))

## XGBOOST

In [None]:
# Import the class directly so `xgb` is bound only to the estimator instead of
# first to the module and then to the instance.
from xgboost import XGBClassifier

xgb = XGBClassifier()
param_dist = {
    'n_estimators': range(10, 90, 5),
    'max_depth': range(3, 40, 2),
    'min_child_weight': range(1, 10),
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Randomized hyper-parameter search scored by ROC AUC.
# random_state pins which 10 candidates are drawn, so a kernel restart
# evaluates the same configurations.
random_search_XGB = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=10,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42,
)
# This model is trained on the selected but unscaled features.
random_search_XGB.fit(X_train2_selected, y_train_full)
print(random_search_XGB.best_params_)

In [None]:
#First evaluate on train
proba_train_XGB2 = random_search_XGB.predict_proba(X_train2_selected)
pred_train_XGB2 = random_search_XGB.predict(X_train2_selected)
print(classification_report(y_train_full,pred_train_XGB2))

In [None]:
#now I will check with the test
proba_test_XGB2 = random_search_XGB.predict_proba(X_test2_selected)
pred_test_XGB2 = random_search_XGB.predict(X_test2_selected)
print(classification_report(y_test,pred_test_XGB2))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gaus = GaussianNB()
param_dist_NB = {
    'priors': [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2],[0.3, 0.7],[0.7, 0.3]]
}


random_search_NB = RandomizedSearchCV(
    estimator=gaus,
    param_distributions=param_dist_NB,
    n_iter=5,  
    scoring="roc_auc",  
    n_jobs=7,
    return_train_score=True
)
# Entrenar el modelo con la búsqueda aleatoria de hiperparámetros
random_search_NB.fit(X_train2_full, y_train_full)

In [None]:
print(random_search_NB.best_params_)

In [None]:
#First evaluate on train
proba_train_NB2 = random_search_NB.predict_proba(X_train2_full)
pred_train_NB2 = random_search_NB.predict(X_train2_full)
print(classification_report(y_train_full,pred_train_NB2))

In [None]:
#now I will check with the test
proba_test_NB2 = random_search_NB.predict_proba(X_test2_full)
pred_test_NB2 = random_search_NB.predict(X_test2_full)
print(classification_report(y_test,pred_test_NB2))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

logistic_regression = LogisticRegression()

# Hyper-parameter search space: both solvers support the l1 and l2 penalties.
param_grid_lr = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Randomized hyper-parameter search scored by ROC AUC.
# random_state pins which 5 candidates are drawn from the grid, so a kernel
# restart evaluates the same configurations.
random_search_lr = RandomizedSearchCV(
    estimator=logistic_regression,
    param_distributions=param_grid_lr,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=7,
    return_train_score=True,
    random_state=42,
)

# Fit the search on the scaled training features.
random_search_lr.fit(X_train2_full, y_train_full)

random_search_lr.best_params_

In [None]:
#First evaluate on train
proba_train_lr = random_search_lr.predict_proba(X_train2_full)
pred_train_lr = random_search_lr.predict(X_train2_full)
print(classification_report(y_train_full,pred_train_lr))

In [None]:
#now I will check with the test
proba_test_lr = random_search_lr.predict_proba(X_test2_full)
pred_test_lr = random_search_lr.predict(X_test2_full)
print(classification_report(y_test,pred_test_lr))

## Model evaluation

In [None]:
pred_SV2 = random_search_svm.predict(X_test2_full)
proba_SV2 = random_search_svm.predict_proba(X_test2_full)
results_hard2["Support_Vector"] = pred_SV2
results_soft2["Support_Vector"] = proba_SV2[:,1]

proba_XGB2 = random_search_XGB.predict_proba(X_test2_selected)
pred_XGB2 = random_search_XGB.predict(X_test2_selected)
results_hard2["XGBOOST"] = pred_XGB2
results_soft2["XGBOOST"] = proba_XGB2[:,1]

proba_NB2 = random_search_NB.predict_proba(X_test2_full)
pred_NB2 = random_search_NB.predict(X_test2_full)
results_hard2["Naive_Bayes"] = pred_NB2
results_soft2["Naive_Bayes"] = proba_NB2[:,1]

proba_lr = random_search_lr.predict_proba(X_test2_full)
pred_lr = random_search_lr.predict(X_test2_full)
results_hard2["Logistic_Regression"] = pred_lr
results_soft2["Logistic_Regression"] = proba_lr[:,1]

results_hard2 = pd.DataFrame(results_hard2)
results_soft2 = pd.DataFrame(results_soft2)

In [None]:
# Hard-label test metrics for the second feature set:
# one row per model, one column per metric.
metrics2 = pd.DataFrame({
    metric_name: {
        model: scorer(y_test, results_hard2[model])
        for model in ("Naive_Bayes", "Support_Vector", "XGBOOST", "Logistic_Regression")
    }
    for metric_name, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
})
metrics2


## ROC Curves

In [None]:
# FPR and TPR values for the four models (ROC thresholds are discarded).
fpr_Naive_Bayes2,tpr_Naive_Bayes2,_ = roc_curve(y_test, results_soft2.Naive_Bayes)
fpr_Support_Vector2,tpr_Support_Vector2,_ = roc_curve(y_test, results_soft2.Support_Vector)
fpr_XGBOOST2,tpr_XGBOOST2,_ = roc_curve(y_test, results_soft2.XGBOOST)
fpr_Logistic_Regression,tpr_Logistic_Regression,_ = roc_curve(y_test, results_soft2.Logistic_Regression)


# Area under the ROC curve (AUC) of each model.
auc_Naive_Bayes2 = auc(fpr_Naive_Bayes2,tpr_Naive_Bayes2)
auc_Support_Vector2 = auc(fpr_Support_Vector2,tpr_Support_Vector2)
auc_XGBOOST2 = auc(fpr_XGBOOST2,tpr_XGBOOST2)
auc_Logistic_Regression = auc(fpr_Logistic_Regression,tpr_Logistic_Regression)

In [None]:
# Crear la gráfica ROC
plt.figure(figsize=(8, 6))

# Graficar las curvas ROC para los tres modelos
plt.plot(fpr_Naive_Bayes2,tpr_Naive_Bayes2, label=f'Naive bayes (AUC = {auc_Naive_Bayes2:.2f})')
plt.plot(fpr_XGBOOST2,tpr_XGBOOST2, label=f'Xgboost (AUC = {auc_XGBOOST2:.2f})')
plt.plot(fpr_Support_Vector2,tpr_Support_Vector2, label=f'Support Vector (AUC = {auc_Support_Vector2:.2f})')
plt.plot(fpr_Logistic_Regression,tpr_Logistic_Regression, label=f'Logistic_Regression (AUC = {auc_Logistic_Regression:.2f})')



# Configurar la gráfica
plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Línea diagonal para referencia
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos (FPR)')
plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
plt.title('Curva ROC de Modelos')
plt.legend(loc="lower right")

# Mostrar la gráfica
plt.show()

# Nuevas variables

Ahora voy a probar con el promedio, la desviación estándar y la mediana de cada señal.

In [None]:
# Build one row per item of `data` with the mean, standard deviation and
# median of every column of its series, prefixed with mean_/std_/median_.
# The rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration. The blanket warnings.filterwarnings("ignore") that hid its
# deprecation warning (and every later warning in the notebook) is dropped.
filas_resumen = []
for diccionario in data:
    serie_rep = diccionario['series']
    filas_resumen.append(
        pd.concat([
            serie_rep.mean().add_prefix('mean_'),
            serie_rep.std().add_prefix('std_'),
            serie_rep.median().add_prefix('median_'),
        ])
    )
nuevo_df = pd.DataFrame(filas_resumen).reset_index(drop=True)

# Attach the label of each item (same order as `data`).
nuevo_df['target'] = [diccionario['target'] for diccionario in data]

In [None]:
nuevo_df.head()

In [None]:
list(nuevo_df.columns)

In [None]:
#verificamos la longitud del dataframe que coincida con los 3805
len(nuevo_df)

In [None]:
#verificamos que el target siga desbalanceado y coincidan los numeros
nuevo_df.groupby('target').size()

In [None]:
sns.countplot(x=nuevo_df['target'], label = "squad")

In [None]:
fig = px.box(nuevo_df, y=["mean_linAccZ", "std_linAccZ", "median_linAccZ"], color="target", 
             labels={"value": "Acceleration", "target": "Target"},
             title='Distribution of Acceleration vs Target Variable')

fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(nuevo_df, y=["mean_accZ_orig", "std_accZ_orig", "median_accZ_orig"], color="target", 
             labels={"value": "Acceleration", "target": "Target"},
             title='Distribution of Acceleration vs Target Variable')

fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(nuevo_df, y=["mean_accY_orig", "std_accY_orig", "median_accY_orig"], color="target", 
             labels={"value": "Acceleration", "target": "Target"},
             title='Distribution of Acceleration vs Target Variable')

fig.update_traces(quartilemethod="exclusive")
fig.show()

In [None]:
fig = px.box(nuevo_df, y=["mean_gyroZ_orig", "std_gyroZ_orig", "median_gyroZ_orig"], color="target", 
             labels={"value": "Acceleration", "target": "Target"},
             title='Distribution of Acceleration vs Target Variable')

fig.update_traces(quartilemethod="exclusive")
fig.show()

## Prep data

In [None]:
X = nuevo_df.drop('target',axis=1)
y = nuevo_df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99,stratify=y)
display(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

In [None]:
smote = SMOTE( random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
selected_columns = [
    'mean_accX_orig', 'mean_accY_orig', 'mean_accZ_orig', 'mean_gyroX_orig', 'mean_gyroY_orig', 'mean_gyroZ_orig',
    'mean_magnX_orig', 'mean_magnY_orig', 'mean_magnZ_orig', 'mean_linAccX_orig', 'mean_linAccY_orig', 'mean_linAccZ_orig',
    'mean_accX', 'mean_accY', 'mean_accZ', 'mean_gyroX', 'mean_gyroY', 'mean_gyroZ',
    'mean_magnX', 'mean_magnY', 'mean_magnZ', 'mean_linAccX', 'mean_linAccY', 'mean_linAccZ', 'mean_accX_mod',
    'mean_accY_mod', 'mean_accZ_mod', 'mean_gyroX_mod', 'mean_gyroY_mod', 'mean_gyroZ_mod', 'mean_magnX_mod',
    'mean_magnY_mod', 'mean_magnZ_mod', 'mean_linAccX_mod', 'mean_linAccY_mod', 'mean_linAccZ_mod',
    'std_accX_orig', 'std_accY_orig', 'std_accZ_orig', 'std_gyroX_orig', 'std_gyroY_orig', 'std_gyroZ_orig',
    'std_magnX_orig', 'std_magnY_orig', 'std_magnZ_orig', 'std_linAccX_orig', 'std_linAccY_orig', 'std_linAccZ_orig',
    'std_accX', 'std_accY', 'std_accZ', 'std_gyroX', 'std_gyroY', 'std_gyroZ', 'std_magnX',
    'std_magnY', 'std_magnZ', 'std_linAccX', 'std_linAccY', 'std_linAccZ', 'std_accX_mod', 'std_accY_mod', 'std_accZ_mod',
    'std_gyroX_mod', 'std_gyroY_mod', 'std_gyroZ_mod', 'std_magnX_mod', 'std_magnY_mod', 'std_magnZ_mod',
    'std_linAccX_mod', 'std_linAccY_mod', 'std_linAccZ_mod', 'median_accX_orig', 'median_accY_orig',
    'median_accZ_orig', 'median_gyroX_orig', 'median_gyroY_orig', 'median_gyroZ_orig', 'median_magnX_orig',
    'median_magnY_orig', 'median_magnZ_orig', 'median_linAccX_orig', 'median_linAccY_orig', 'median_linAccZ_orig',
    'median_accX', 'median_accY', 'median_accZ', 'median_gyroX', 'median_gyroY',
    'median_gyroZ', 'median_magnX', 'median_magnY', 'median_magnZ', 'median_linAccX', 'median_linAccY',
    'median_linAccZ', 'median_accX_mod', 'median_accY_mod', 'median_accZ_mod', 'median_gyroX_mod', 'median_gyroY_mod',
    'median_gyroZ_mod', 'median_magnX_mod', 'median_magnY_mod', 'median_magnZ_mod', 'median_linAccX_mod',
    'median_linAccY_mod', 'median_linAccZ_mod'
]

X_train3_ns = X_train_resampled[selected_columns].copy()
X_test3_ns = X_test[selected_columns].copy()

In [None]:
selected2 = SelectKBest(score_func=f_classif, k=30)
X_train3_selected=selected2.fit_transform(X_train3_ns, y_train_resampled)
X_test3_selected = selected2.transform(X_test3_ns) #I have to choose also for the test
selected2.get_feature_names_out()

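Acá cambié las features con las que se entrenaron inicialmente los modelos: en lugar de las elegidas por `SelectKBest`, uso una lista seleccionada manualmente (`selected_columns2`).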

In [None]:
selected_columns2 = ['mean_accX', 'mean_accZ', 'mean_linAccZ', 'mean_accZ_mod',
       'mean_gyroX_mod', 'mean_linAccX_mod', 'std_accZ', 'std_gyroX', 'std_gyroZ',
       'std_magnX', 'std_linAccX', 'std_accZ_mod', 'std_linAccX_mod',
       'median_accX', 'median_accZ', 'median_linAccZ', 'median_accZ_mod',
       'median_gyroX_mod', 'median_linAccX_mod']
X_train3_ns = X_train_resampled[selected_columns2].copy()
X_test3_ns = X_test[selected_columns2].copy()

In [None]:
plt.figure(figsize=(30, 30))
sns.heatmap( X_train3_ns.corr(), annot = True, cmap ="coolwarm", linewidths = .5)

In [None]:
# joblib was used here without ever being imported in the notebook.
import joblib

# Standardize the hand-picked features: fit on the training matrix only and
# reuse the fitted statistics for the test matrix.
scaler3 = StandardScaler()
X_train3_full = scaler3.fit_transform(X_train3_ns)
X_test3_full = scaler3.transform(X_test3_ns)

# Persist the scaler only after it has been fitted; dumping it before
# fit_transform stored an unfitted object that cannot transform new data.
joblib.dump(scaler3, 'scaler3.pkl');

In [None]:
# ahora tengo que hacer la codificacion de la variable target 
label_encoder = LabelEncoder()
# Aplica la transformación a la variable y
y_train_full = label_encoder.fit_transform(y_train_resampled)
y_test= label_encoder.transform(y_test)

In [None]:
results_hard3 = {}
results_soft3 = {}

## SVM

In [None]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# probability=True enables predict_proba, which the evaluation cells rely on.
# random_state pins the internal probability calibration so reruns agree.
svm = SVC(probability=True, random_state=42)
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}


# Randomized hyperparameter search: 5 sampled combinations scored by ROC AUC.
# random_state fixes which combinations are sampled, making the notebook
# reproducible under Restart & Run All.
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42
)
random_search_svm.fit(X_train3_full, y_train_full)

In [None]:
# Show the hyperparameter combination selected by the SVM search.
print(random_search_svm.best_params_)

In [None]:
# Training-set evaluation of the tuned SVM (reference point for the test
# report in the next cell).
pred_train3 = random_search_svm.predict(X_train3_full)
proba_train3 = random_search_svm.predict_proba(X_train3_full)
print(classification_report(y_true=y_train_full, y_pred=pred_train3))

In [None]:
# Test-set evaluation of the tuned SVM on the scaled test features.
pred_test3 = random_search_svm.predict(X_test3_full)
proba_test3 = random_search_svm.predict_proba(X_test3_full)
print(classification_report(y_true=y_test, y_pred=pred_test3))

## XGBOOST

In [None]:
import xgboost as xgb

# Keep the estimator under its own name: the original `xgb = xgb.XGBClassifier()`
# replaced the imported module with an instance, so any later `xgb.<something>`
# call on the module would fail.
xgb_clf = xgb.XGBClassifier()
param_dist = {
    'n_estimators': range(10, 90, 5),
    'max_depth': range(3, 40, 2),
    'min_child_weight': range(1, 10),
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Randomized hyperparameter search: 10 sampled combinations scored by ROC AUC.
# random_state fixes the sampled combinations so reruns are reproducible.
random_search_XGB = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=10,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
    random_state=42
)
# This model is trained on the unscaled features.
random_search_XGB.fit(X_train3_ns, y_train_full)
print(random_search_XGB.best_params_)

In [None]:
# Training-set evaluation of the tuned XGBoost model (unscaled features).
pred_train_XGB3 = random_search_XGB.predict(X_train3_ns)
proba_train_XGB3 = random_search_XGB.predict_proba(X_train3_ns)
print(classification_report(y_true=y_train_full, y_pred=pred_train_XGB3))

In [None]:
# Test-set evaluation of the tuned XGBoost model.
# The model was fitted on X_train3_ns (the `selected_columns2` features), so it
# must be evaluated on X_test3_ns. X_test3_selected holds the 30 SelectKBest
# columns of the earlier feature set and does not match what the model saw.
proba_test_XGB3 = random_search_XGB.predict_proba(X_test3_ns)
pred_test_XGB3 = random_search_XGB.predict(X_test3_ns)
print(classification_report(y_test,pred_test_XGB3))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

gaus = GaussianNB()

# Candidate class priors to try for the two target classes.
param_dist_NB = {
    'priors': [
        [0.2, 0.8],
        [0.5, 0.5],
        [0.8, 0.2],
        [0.3, 0.7],
        [0.7, 0.3],
    ]
}

# Search settings: 5 candidates scored by ROC AUC on 7 workers.
nb_search_options = dict(
    n_iter=5,
    scoring="roc_auc",
    n_jobs=7,
    return_train_score=True,
)
random_search_NB = RandomizedSearchCV(gaus, param_dist_NB, **nb_search_options)

# Train the model through the randomized hyperparameter search (scaled features).
random_search_NB.fit(X_train3_full, y_train_full)

In [None]:
# Show the class priors selected by the Naive Bayes search.
print(random_search_NB.best_params_)

In [None]:
# Training-set evaluation of the tuned Naive Bayes model (scaled features).
pred_train_NB3 = random_search_NB.predict(X_train3_full)
proba_train_NB3 = random_search_NB.predict_proba(X_train3_full)
print(classification_report(y_true=y_train_full, y_pred=pred_train_NB3))

In [None]:
# Test-set evaluation of the tuned Naive Bayes model (scaled features).
pred_test_NB3 = random_search_NB.predict(X_test3_full)
proba_test_NB3 = random_search_NB.predict_proba(X_test3_full)
print(classification_report(y_true=y_test, y_pred=pred_test_NB3))

In [None]:
# Persist the fitted Naive Bayes search object to disk.
import joblib  
# NOTE(review): joblib.dump returns the list of written file paths, so `modelo2`
# presumably holds file names rather than the model itself -- confirm before reuse.
modelo2 = joblib.dump(random_search_NB, 'modeloNB.pkl')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# The 'liblinear' and 'saga' solvers shuffle the data internally, so the
# estimator is seeded to make repeated fits agree.
logistic_regression = LogisticRegression(random_state=42)

# Hyperparameter search space: regularization strength on a log scale from
# 1e-3 to 1e3, both penalties, and the two solvers that support them.
param_grid_lr = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Randomized hyperparameter search: 10 sampled combinations scored by ROC AUC.
# random_state fixes the sampled combinations so reruns are reproducible.
random_search_lr = RandomizedSearchCV(
    estimator=logistic_regression,
    param_distributions=param_grid_lr,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=7,
    return_train_score=True,
    random_state=42
)

# Fit the logistic regression through the hyperparameter search (scaled features).
random_search_lr.fit(X_train3_full, y_train_full)

random_search_lr.best_params_


In [None]:
# Training-set evaluation of the tuned logistic regression (scaled features).
pred_train_lr = random_search_lr.predict(X_train3_full)
proba_train_lr = random_search_lr.predict_proba(X_train3_full)
print(classification_report(y_true=y_train_full, y_pred=pred_train_lr))

In [None]:
# Test-set evaluation of the tuned logistic regression (scaled features).
pred_test_lr = random_search_lr.predict(X_test3_full)
proba_test_lr = random_search_lr.predict_proba(X_test3_full)
print(classification_report(y_true=y_test, y_pred=pred_test_lr))

In [None]:
# Persist the fitted logistic-regression search object to disk.
import joblib  
# NOTE(review): joblib.dump returns the list of written file paths, so `modelo3`
# presumably holds file names rather than the model itself -- confirm before reuse.
modelo3 = joblib.dump(random_search_lr, 'modeloLR.pkl')

## Model evaluation

In [None]:
# Collect the test-set predictions of every tuned model: hard labels go into
# results_hard3, positive-class probabilities (column 1) into results_soft3.

# The SVM is currently excluded from the comparison:
#pred_SV3 = random_search_svm.predict(X_test3_full)
#proba_SV3 = random_search_svm.predict_proba(X_test3_full)
#results_hard3["Support_Vector"] = pred_SV3
#results_soft3["Support_Vector"] = proba_SV3[:,1]

# XGBoost was fitted on the unscaled X_train3_ns, so it is scored on the
# matching X_test3_ns. X_test3_selected holds the 30 SelectKBest columns of the
# earlier feature set and does not match the features the model was trained on.
proba_XGB3 = random_search_XGB.predict_proba(X_test3_ns)
pred_XGB3 = random_search_XGB.predict(X_test3_ns)
results_hard3["XGBOOST"] = pred_XGB3
results_soft3["XGBOOST"] = proba_XGB3[:,1]

# Naive Bayes and logistic regression were fitted on the scaled features.
proba_NB3 = random_search_NB.predict_proba(X_test3_full)
pred_NB3 = random_search_NB.predict(X_test3_full)
results_hard3["Naive_Bayes"] = pred_NB3
results_soft3["Naive_Bayes"] = proba_NB3[:,1]

proba_lr = random_search_lr.predict_proba(X_test3_full)
pred_lr = random_search_lr.predict(X_test3_full)
results_hard3["Logistic_Regression"] = pred_lr
results_soft3["Logistic_Regression"] = proba_lr[:,1]


# One column per model, one row per test sample.
results_hard3 = pd.DataFrame(results_hard3)
results_soft3 = pd.DataFrame(results_soft3)

In [None]:
# Test-set metric table: one row per model, one column per metric.
# (Support_Vector is left out because its predictions are not collected above.)
compared_models = ["Naive_Bayes", "XGBOOST", "Logistic_Regression"]
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}

metrics3 = pd.DataFrame({
    metric_name: {
        model_name: metric_fn(y_test, results_hard3[model_name])
        for model_name in compared_models
    }
    for metric_name, metric_fn in metric_functions.items()
})
metrics3

## ROC Curves

In [None]:
# False/true positive rates per model, computed from the positive-class
# probabilities on the test set (the thresholds returned by roc_curve are not
# needed). Support_Vector is excluded, as in the rest of the evaluation.
fpr_Naive_Bayes3, tpr_Naive_Bayes3 = roc_curve(y_test, results_soft3["Naive_Bayes"])[:2]
fpr_XGBOOST3, tpr_XGBOOST3 = roc_curve(y_test, results_soft3["XGBOOST"])[:2]
fpr_Logistic_Regression, tpr_Logistic_Regression = roc_curve(
    y_test, results_soft3["Logistic_Regression"]
)[:2]

# Area under each ROC curve (AUC).
auc_Naive_Bayes3 = auc(fpr_Naive_Bayes3, tpr_Naive_Bayes3)
auc_XGBOOST3 = auc(fpr_XGBOOST3, tpr_XGBOOST3)
auc_Logistic_Regression = auc(fpr_Logistic_Regression, tpr_Logistic_Regression)

In [None]:
# Build the ROC figure.
plt.figure(figsize=(8, 6))

# One curve per evaluated model, labelled with its AUC (Support Vector excluded).
roc_curves_to_plot = [
    ("Naive bayes", fpr_Naive_Bayes3, tpr_Naive_Bayes3, auc_Naive_Bayes3),
    ("Xgboost", fpr_XGBOOST3, tpr_XGBOOST3, auc_XGBOOST3),
    ("Logistic_Regression", fpr_Logistic_Regression, tpr_Logistic_Regression, auc_Logistic_Regression),
]
for curve_label, curve_fpr, curve_tpr, curve_auc in roc_curves_to_plot:
    plt.plot(curve_fpr, curve_tpr, label=f'{curve_label} (AUC = {curve_auc:.2f})')

# Diagonal reference line (a classifier with no skill).
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axes, title and legend.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos (FPR)')
plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
plt.title('Curva ROC de Modelos')
plt.legend(loc="lower right")

# Render the figure.
plt.show()

In [None]:
# Persist the fitted XGBoost search object to disk.
import joblib  
joblib.dump(random_search_XGB, 'modeloXGB.pkl')

In [None]:
# WARNING: DO NOT RUN THIS CELL -- IT BLOWS UP (original author's note).
# `frames` is never cleared and the inner loop re-appends the series of every
# entry in `data` on each outer iteration, so the list grows towards
# len(data)**2 frames while pd.concat rebuilds an ever wider DataFrame on
# every pass of the outer loop.
target = list()
frames = list()
for i in data:
    target=i['target']  # rebinds `target` from the empty list to this entry's label
    for j in data:
        frames.append(j['series'])
    df = pd.concat(frames,axis = 1)
    df['target'] = target

# Pycaret

In [2]:
from pycaret.classification import *

In [3]:
def create_custom_dataframe(series):
    """Return only the linear-acceleration, gyroscope and magnetometer columns.

    Parameters
    ----------
    series : DataFrame containing at least the nine ``linAcc*``, ``gyro*`` and
        ``magn*`` columns (X, Y and Z axis each).

    Returns
    -------
    The nine selected columns, ordered linAcc, gyro, magn and X, Y, Z within
    each sensor.
    """
    wanted_columns = [
        f"{sensor}{axis}"
        for sensor in ("linAcc", "gyro", "magn")
        for axis in ("X", "Y", "Z")
    ]
    return series[wanted_columns]

In [4]:
def create_training_data_stats(df, target):
    """Summarize one labelled signal window into a flat record.

    Parameters
    ----------
    df : DataFrame with the raw signals; it is first reduced with
        ``create_custom_dataframe``.
    target : label stored under the ``"target"`` key of the record.

    Returns
    -------
    dict with ``<column>_mean``, ``<column>_std`` and ``<column>_median`` for
    every retained column (in that order), followed by ``"target"``.
    """
    sensor_frame = create_custom_dataframe(df)

    stats_dict = {}
    for column_name, values in sensor_frame.items():
        stats_dict.update({
            f"{column_name}_mean": values.mean(),
            f"{column_name}_std": values.std(),
            f"{column_name}_median": values.median(),
        })

    stats_dict["target"] = target
    return stats_dict

In [5]:
# Folder holding the labelled recordings; it is created if missing so that the
# listing below cannot fail on a fresh checkout.
folder_path = "LABELED"
os.makedirs(folder_path, exist_ok=True)
file_names = [f"{folder_path}/{entry}" for entry in os.listdir(folder_path)]

# Signals to load: X/Y/Z axes of each sensor, in the order acc, gyro, magn, linAcc.
signals = [
    f"{sensor}{axis}"
    for sensor in ("acc", "gyro", "magn", "linAcc")
    for axis in ("X", "Y", "Z")
]

data, wk = load_training_data(
    filelist=file_names,
    signals=signals,
    target_exercise="SQUAT",
    other_exercises=[],
    is_peak_minima=True,
)

In [6]:
# Turn every loaded entry (its "series" signals and "target" label) into one
# summary-statistics record, then stack the records into a DataFrame.
data_info = list(
    map(lambda entry: create_training_data_stats(entry["series"], entry["target"]), data)
)
data_custom = pd.DataFrame(data_info)
data_custom.head()

Unnamed: 0,linAccX_mean,linAccX_std,linAccX_median,linAccY_mean,linAccY_std,linAccY_median,linAccZ_mean,linAccZ_std,linAccZ_median,gyroX_mean,...,magnX_mean,magnX_std,magnX_median,magnY_mean,magnY_std,magnY_median,magnZ_mean,magnZ_std,magnZ_median,target
0,0.475693,0.444189,0.604449,-0.064382,0.201904,-0.017634,1.084862,3.31106,1.841822,6.446957,...,113.922673,1.511774,113.740783,105.64576,1.579641,106.327918,162.709051,8.196471,164.514308,SQUAT
1,0.43628,0.48513,0.641912,0.050774,0.227051,0.083403,0.956848,2.846323,1.38445,0.362799,...,113.81063,1.261197,113.589972,105.646582,1.241413,106.177881,164.105538,8.303264,165.637205,SQUAT
2,0.457903,0.470508,0.604933,0.078159,0.160814,0.123268,1.098518,3.17366,2.082821,6.661235,...,113.426752,1.120535,112.962573,105.811957,0.993088,106.015616,165.059245,7.472576,166.777286,SQUAT
3,0.504729,0.50976,0.7194,0.059016,0.133882,0.035024,1.12745,3.123841,2.709761,2.916382,...,113.179327,1.466456,112.808564,104.957545,1.209527,105.155497,170.951369,9.970589,173.638573,SQUAT
4,0.470926,0.481927,0.688152,0.058959,0.174069,0.108836,0.974739,2.707455,1.910544,3.956676,...,112.578127,1.332054,112.138872,105.10028,1.191062,105.369625,171.290676,9.659595,173.157878,SQUAT


In [7]:
# Keep 95% of the rows for modelling and hold the remaining 5% out as simulated
# production data. The hold-out is derived from the sampled index, so it must be
# computed before either frame is re-indexed.
data_dev = data_custom.sample(frac=0.95, random_state=786)
data_prod = data_custom.drop(data_dev.index).reset_index(drop=True)
data_dev = data_dev.reset_index(drop=True)

print(f'Data for Modeling: {data_dev.shape}')
print(f'Simulated data For Production {data_prod.shape}')

Data for Modeling: (3615, 28)
Simulated data For Production (190, 28)


In [8]:
# Configure the PyCaret experiment on the modelling split.
model = setup(
    # Basic options
    data = data_dev,
    target = "target",
    train_size = 0.8, 
    preprocess = True,

    # Reproducibility: without a fixed session id PyCaret draws a random seed on
    # every run. 8089 is the id reported by the recorded run of this cell.
    session_id = 8089,
    
    # Dealing with multicollinearity
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.9,
        
    # Feature normalization with outliers
    normalize = True,
    normalize_method = 'robust',
        
    # Parallelization options
    n_jobs = - 1,
    use_gpu = False,
    
    # Imbalanced dataset
    fix_imbalance=True,

    # Outlier removal
    remove_outliers= True, 
    outliers_threshold= 0.03,
    
    # Feature selection
    feature_selection = True,
    n_features_to_select= 10
)

[LightGBM] [Info] Number of positive: 2159, number of negative: 2159
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 4318, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Description,Value
0,Session id,8089
1,Target,target
2,Target type,Binary
3,Target mapping,"NO_EXERCISE: 0, SQUAT: 1"
4,Original data shape,"(3615, 28)"
5,Transformed data shape,"(5041, 11)"
6,Transformed train set shape,"(4318, 11)"
7,Transformed test set shape,"(723, 11)"
8,Numeric features,27
9,Preprocess,True


In [9]:
# Train and cross-validate (2 folds) the candidate estimators, ranked by F1;
# `models` receives what compare_models returns and is displayed below.
models = compare_models(sort="F1", fold=2)
models

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.9959,0.994,0.9959,0.9959,0.9958,0.988,0.9881,2.715
et,Extra Trees Classifier,0.9955,0.9994,0.9955,0.9955,0.9955,0.9871,0.9871,0.505
catboost,CatBoost Classifier,0.9955,0.9985,0.9955,0.9955,0.9955,0.9871,0.9871,3.01
rf,Random Forest Classifier,0.9948,0.9967,0.9948,0.9948,0.9948,0.9852,0.9852,3.12
gbc,Gradient Boosting Classifier,0.9941,0.9974,0.9941,0.9941,0.9941,0.9831,0.9832,0.775
lightgbm,Light Gradient Boosting Machine,0.9941,0.9981,0.9941,0.9941,0.9941,0.9832,0.9832,0.56
nb,Naive Bayes,0.9931,0.9973,0.9931,0.9931,0.9931,0.9802,0.9802,2.775
ada,Ada Boost Classifier,0.992,0.997,0.992,0.9921,0.9921,0.9773,0.9774,0.5
xgboost,Extreme Gradient Boosting,0.9917,0.9979,0.9917,0.9918,0.9917,0.9764,0.9765,0.495
knn,K Neighbors Classifier,0.9914,0.9952,0.9914,0.9915,0.9914,0.9754,0.9756,2.77


In [10]:
# Keep the most recent PyCaret score grid (the comparison table above) as a DataFrame.
cnt_models_df = pull()
cnt_models_df

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.9959,0.994,0.9959,0.9959,0.9958,0.988,0.9881,2.715
et,Extra Trees Classifier,0.9955,0.9994,0.9955,0.9955,0.9955,0.9871,0.9871,0.505
catboost,CatBoost Classifier,0.9955,0.9985,0.9955,0.9955,0.9955,0.9871,0.9871,3.01
rf,Random Forest Classifier,0.9948,0.9967,0.9948,0.9948,0.9948,0.9852,0.9852,3.12
gbc,Gradient Boosting Classifier,0.9941,0.9974,0.9941,0.9941,0.9941,0.9831,0.9832,0.775
lightgbm,Light Gradient Boosting Machine,0.9941,0.9981,0.9941,0.9941,0.9941,0.9832,0.9832,0.56
nb,Naive Bayes,0.9931,0.9973,0.9931,0.9931,0.9931,0.9802,0.9802,2.775
ada,Ada Boost Classifier,0.992,0.997,0.992,0.9921,0.9921,0.9773,0.9774,0.5
xgboost,Extreme Gradient Boosting,0.9917,0.9979,0.9917,0.9918,0.9917,0.9764,0.9765,0.495
knn,K Neighbors Classifier,0.9914,0.9952,0.9914,0.9915,0.9914,0.9754,0.9756,2.77


In [11]:
# Train an Extra Trees classifier ('et') with 2-fold cross-validation.
clf = create_model('et', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9952,0.9991,0.9952,0.9952,0.9952,0.9862,0.9862
1,0.9945,0.998,0.9945,0.9945,0.9945,0.9842,0.9842
Mean,0.9948,0.9986,0.9948,0.9948,0.9948,0.9852,0.9852
Std,0.0003,0.0006,0.0003,0.0003,0.0003,0.001,0.001


In [12]:
# Tune the Extra Trees hyperparameters for F1 with 2-fold cross-validation.
# In the recorded run PyCaret reported that the original model beat the tuned
# one and returned it, so `tuned_clf` may be the untuned estimator.
tuned_clf = tune_model(clf, optimize = 'F1', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9965,0.9986,0.9965,0.9965,0.9965,0.9901,0.9901
1,0.9945,0.9992,0.9945,0.9945,0.9945,0.9842,0.9842
Mean,0.9955,0.9989,0.9955,0.9955,0.9955,0.9871,0.9871
Std,0.001,0.0003,0.001,0.001,0.001,0.0029,0.0029


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [13]:
# Number of features the final estimator uses (length of its importance vector).
print("Total of features: ", len(tuned_clf.feature_importances_))

Total of features:  10


In [14]:
# Open PyCaret's interactive evaluation widget for the selected model.
evaluate_model(tuned_clf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [15]:
# Finalize the selected model before it is saved for deployment.
# NOTE(review): finalize_model presumably refits on the full modelling data,
# including PyCaret's internal hold-out -- confirm against the PyCaret docs.
model_final = finalize_model(tuned_clf)

In [16]:
# Save the transformation pipeline together with the model under the name 'squat_model'.
save_model(model_final, 'squat_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['linAccX_mean', 'linAccX_std',
                                              'linAccX_median', 'linAccY_mean',
                                              'linAccY_std', 'linAccY_median',
                                              'linAccZ_mean', 'linAccZ_std',
                                              'linAccZ_median', 'g...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                                       max_leaf_nodes=None, max_samples=No

# Use in production

In [17]:
# Reload the saved pipeline and model, as a production consumer would.
pipeline = load_model(model_name="squat_model")

Transformation Pipeline and Model Successfully Loaded


In [18]:
# Score the held-out "production" rows with the reloaded pipeline;
# raw_score=True requests the per-class scores in addition to the label.
prediction = predict_model(pipeline, data_prod, raw_score=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9895,0.9987,0.9895,0.9896,0.9894,0.9744,0.9747
