# ETL 

Lo primero que hacemos es importar las funciones. `fitizens_libraries` es la carpeta en la que se encuentran los archivos .py con las funciones:

In [None]:
from fitizens_libraries.load_and_process_training_data import load_training_data
from fitizens_libraries.load_timeseries import load_timeseries_data
from custom_libraries.merge_data import merge_data
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from fitizens_libraries.plot_labeled_sequences import plot_labeled_sequence
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot
from fitizens_libraries.build_dataframe_from_list_of_signals import build_dataframe
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif

Para el proceso de ETL utilizaremos la función `load_training_data`, que:
1. Básicamente busca los archivos JSON en la carpeta zip en la que se encuentren y los carga.
2. Intenta buscar repeticiones falsas y verdaderas del ejercicio.
3. Filtra las repeticiones falsas para que tengan una duración dentro del rango de las repeticiones reales.

Es importante tener en cuenta que la función exige unos parámetros obligatorios que hay que indicar:

- signals : list of str
    List of signals to include in the dataframe. The signals must be present in the input data.
- target_exercise : str
    Exercise to detect repetitions.
- other_exercises : list of str
    List of exercises to use as negative examples.

In [None]:
# Folder holding the zip files with the JSON recordings. It is created when
# missing so the listing below does not fail (the list is then simply empty).
folder_path = "LABELED"
os.makedirs(folder_path, exist_ok=True)

# Path ("LABELED/<name>") of every entry in the folder. The listing is sorted
# because os.listdir returns entries in arbitrary order; downstream steps
# (e.g. the seeded train/test split) may depend on the load order.
file_names = [f"{folder_path}/{name}" for name in sorted(os.listdir(folder_path))]

# Signals requested from the loader (the features).
signals = ["accX", "accY", "accZ", "gyroX", "gyroY", "gyroZ", "magnX", "magnY", "magnZ", "linAccX", "linAccY", "linAccZ"]
# Exercise used as the target.
target_exercise = "SQUAT"

In [None]:
# Number of entries found in the LABELED folder.
len(file_names)

In [None]:
# Run the project loader over the listed files. It returns two values, bound
# here to `data` and `wk`; no other exercises are passed (empty list).
data, wk = load_training_data(
    filelist=file_names,
    signals=signals,
    target_exercise=target_exercise,
    other_exercises=[],
    is_peak_minima=True,
)

In [None]:
#df2=build_dataframe(data)

In [None]:
#df2.head()

In [None]:
# Inspect the first element returned by the loader.
data[0]

In [None]:
# Sanity check: number of elements returned by the loader.
len(data)

In [None]:
# Value stored under the 'target' key of the second element.
data[1]['target']

In [None]:
# Combine the loaded elements with the project helper `merge_data`
# and show how many rows the result has.
df = merge_data(data)
len(df)

In [None]:
# Preview the first rows of the merged data.
df.head()

In [None]:
# Column names available in the merged data.
df.columns

Ahora voy a convertir esto en un problema de clasificación binaria; para ello, voy a crear una columna `squad` que, según la columna `exercising_periods`, indique si hay o no una sentadilla (squat). Si esa columna tiene un cero, indica que no hay sentadilla (etiqueta `no exercise`); de lo contrario, es una sentadilla (etiqueta `squad`).

In [None]:
def nueva_columna(exercise):
    """Map an `exercising_periods` value to a class label.

    Returns 'no exercise' when the value equals 0 and 'squad' for any
    other value.
    """
    return 'no exercise' if exercise == 0 else 'squad'
# Label column: one class string per row, derived from `exercising_periods`.
df['squad'] = df['exercising_periods'].apply(nueva_columna)

In [None]:
# Preview the data now that the `squad` label column has been added.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

# EDA

In [None]:
# Percentage of null values per column.
((df.isnull().sum())/len(df))*100

In [None]:
# Class balance: number of rows per value of the `squad` label.
sns.countplot(x=df['squad'], label = "squad")

In [None]:
# Inspect the index, which is used as the x axis of the line plots.
df.index

In [None]:
# Line plot of linAccZ against the DataFrame index, coloured by the label.
fig = px.line(
    df,
    x=df.index,
    y='linAccZ',
    title='Time serie of exercise linAccZ',
    color='squad',
)
fig.show()

In [None]:
# One box plot per signal, split by the `squad` label. The six plots differ
# only in the column name, so a single loop produces them instead of six
# copy-pasted cells.
for column in ["linAccZ", "linAccY", "linAccX", "accZ_mod", "accX_mod", "accY_mod"]:
    fig = px.box(df, y=column, color="squad", title=f"Distribution of {column} vs target variable")
    # Select plotly's "exclusive" method for computing the quartiles.
    fig.update_traces(quartilemethod="exclusive")
    fig.show()

In [None]:
# Load the time series and the label ranges with the project loader,
# using the same file list and signals as the training-data loader.
timeseries, labels_ranges = load_timeseries_data(
    filelist=file_names,
    signals=signals,
    is_peak_minima=True,
)

In [None]:
# Columns available in the loaded time series.
timeseries.columns

In [None]:
# Plot the time series together with the first five label ranges.
plot_labeled_sequence(timeseries, labels_ranges[0:5])

In [None]:
# Show all label ranges returned by the loader.
labels_ranges

In [None]:
# Series under study: the `exercising_periods` column.
# NOTE: `y` is rebound later (Data prep section), so the cells that use it
# depend on execution order.
y = df['exercising_periods']

num_lags=24 # number of lags shown in the correlograms; value still to be discussed

# Raw series.
plt.plot(y)
plt.show()
# Autocorrelation function up to `num_lags`.
plot_acf(y,lags=num_lags)
plt.show()
# Partial autocorrelation, estimated with the "ols" method.
plot_pacf(y,lags=num_lags,method="ols")
plt.show()

In [None]:
import statsmodels as sm
# Import the function explicitly: `import statsmodels` alone does not load the
# `tsa.stattools` submodule, so `sm.tsa.stattools` only resolves when some
# other import happened to load it first.
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on `y`.
# H0: the series has a unit root (it is not stationary).
# H1: the series is stationary.
# A small p-value is evidence against H0.
adf_test = adfuller(y, maxlag=10)
# Index 0 of the result is the test statistic, index 1 the p-value.
print("ADF test for the original series")
print("Statistic Value:" , adf_test[0])
print("p-value:" , adf_test[1])

In [None]:
# First 1000 rows of df, for a zoomed-in plot.
prim_1000 = df.head(1000)

In [None]:
# linAccZ over the first 1000 rows.
fig = px.line(
    prim_1000,
    x=prim_1000.index,
    y='linAccZ',
    title='Time serie of exercise linAccZ',
)
fig.show()

In [None]:
# Rows 1000..2499 by position, i.e. the 1500 rows that follow the first 1000.
# (The variable name says 2000, but the window has always been 1500 rows.)
# Positional slicing replaces tail(len(df) - 1000): on a frame shorter than
# 1000 rows that passed a negative count to tail() and returned unrelated
# rows; the slice is simply empty in that case.
siguientes_2000 = df.iloc[1000:2500]

In [None]:
# linAccZ over the window stored in `siguientes_2000`.
fig = px.line(
    siguientes_2000,
    x=siguientes_2000.index,
    y='linAccZ',
    title='Time serie of exercise linAccZ',
)
fig.show()

# Data prep

In [None]:
# Rebuild df from the loaded data (this drops the `squad` column added
# during the earlier sections) and show its row count.
df = merge_data(data)
len(df)

In [None]:
# Stack the 'series' frame of every loaded element row-wise.
serie = pd.concat(
    [window['series'] for window in data]
)

In [None]:
# Preview the stacked series.
serie.head()

In [None]:
# Total number of rows across all stacked series.
len(serie)

In [None]:
# Collect, in loader order, the series and the label of every element.
frames = [item['series'] for item in data]
target = [item['target'] for item in data]

# Stack all series row-wise. `target` holds one label per element of `data`,
# so it is kept as a separate list instead of being assigned as a column here.
df = pd.concat(frames)

In [None]:
# Number of collected series (one per element of `data`).
len(frames)

In [None]:
# Number of collected labels; should match len(frames).
len(target)

In [None]:
# Number of elements per label value.
Counter(target)

In [None]:
# Build a DataFrame with one row per element of `data`: the column-wise mean
# of that element's series.
# The rows are gathered in a list and converted in a single step, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. The loop variable no longer reuses the name `df`, so the
# DataFrame built in earlier cells is not overwritten.
mean_rows = [window['series'].mean() for window in data]
promedios_df = pd.DataFrame(mean_rows).reset_index(drop=True)

# Label of each element, in the same order as the rows above.
promedios_df['target'] = [window['target'] for window in data]

In [None]:
# Check that the row count matches the expected 3805 elements.
len(promedios_df)

In [None]:
# Check that the target is still imbalanced and that the counts match.
promedios_df.groupby('target').size()

In [None]:
# Column dtypes and non-null counts of the averaged features.
promedios_df.info()

In [None]:
# Preview the averaged features.
promedios_df.head()

In [None]:
# Class balance of `target` in the averaged data.
sns.countplot(x=promedios_df['target'], label = "squad")

In [None]:
# One box plot per averaged signal, split by `target`. The six plots differ
# only in the column name, so a single loop produces them instead of six
# copy-pasted cells.
for column in ["linAccZ", "linAccY", "linAccX", "accZ_mod", "accX_mod", "accY_mod"]:
    fig = px.box(promedios_df, y=column, color="target", title=f"Distribution of {column} vs target variable")
    # Select plotly's "exclusive" method for computing the quartiles.
    fig.update_traces(quartilemethod="exclusive")
    fig.show()

In [None]:
# Pairwise scatter plots of the three linAcc columns, coloured by `target`.
sns.pairplot(promedios_df, hue= 'target',vars=["linAccX", "linAccY", "linAccZ"])

In [None]:
# Annotated correlation matrix of the averaged data; the large figure keeps
# the per-cell annotations readable.
plt.figure(figsize=(30, 30))
sns.heatmap( promedios_df.corr(), annot = True, cmap ="coolwarm", linewidths = .5)

In [None]:
# Features: every column except the label. Label: the `target` column.
y = promedios_df['target']
X = promedios_df.drop(columns='target')

In [None]:
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
    stratify=y,
)
# Show the shape of each of the four pieces.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Univariate feature selection scored with f_classif, fitted on the training
# split only. k is capped at the number of available columns so the selector
# stays valid if the feature set has fewer than 15 columns.
selected = SelectKBest(score_func=f_classif, k=min(15, X_train.shape[1]))
selected.fit(X_train, y_train)

In [None]:
# Names of the features kept by the selector.
selected.get_feature_names_out()

In [None]:
# Score of every feature as computed by the fitted selector, highest first.
scores = pd.Series(selected.scores_, index=X.columns).sort_values(ascending=False)
# The dependent variable in this notebook is `target`; the earlier title named
# "casual", which appears to be a leftover from another analysis.
px.bar(scores, template="none", title="F-Score of features with target as dependent variable")

In [None]:
# Scale Data
# NOTE(review): `X_selected_features_casual` was used here without being
# defined anywhere in the visible notebook, so the cell raised NameError. It is
# now built from the training split with the fitted selector — confirm this is
# the intended input.
X_selected_features_casual = selected.transform(X_train)
scaler = StandardScaler()
# Fit the scaler on the training features only, then reuse the fitted scaler
# for the test split so no test statistics leak into the scaling.
X_selected_features_casual = scaler.fit_transform(X_selected_features_casual)
X_test_selected_scaled = scaler.transform(selected.transform(X_test))

In [None]:
# Per-row labelled dataset.
# The previous version of this cell re-appended every series once per element
# of `data` and concatenated the growing list column-wise on each iteration,
# so memory grew quadratically (hence the old "do not run" warning). It also
# overwrote the `target` list with a single scalar label.
target = [window['target'] for window in data]
frames = [window['series'] for window in data]

# Tag every row of a series with the label of the element it belongs to
# (assign returns a copy, so the loaded series are not modified), then stack
# the labelled frames row-wise.
labeled_frames = [series.assign(target=label) for series, label in zip(frames, target)]
df = pd.concat(labeled_frames) if labeled_frames else pd.DataFrame()