# Importing Libraries

In [1]:
from fitizens_libraries.load_and_process_training_data import load_training_data
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pycaret.classification import *

# Creating Auxiliary Functions


In [2]:
def create_custom_dataframe(series):
    df =  series[['linAccX', 'linAccY', 'linAccZ', 'gyroX',
                'gyroY', 'gyroZ', 'accX_mod', 'accY_mod', 'accZ_mod', 'gyroX_mod',
                'gyroY_mod', 'gyroZ_mod', 'magnX_mod', 'magnY_mod', 'magnZ_mod',
                'linAccX_mod', 'linAccY_mod', 'linAccZ_mod']]
    return df

In [3]:
def create_training_data_stats(df, target):
    stats_dict = {}
    series = create_custom_dataframe(df)
    
    for column in series.columns:
        mean = series[column].mean()
        std = series[column].std()
        median = series[column].median()
        skewness = series[column].skew()
        kurtosis = series[column].kurtosis()

       # min_val = series[column].min()
      #  max_val = series[column].max()
      #  range_val = max_val - min_val
      #  quartile_25 = np.percentile(series[column], 25)
      #  quartile_75 = np.percentile(series[column], 75)
      #  iqr = quartile_75 - quartile_25
    

        stats_dict[f"{column}_mean"] = mean
        stats_dict[f"{column}_std"] = std
        stats_dict[f"{column}_median"] = median
        stats_dict[f"{column}_skewness"] = skewness
        stats_dict[f"{column}_kurtosis"] = kurtosis

       # stats_dict[f"{column}_min"] = min_val
       # stats_dict[f"{column}_max"] = max_val
       # stats_dict[f"{column}_range"] = range_val
      #  stats_dict[f"{column}_quartile_25"] = quartile_25
       # stats_dict[f"{column}_quartile_75"] = quartile_75
       # stats_dict[f"{column}_iqr"] = iqr
        
        
    stats_dict["target"] = target
    return stats_dict

# Loading Data

In [4]:
folder_path = "DUMBBELL"
os.makedirs(folder_path, exist_ok=True)
file_names = [f"{folder_path}/{name}" for name in os.listdir(folder_path)]
signals = ['accX', 'accY', 'accZ', 'gyroX', 'gyroY', 'gyroZ', 'magnX', 'magnY', 'magnZ', 'linAccX', 'linAccY', 'linAccZ']

data, wk = load_training_data(filelist=file_names,
                         signals= signals,
                          target_exercise="DUMBBELL_SNATCH", other_exercises=[], is_peak_minima=True)
#data[0]

In [5]:
from custom_libraries.merge_data import merge_data

In [6]:
df = merge_data(data)

In [None]:
df.columns

In [None]:
columns_to_keep = [
    'accX', 'accY', 'accZ', 'linAccX', 'linAccY', 'linAccZ',
    'gyroX', 'gyroY', 'gyroZ', 'accX_mod', 'accY_mod', 'accZ_mod',
    'gyroX_mod', 'gyroY_mod', 'gyroZ_mod', 'magnX_mod', 'magnY_mod',
    'magnZ_mod', 'linAccX_mod', 'linAccY_mod', 'linAccZ_mod'
]

# Remove columns not in the list of columns_to_keep
df = df[columns_to_keep]

In [None]:
df.columns

In [None]:
df.info()

# Creating new data

In [7]:
data_info = [create_training_data_stats(info["series"], info["target"]) for info in data]
data_custom = pd.DataFrame(data_info)
data_custom.head()

Unnamed: 0,linAccX_mean,linAccX_std,linAccX_median,linAccX_skewness,linAccX_kurtosis,linAccY_mean,linAccY_std,linAccY_median,linAccY_skewness,linAccY_kurtosis,...,linAccY_mod_std,linAccY_mod_median,linAccY_mod_skewness,linAccY_mod_kurtosis,linAccZ_mod_mean,linAccZ_mod_std,linAccZ_mod_median,linAccZ_mod_skewness,linAccZ_mod_kurtosis,target
0,0.541847,0.394069,0.660997,-0.777658,-0.580579,0.675534,0.894215,1.091777,-0.433954,-1.176486,...,1.076209,1.191977,0.781878,-0.235597,17.57836,16.02011,11.847352,0.715174,-0.802404,DUMBBELL_SNATCH
1,-0.074538,0.527332,-0.050235,-0.149851,-1.460313,0.598507,0.266881,0.603392,-0.319812,-0.255269,...,0.310353,0.364082,0.762078,0.068343,24.937133,20.632204,20.924743,0.545944,-0.920501,DUMBBELL_SNATCH
2,0.088198,0.248036,0.165647,-1.345887,0.831077,1.149927,0.703037,1.28998,-0.218163,-1.234202,...,1.56792,1.664051,0.466006,-1.045808,28.171815,22.57296,25.392311,0.495648,-0.933726,DUMBBELL_SNATCH
3,-0.302354,0.330763,-0.240866,0.054734,-0.443915,0.529161,0.506785,0.667612,-1.315278,0.475623,...,0.298021,0.445706,0.461241,-0.474601,25.334828,22.431922,20.808947,0.743078,-0.650686,DUMBBELL_SNATCH
4,-0.179401,0.254849,-0.123012,-0.11726,-0.296821,0.870408,0.421691,0.949073,-0.594809,-0.409245,...,0.659636,0.900739,0.343322,-0.805311,15.98395,15.132598,11.735994,0.720037,-0.779917,DUMBBELL_SNATCH


# EDA

In [None]:
data_custom.shape

In [None]:
data_custom.isnull().sum()

In [None]:
data_custom.target.value_counts()

In [None]:
corr_matrix = data_custom.drop(columns=["target"],axis=1).corr()
corr_matrix

In [None]:
# Create a figure and a set of subplots
df_box = data_custom.drop(columns=["target"],axis=1)
fig, axes = plt.subplots(nrows=len(df_box.columns), ncols=1, figsize=(8, 4 * len(df_box.columns)))

# Create a box plot for each column
for i, col in enumerate(df_box.columns):
    sns.boxplot(x=df_box[col], ax=axes[i])

# Add a title (optional)
fig.suptitle('Box Plots', fontsize=16)

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

# ML with Pycaret

In [8]:
data_dev = data_custom.sample(frac=0.95, random_state=786)
data_prod = data_custom.drop(data_dev.index)

data_dev.reset_index(inplace=True, drop=True)
data_prod.reset_index(inplace=True, drop=True)

print('Data for Modeling: ' + str(data_dev.shape))
print('Simulated data For Production ' + str(data_prod.shape))

Data for Modeling: (674, 91)
Simulated data For Production (36, 91)


In [None]:
data_dev

In [9]:
model = setup(
    # Basic options
    data = data_dev,
    target = "target",
    train_size = 0.7,
    preprocess = True,
    
    # Dealing with multicollinearity
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.9,
        
    # Feature normalization with outliers
    normalize = True,
    normalize_method = 'robust',
        
    # Paralellization options
    n_jobs = - 1,
    use_gpu = False,
    
    # Imbalance Dataset
    fix_imbalance=True,
    
    # Feature Importance
    feature_selection = True,
    n_features_to_select= 10
)

[LightGBM] [Info] Number of positive: 377, number of negative: 377
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15060
[LightGBM] [Info] Number of data points in the train set: 754, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Description,Value
0,Session id,7601
1,Target,target
2,Target type,Binary
3,Target mapping,"DUMBBELL_SNATCH: 0, NO_EXERCISE: 1"
4,Original data shape,"(674, 91)"
5,Transformed data shape,"(957, 11)"
6,Transformed train set shape,"(754, 11)"
7,Transformed test set shape,"(203, 11)"
8,Numeric features,90
9,Preprocess,True


In [10]:
models = compare_models(sort="F1", fold=2)
models

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9957,0.9999,0.9957,0.9957,0.9957,0.9867,0.9867,0.335
xgboost,Extreme Gradient Boosting,0.9915,0.9989,0.9915,0.9917,0.9915,0.9734,0.9738,0.2
ada,Ada Boost Classifier,0.9894,0.9983,0.9894,0.9897,0.9894,0.9672,0.9674,0.215
lightgbm,Light Gradient Boosting Machine,0.9894,0.9985,0.9894,0.9894,0.9894,0.9668,0.9669,0.22
rf,Random Forest Classifier,0.9873,0.9999,0.9873,0.9879,0.9874,0.961,0.9616,2.63
catboost,CatBoost Classifier,0.9872,0.9992,0.9872,0.9878,0.9874,0.9609,0.9613,2.35
nb,Naive Bayes,0.9809,0.9985,0.9809,0.9821,0.9811,0.9417,0.9429,2.38
gbc,Gradient Boosting Classifier,0.9809,0.9948,0.9809,0.9816,0.981,0.9413,0.9418,0.285
qda,Quadratic Discriminant Analysis,0.9809,0.9996,0.9809,0.9811,0.9806,0.9385,0.9396,2.365
dt,Decision Tree Classifier,0.9745,0.9601,0.9745,0.9749,0.9744,0.9199,0.9206,2.325


In [11]:
cnt_models_df = pull()
cnt_models_df

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9957,0.9999,0.9957,0.9957,0.9957,0.9867,0.9867,0.335
xgboost,Extreme Gradient Boosting,0.9915,0.9989,0.9915,0.9917,0.9915,0.9734,0.9738,0.2
ada,Ada Boost Classifier,0.9894,0.9983,0.9894,0.9897,0.9894,0.9672,0.9674,0.215
lightgbm,Light Gradient Boosting Machine,0.9894,0.9985,0.9894,0.9894,0.9894,0.9668,0.9669,0.22
rf,Random Forest Classifier,0.9873,0.9999,0.9873,0.9879,0.9874,0.961,0.9616,2.63
catboost,CatBoost Classifier,0.9872,0.9992,0.9872,0.9878,0.9874,0.9609,0.9613,2.35
nb,Naive Bayes,0.9809,0.9985,0.9809,0.9821,0.9811,0.9417,0.9429,2.38
gbc,Gradient Boosting Classifier,0.9809,0.9948,0.9809,0.9816,0.981,0.9413,0.9418,0.285
qda,Quadratic Discriminant Analysis,0.9809,0.9996,0.9809,0.9811,0.9806,0.9385,0.9396,2.365
dt,Decision Tree Classifier,0.9745,0.9601,0.9745,0.9749,0.9744,0.9199,0.9206,2.325


In [12]:
clf = create_model('et', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9958,1.0,0.9958,0.9959,0.9958,0.9868,0.9869
1,0.9915,0.9995,0.9915,0.9918,0.9916,0.9738,0.9742
Mean,0.9936,0.9998,0.9936,0.9938,0.9937,0.9803,0.9805
Std,0.0021,0.0002,0.0021,0.002,0.0021,0.0065,0.0064


In [13]:
tuned_clf = tune_model(clf, optimize = 'F1', fold = 2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9915,0.9997,0.9915,0.9919,0.9916,0.9739,0.9742
1,0.9915,1.0,0.9915,0.9918,0.9916,0.9738,0.9742
Mean,0.9915,0.9998,0.9915,0.9919,0.9916,0.9738,0.9742
Std,0.0,0.0002,0.0,0.0,0.0,0.0,0.0


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [14]:
print("Total of features: ", len(tuned_clf.feature_importances_))

Total of features:  10


In [15]:
evaluate_model(tuned_clf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [16]:
model_final = finalize_model(tuned_clf)

In [17]:
save_model(model_final, 'DUMBBELL')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['linAccX_mean', 'linAccX_std',
                                              'linAccX_median',
                                              'linAccX_skewness',
                                              'linAccX_kurtosis', 'linAccY_mean',
                                              'linAccY_std', 'linAccY_median',
                                              'linAccY_s...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                             

## REGULARITZATION

Documentation: https://catboost.ai/en/docs/concepts/python-reference_catboost

In [None]:
params_grid = {
    'iterations': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'depth': [3, 5, 7, 9],  # Regularization by controlling tree depth
}

In [None]:
# Tune the model with regularization and other hyperparameters
tuned_clf = tune_model(clf, custom_grid=params_grid, optimize='F1', fold=2)

In [None]:
#tuned_clf = tune_model(clf, optimize = 'F1', fold = 2)

In [None]:
print("Total of features: ", len(tuned_clf.feature_importances_))

In [None]:
evaluate_model(tuned_clf)

# Save Model

In [None]:
# We finalize model (we train on the whole set)
model_final = finalize_model(tuned_clf)

In [None]:
# Export the PyCaret model
save_model(model_final, 'DUMBBELL')

# Use in Production

In [None]:
# Load the model
pipeline = load_model(model_name="squats_traditional")

In [None]:
# Utilizar modelo
prediction = predict_model(pipeline, data_prod, raw_score=True)

In [None]:
prediction

# Scores

In [None]:
true_labels = prediction["target"]
predictions = prediction["prediction_label"]

In [None]:
from sklearn.metrics import roc_auc_score

predictions_prob = prediction["prediction_score_SQUAT"]
binary_true_labels = true_labels.map({"BRUPEE": 1, "NO_EXERCISE": 0})
auc_score = roc_auc_score(binary_true_labels, predictions_prob)

print(f"AUC: {auc_score}")