# Imports and paths

In [None]:
import numpy as np
import pandas as pd
import sklearn.ensemble as ske
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# PATHS
# Parquet files read by the loading cells: `prep_path` feeds the non-augmented
# dataframe, `train_path` / `test_path` feed the augmented train and test sets.
# File names must not carry stray leading/trailing whitespace.
prep_path = 'prep_dataset.parquet'
train_path = 'train_set_not_std.parquet'
test_path = 'test_set_not_std.parquet'

NON-AUGMENTED DATASET

In [None]:
# Load the non-augmented dataset stored at `prep_path` into `df` (pyarrow engine).
df = pd.read_parquet(prep_path, engine='pyarrow')

AUGMENTED DATASET

In [None]:
# Load the augmented train and test splits from their own parquet files (pyarrow engine).
train_set_df = pd.read_parquet(train_path, engine='pyarrow')
test_set_df = pd.read_parquet(test_path, engine='pyarrow')

# Data Exploration

In [None]:
# One row per exercise, holding the list of distinct set labels seen for it.
exercises = (
    df[['Exercise', 'Set']]
    .drop_duplicates()
    .groupby('Exercise')['Set']
    .agg(lambda labels: labels.tolist())
    .reset_index()
)
# Keep the first label where it is and sort the remaining ones.
exercises['Set'] = exercises['Set'].map(lambda labels: [labels[0], *sorted(labels[1:])])
exercises

Unnamed: 0,Exercise,Set
0,Abduction,"[Correct, A, B, C]"
1,Bird,"[Correct, A, B, C, D, E, F]"
2,Bridge,"[Correct, A, B, C, D, E, F]"
3,Knee,"[Correct, A, B]"
4,Shoulder,"[Correct, A, B, C, D, E, F]"
5,Squat,"[Correct, A, B, C]"
6,Stretch,"[Correct, A, B, C, D, E]"


In [None]:
# Mapping (exercise, set label) -> integer id.
# Ids are handed out consecutively, following the row order of `exercises`
# and, within a row, the order of its 'Set' list.
exercise_set_mapping = {}
for exercise_name, set_labels in zip(exercises['Exercise'], exercises['Set']):
    for set_label in set_labels:
        # The current size of the dict is the next free id.
        exercise_set_mapping[(exercise_name, set_label)] = len(exercise_set_mapping)
exercise_set_mapping

{('Abduction', 'Correct'): 0,
 ('Abduction', 'A'): 1,
 ('Abduction', 'B'): 2,
 ('Abduction', 'C'): 3,
 ('Bird', 'Correct'): 4,
 ('Bird', 'A'): 5,
 ('Bird', 'B'): 6,
 ('Bird', 'C'): 7,
 ('Bird', 'D'): 8,
 ('Bird', 'E'): 9,
 ('Bird', 'F'): 10,
 ('Bridge', 'Correct'): 11,
 ('Bridge', 'A'): 12,
 ('Bridge', 'B'): 13,
 ('Bridge', 'C'): 14,
 ('Bridge', 'D'): 15,
 ('Bridge', 'E'): 16,
 ('Bridge', 'F'): 17,
 ('Knee', 'Correct'): 18,
 ('Knee', 'A'): 19,
 ('Knee', 'B'): 20,
 ('Shoulder', 'Correct'): 21,
 ('Shoulder', 'A'): 22,
 ('Shoulder', 'B'): 23,
 ('Shoulder', 'C'): 24,
 ('Shoulder', 'D'): 25,
 ('Shoulder', 'E'): 26,
 ('Shoulder', 'F'): 27,
 ('Squat', 'Correct'): 28,
 ('Squat', 'A'): 29,
 ('Squat', 'B'): 30,
 ('Squat', 'C'): 31,
 ('Stretch', 'Correct'): 32,
 ('Stretch', 'A'): 33,
 ('Stretch', 'B'): 34,
 ('Stretch', 'C'): 35,
 ('Stretch', 'D'): 36,
 ('Stretch', 'E'): 37}

In [None]:
# Every distinct (participant, exercise, set) combination present in df.
ex_by_par = df[['Participant', 'Exercise', 'Set']].drop_duplicates()
# Translate each (exercise, set) pair into its integer id (None when the pair is unmapped).
ex_by_par['Exercise_Set'] = ex_by_par.apply(
    lambda rec: exercise_set_mapping.get((rec['Exercise'], rec['Set'])),
    axis=1,
)
# Collapse to one row per participant holding the sorted list of ids.
ex_by_par = ex_by_par.groupby('Participant')['Exercise_Set'].agg(list).reset_index()
ex_by_par['Exercise_Set'] = ex_by_par['Exercise_Set'].map(sorted)
ex_by_par

Unnamed: 0,Participant,Exercise_Set
0,P04,"[0, 1, 2, 4, 5, 6, 11, 12, 13, 18, 19, 20, 21,..."
1,P05,"[0, 1, 3, 4, 7, 8, 11, 14, 15, 18, 19, 20, 21,..."
2,P06,"[0, 2, 3, 4, 9, 10, 11, 12, 13, 18, 19, 20, 21..."
3,P07,"[0, 1, 2, 4, 5, 6, 11, 14, 15, 18, 19, 20, 21,..."
4,P08,"[0, 1, 3, 4, 7, 8, 11, 16, 17, 18, 19, 20, 21,..."
5,P09,"[0, 2, 3, 4, 5, 6, 11, 14, 15, 18, 19, 20, 21,..."
6,P10,"[0, 1, 2, 4, 7, 8, 11, 16, 17, 18, 19, 20, 21,..."
7,P11,"[0, 1, 3, 4, 5, 9, 11, 14, 15, 18, 19, 20, 21,..."
8,P12,"[0, 2, 3, 4, 7, 8, 11, 16, 17, 18, 19, 20, 21,..."
9,P13,"[0, 1, 2, 4, 5, 6, 11, 15, 16, 18, 19, 20, 21,..."


In [None]:
def greedy_set_cover(universe, sets):
    """
    Greedily pick sets until every element of the universe is covered.

    At each step the set covering the largest number of still-uncovered
    elements is selected; ties go to the set that comes first in ``sets``.

    :param universe: iterable of hashable elements that must be covered
    :param sets: collection of candidate sets (any re-iterable collections of elements)
    :return: the selected members of ``sets`` (the original objects), in selection order;
             an empty list when the universe is empty
    :raises ValueError: if some element of the universe belongs to none of the candidates
    """
    uncovered_elements = set(universe)
    selected_sets = []

    while uncovered_elements:
        # Find the set that covers the maximum number of uncovered elements
        best_set = max(
            sets,
            key=lambda s: len(uncovered_elements.intersection(s)),
            default=None,
        )

        # Without this guard the loop would never end: a best candidate that
        # covers nothing new leaves `uncovered_elements` unchanged forever.
        if best_set is None or not uncovered_elements.intersection(best_set):
            raise ValueError(
                f"no set covers the remaining elements: {sorted(uncovered_elements, key=repr)}"
            )

        # Update the selected sets and uncovered elements
        selected_sets.append(best_set)
        uncovered_elements.difference_update(best_set)

    return selected_sets


# One set of exercise/set ids per participant, in the row order of ex_by_par.
sets_of_ex_by_par = list(map(set, ex_by_par['Exercise_Set'].values))

# The universe to cover: the ids 0 .. len(exercise_set_mapping) - 1.
all_exercises_set = list(range(len(exercise_set_mapping)))

result = greedy_set_cover(all_exercises_set, sets_of_ex_by_par)

# Replace every selected id set by the participant(s) whose id list equals it.
result = [
    ex_by_par.loc[
        ex_by_par['Exercise_Set'].apply(lambda ids: set(ids) == chosen),
        'Participant',
    ].values
    for chosen in result
]
result

[array(['P04'], dtype=object),
 array(['P05'], dtype=object),
 array(['P12'], dtype=object),
 array(['P06'], dtype=object),
 array(['P09'], dtype=object)]

In [None]:
#invert the table, so that we have a list of participants for each exercise_set
exploded = ex_by_par.explode('Exercise_Set')
par_by_ex = exploded.groupby('Exercise_Set')['Participant'].agg(list).reset_index()
par_by_ex

Unnamed: 0,Exercise_Set,Participant
0,0,"[P04, P05, P06, P07, P08, P09, P10, P11, P12, ..."
1,1,"[P04, P05, P07, P08, P10, P11, P13, P14, P16, ..."
2,2,"[P04, P06, P07, P09, P10, P12, P13, P15, P16, ..."
3,3,"[P05, P06, P08, P09, P11, P12, P14, P15, P17, ..."
4,4,"[P04, P05, P06, P07, P08, P09, P10, P11, P12, ..."
5,5,"[P04, P07, P09, P11, P13, P15, P17, P19, P21, ..."
6,6,"[P04, P07, P09, P13, P17, P19]"
7,7,"[P05, P08, P10, P12, P14, P16, P18, P20, P22, ..."
8,8,"[P05, P08, P10, P12, P14, P16, P18, P20, P22, ..."
9,9,"[P06, P11, P15, P21, P23, P25, P27]"


# Machine Learning

In [None]:
# Distinct exercise names and participant tags found in df, in order of first appearance.
# The split functions index into these arrays (exercise by position, participants by prefix).
all_exercises= df['Exercise'].drop_duplicates().values # 7 exercises
all_participants_tags = df['Participant'].drop_duplicates().values # 25 participants

In [None]:
def split_dataset_with_augmentation(ex_num):
    """
    Build flattened train/test samples for one exercise from the augmented data.

    Rows of ``train_set_df`` and ``test_set_df`` belonging to the chosen exercise
    are grouped by (Participant, Camera, Set). Each group loses its non-feature
    columns, is cut to the row count of the shortest group found in either split,
    and is flattened into one vector. The group's ``Set`` value is its label.

    :param ex_num: index into ``all_exercises`` of the exercise to be analysed (0-6)
    :return: the exercise name, the X_train, y_train, X_test, y_test
    """
    grouping_keys = ['Participant', 'Camera', 'Set']
    # Identifier, time and one-hot exercise columns that are not used as features.
    dropped_columns = [
        'Exercise', 'Set', 'Participant', 'Camera', 'time(s)', 'video_id',
        'encoded_exo_Abduction', 'encoded_exo_Bird', 'encoded_exo_Bridge',
        'encoded_exo_Knee', 'encoded_exo_Shoulder', 'encoded_exo_Squat',
        'encoded_exo_Stretch',
    ]

    # Restrict both splits to the requested exercise and group them.
    ex_analysed = all_exercises[ex_num]
    train_groups = train_set_df[train_set_df['Exercise'] == ex_analysed].copy().groupby(grouping_keys)
    test_groups = test_set_df[test_set_df['Exercise'] == ex_analysed].copy().groupby(grouping_keys)

    # Truncating every group to the shortest one gives all flattened vectors
    # the same length.
    shortest_train = min(len(group) for _, group in train_groups)
    shortest_test = min(len(group) for _, group in test_groups)
    min_rows = min(shortest_train, shortest_test)

    def to_samples(groups):
        # Turn each group into (flattened feature vector, Set label).
        features, labels = [], []
        for group_key, group in groups:
            matrix = group.drop(columns=dropped_columns).iloc[:min_rows].values
            features.append(matrix.flatten())
            labels.append(group_key[2])  # third grouping key is 'Set'
        return features, labels

    X_train, y_train = to_samples(train_groups)
    X_test, y_test = to_samples(test_groups)
    return ex_analysed, X_train, y_train, X_test, y_test

In [None]:
def split_dataset_without_augmentation(exercise_nbr, participants_nbr):
    """
    Build flattened train/test samples for one exercise from the non-augmented data.

    Rows of ``df`` belonging to the chosen exercise are grouped by
    (Participant, Camera, Set). Groups of the first ``participants_nbr``
    participants of ``all_participants_tags`` form the train set, all other
    groups the test set. Each group loses its non-feature columns, is cut to the
    row count of the shortest group of the exercise, and is flattened into one
    vector. The group's ``Set`` value is its label.

    :param exercise_nbr: index into ``all_exercises`` of the exercise to be analysed (0-6)
    :param participants_nbr: number of participants to be included in the train set (1-24)
    :return: the exercise name, the X_train, y_train, X_test, y_test
    """
    grouping_keys = ['Participant', 'Camera', 'Set']
    # Identifier and time columns that are not used as features.
    dropped_columns = ['Exercise', 'Set', 'Participant', 'Camera', 'time(s)']

    ex_analysed = all_exercises[exercise_nbr]
    # Participants assigned to the train set; everyone else goes to the test set.
    participant_split = all_participants_tags[:participants_nbr]

    ex_analysed_df = df[df['Exercise'] == ex_analysed]
    # Shortest recording over all groups of this exercise (train and test together).
    min_rows = min(len(group) for _, group in ex_analysed_df.groupby(grouping_keys))

    in_train = ex_analysed_df['Participant'].isin(participant_split)
    train_groups = ex_analysed_df[in_train].groupby(grouping_keys)
    test_groups = ex_analysed_df[~in_train].groupby(grouping_keys)

    def to_samples(groups):
        # Turn each group into (flattened feature vector, Set label).
        features, labels = [], []
        for group_key, group in groups:
            matrix = group.drop(columns=dropped_columns).iloc[:min_rows].values
            features.append(matrix.flatten())
            labels.append(group_key[2])  # third grouping key is 'Set'
        return features, labels

    X_train, y_train = to_samples(train_groups)
    X_test, y_test = to_samples(test_groups)
    return ex_analysed, X_train, y_train, X_test, y_test

NON-AUGMENTED RESULTS

In [31]:
# Train and evaluate one random forest per exercise on the non-augmented data
# (the first 20 participants train, the remaining ones test).
accuracy_list = []
# Iterate over every exercise in `all_exercises` instead of a hard-coded count.
for i in range(len(all_exercises)):
    ex_analysed, X_train, y_train, X_test, y_test = split_dataset_without_augmentation(i, 20)

    # Define the model (alternatives that were tried are kept commented out)

    # model2 = xgb.XGBClassifier(n_estimators=10, max_depth=2, objective='binary:logistic', num_parallel_tree = 3)
    # model2 = ske.GradientBoostingClassifier(random_state=0, n_estimators=20, learning_rate=0.1, max_depth=3)
    # model2 = ske.VotingClassifier(estimators=[
        # ('gb', ske.GradientBoostingClassifier(random_state=0, n_estimators=20, learning_rate=0.1, max_depth=3, verbose = 10)),
        #  ('rf', ske.RandomForestClassifier(random_state=0, n_estimators=100, max_depth=50, bootstrap=True, verbose = 10))])
    # model2 = svm.SVC(kernel='linear', verbose=1)

    model2 = ske.RandomForestClassifier(random_state=1, n_estimators=100, max_depth=100, bootstrap=True, n_jobs=-1)

    # Optional hyperparameter search (disabled):
    # Create the GridSearchCV object
    # grid_search = GridSearchCV(model2, {'n_estimators': [50, 100, 150], 'max_depth': [10, 50, 100, 150]}, cv=5, verbose=2, n_jobs=-1)
    # Fit the model with the training data
    # grid_search.fit(X_train, y_train)
    # Print the best hyperparameters
    # print("Best Hyperparameters:", grid_search.best_params_)
    # Access the best model directly if needed
    # best_model = grid_search.best_estimator_

    # Train the model on the entire training set
    model2.fit(X_train,y_train)

    # Make predictions on the test set
    y_pred = model2.predict(X_test)

    # Report the accuracy (precision / recall / f1-score reports are left commented out)
    print('exercise: ', ex_analysed)
    # print(classification_report(y_test, y_pred))
    accuracy=round(accuracy_score(y_test, y_pred), 2)
    print('accuracy: ', accuracy)
    accuracy_list.append(accuracy)
    # print('precision: ', round(precision_score(y_test, y_pred, average='weighted'), 2))
    # print('recall: ', round(recall_score(y_test, y_pred, average='weighted'), 2))
    # print('f1-score: ', round(f1_score(y_test, y_pred, average='weighted'), 2))
    print('------------------------------------------------------')

# Unweighted mean of the per-exercise accuracies (each already rounded to 2 decimals).
print('global accuracy:', np.mean(accuracy_list))

exercise:  Abduction
accuracy:  0.52
------------------------------------------------------
exercise:  Bird
accuracy:  0.55
------------------------------------------------------
exercise:  Bridge
accuracy:  0.35
------------------------------------------------------
exercise:  Knee
accuracy:  0.58
------------------------------------------------------
exercise:  Shoulder
accuracy:  0.48
------------------------------------------------------
exercise:  Squat
accuracy:  0.6
------------------------------------------------------
exercise:  Stretch
accuracy:  0.6
------------------------------------------------------
global accuracy: 0.5257142857142857


AUGMENTED RESULTS

In [35]:
# Train and evaluate one random forest per exercise on the augmented
# train/test split.
accuracy_list = []
# Iterate over every exercise in `all_exercises` instead of a hard-coded count.
for i in range(len(all_exercises)):
    ex_analysed, X_train, y_train, X_test, y_test = split_dataset_with_augmentation(i)

    # Define the model (alternatives that were tried are kept commented out)

    # model2 = xgb.XGBClassifier(n_estimators=10, max_depth=2, objective='binary:logistic', num_parallel_tree = 3)
    # model2 = ske.GradientBoostingClassifier(random_state=0, n_estimators=20, learning_rate=0.1, max_depth=3)
    # model2 = ske.VotingClassifier(estimators=[
        # ('gb', ske.GradientBoostingClassifier(random_state=0, n_estimators=20, learning_rate=0.1, max_depth=3, verbose = 10)),
        #  ('rf', ske.RandomForestClassifier(random_state=0, n_estimators=100, max_depth=50, bootstrap=True, verbose = 10))])
    # model2 = svm.SVC(kernel='linear', verbose=1)

    model = ske.RandomForestClassifier(random_state=1, n_estimators=150, max_depth=20, bootstrap=True, n_jobs=-1)

    # Optional hyperparameter search (disabled):
    # Create the GridSearchCV object
    # grid_search = GridSearchCV(model, {'n_estimators': [100, 150, 200], 'max_depth': [10, 20, 40, 50]}, cv=5, verbose=2, n_jobs=-1)
    # Fit the model with the training data
    # grid_search.fit(X_train, y_train)
    # Print the best hyperparameters
    # print("Best Hyperparameters:", grid_search.best_params_)
    # Access the best model directly if needed
    # best_model = grid_search.best_estimator_


    # Train the model on the entire training set
    model.fit(X_train,y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Report the accuracy (precision / recall / f1-score reports are left commented out)
    print('exercise: ', ex_analysed)
    # print(classification_report(y_test, y_pred))
    accuracy=round(accuracy_score(y_test, y_pred), 2)
    print('accuracy: ', accuracy)
    accuracy_list.append(accuracy)
    # print('precision: ', round(precision_score(y_test, y_pred, average='weighted'), 2))
    # print('recall: ', round(recall_score(y_test, y_pred, average='weighted'), 2))
    # print('f1-score: ', round(f1_score(y_test, y_pred, average='weighted'), 2))
    print('------------------------------------------------------')

# Unweighted mean of the per-exercise accuracies (each already rounded to 2 decimals).
print('global accuracy:', np.mean(accuracy_list))

exercise:  Abduction
accuracy:  0.53
------------------------------------------------------
exercise:  Bird
accuracy:  0.48
------------------------------------------------------
exercise:  Bridge
accuracy:  0.33
------------------------------------------------------
exercise:  Knee
accuracy:  0.57
------------------------------------------------------
exercise:  Shoulder
accuracy:  0.53
------------------------------------------------------
exercise:  Squat
accuracy:  0.55
------------------------------------------------------
exercise:  Stretch
accuracy:  0.47
------------------------------------------------------
global accuracy: 0.4942857142857143
