In [147]:
import pandas as pd
import numpy as np
from utilities import Data_cleaning

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.express.colors import sample_colorscale
from IPython.display import clear_output

import math
import copy
%matplotlib inline

# Let pandas print up to 500 rows of a DataFrame/Series before truncating the repr.
pd.options.display.max_rows = 500

In [148]:
# Demographic table; its index is labelled `session_execution_id`.
demographic_path = '../data/flattened_database_merged_with_session_executions_v02.h5'
df_demographic = pd.read_hdf(demographic_path, key='data')
df_demographic.index.name = "session_execution_id"
# Every NaN in every column becomes 0. Done in place so the wide frame is not
# duplicated in memory.
df_demographic.replace(np.nan, 0, inplace=True)

# Main (already filtered) dataset.
dataset_path = '../data/filtered_dataset_v1.h5'
df = pd.read_hdf(dataset_path, key='data')

# Cell output: the shapes of both frames.
df_demographic.shape, df.shape

((51355, 9186), (46024, 8677))

In [127]:
# Build a narrow demographic frame (one row per session execution) that carries
# a derived `users_bmi` column next to the raw user attributes.
df_demographic_columns = ['user_programs_user_id',
                            'users_created_at',
                            'users_date_of_birth',
                            'users_activity_level',
                            'users_gender',
                            'users_body_type',
                            'users_weight',
                            'users_height',
                            'users_body_fat',
                            'users_goal',
                            'users_bmi',
                            'session_executions_difficulty_feedback',
                            'session_executions_enjoyment_feedback']

# Select the needed source columns *before* copying, so the full (very wide)
# demographic frame is not duplicated just to add a single column.
source_columns = [col for col in df_demographic_columns if col != 'users_bmi']
df_demographic_filtered = df_demographic[source_columns].copy()

# BMI = weight / (height / 100) ** 2  (assumes weight in kg and height in cm — TODO confirm).
# Missing values in df_demographic were replaced by 0 at load time, so a missing
# height would otherwise divide by zero and produce inf. Non-positive weights and
# heights are masked to NaN so the BMI is NaN for those rows instead.
valid_weight = df_demographic_filtered['users_weight'].where(df_demographic_filtered['users_weight'] > 0)
valid_height_m = df_demographic_filtered['users_height'].where(df_demographic_filtered['users_height'] > 0) / 100
df_demographic_filtered['users_bmi'] = valid_weight / valid_height_m ** 2

# Restore the declared column order (users_bmi sits between the user attributes
# and the feedback columns).
df_demographic_filtered = df_demographic_filtered[df_demographic_columns]

In [149]:
# Inner-join the main dataset with the demographic columns, matching rows on
# the index of both frames. Columns that exist on both sides get pandas'
# default suffixes: `_x` for the left frame (df) and `_y` for the right one.
df_merged = pd.merge(
    df,
    df_demographic_filtered,
    left_index=True,
    right_index=True,
    how='inner',
)

In [165]:
# Inspect the merged frame.
# NOTE(review): the saved output of this cell already contains the
# `month_of_sessions` column, which is only added to df_merged in a later cell,
# so this cell was last executed out of order.
df_merged

Unnamed: 0,session_executions_updated_at,user_programs_user_id_x,1 leg bridge (left)_reps_1,1 leg bridge (left)_reps_10,1 leg bridge (left)_reps_11,1 leg bridge (left)_reps_12,1 leg bridge (left)_reps_13,1 leg bridge (left)_reps_14,1 leg bridge (left)_reps_15,1 leg bridge (left)_reps_16,...,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi,session_executions_difficulty_feedback,session_executions_enjoyment_feedback,month_of_sessions
3737,2021-06-11 18:00:35.640406,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,68.000000,164.00,25.0,0,25.282570,2.0,3.0,6
5830,2021-10-30 16:51:09.117908,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,68.000000,164.00,25.0,0,25.282570,3.0,3.0,10
17048,2021-12-08 19:19:10.266601,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,68.000000,164.00,25.0,0,25.282570,7.0,4.0,12
17351,2021-12-09 19:13:22.047997,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,68.000000,164.00,25.0,0,25.282570,2.0,2.0,12
715926,2022-01-24 11:38:17.767057,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,68.000000,164.00,25.0,0,25.282570,5.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746183,2022-05-25 20:36:55.438881,18157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,2,98.000000,172.00,20.0,0,33.126014,5.0,3.0,5
746178,2022-05-25 20:12:56.904269,18165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,66.000000,162.00,20.0,1,25.148605,5.0,3.0,5
746199,2022-05-26 00:49:19.809268,18165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,66.000000,162.00,20.0,1,25.148605,5.0,3.0,5
746468,2022-05-27 00:56:28.806426,18165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,0,66.000000,162.00,20.0,1,25.148605,5.0,3.0,5


In [None]:
# Keep one row per user: the last row in which each user id appears.
# NOTE(review): "last" means last in row order, not latest by timestamp; this
# presumably relies on df_merged being ordered chronologically per user — confirm.
is_earlier_row = df_merged.duplicated(subset=['user_programs_user_id_x'], keep='last')
df_users = df_merged[~is_earlier_row]

In [151]:
# User-level attributes to keep; the user id column becomes the index below.
user_columns = [
    'user_programs_user_id_x',
    'users_created_at',
    'session_executions_updated_at',
    'users_date_of_birth',
    'users_activity_level',
    'users_gender',
    'users_body_type',
    'users_weight',
    'users_height',
    'users_body_fat',
    'users_goal',
    'users_bmi',
]

# Restrict to those columns and reassign the index to user_programs_user_id_x.
df_users = df_users[user_columns].set_index('user_programs_user_id_x')
df_users

Unnamed: 0_level_0,users_created_at,session_executions_updated_at,users_date_of_birth,users_activity_level,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi
user_programs_user_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
108,2020-11-21 11:02:22.366198,2022-04-10 10:17:40.613564,1984-08-29,2,False,0,68.000000,164.00,25.0,0,25.282570
112,2020-11-22 22:24:45.117192,2022-01-07 18:08:55.844447,1978-02-18,2,True,0,73.000000,178.00,20.0,0,23.040020
144,2020-11-30 15:05:58.467008,2020-12-10 12:02:56.629641,2020-11-30,1,True,0,60.000000,160.00,25.0,0,23.437500
172,2020-12-07 16:42:33.327819,2021-11-26 06:37:04.309295,1960-12-31,1,True,1,92.000000,179.00,45.0,0,28.713211
182,2020-12-11 11:34:00.850343,2020-12-11 14:17:15.877294,2000-01-01,1,True,0,60.000000,160.00,15.0,0,23.437500
...,...,...,...,...,...,...,...,...,...,...,...
18127,2022-05-24 13:08:58.940710,2022-05-25 04:40:12.028136,1975-06-07,1,False,0,58.000000,160.00,20.0,1,22.656250
18147,2022-05-25 07:12:53.616277,2022-05-25 16:15:23.597621,1980-09-20,2,False,0,55.000000,149.00,20.0,0,24.773659
18157,2022-05-25 16:36:21.636963,2022-05-25 20:36:55.438881,2001-06-21,2,True,2,98.000000,172.00,20.0,0,33.126014
18165,2022-05-25 19:29:50.757574,2022-05-27 00:56:28.806426,1985-12-13,1,False,0,66.000000,162.00,20.0,1,25.148605


In [168]:
# First recorded session per user: take the first row in which each user id
# appears, keep only its timestamp, and index the result by user id.
first_training_date = (
    df_merged
    .drop_duplicates(subset=['user_programs_user_id_x'], keep='first')
    .loc[:, ['session_executions_updated_at', 'user_programs_user_id_x']]
    .set_index('user_programs_user_id_x')
)

# Attach it to the user frame. Both sides carry `session_executions_updated_at`,
# so the merge suffixes them (`_x` from df_users, `_y` from first_training_date);
# the rename then gives the two timestamps explicit names.
df_users = pd.merge(
    df_users,
    first_training_date,
    how='inner',
    left_index=True,
    right_index=True,
).rename(columns={
    'session_executions_updated_at_x': 'Last_training_date',
    'session_executions_updated_at_y': 'first_training_date',
})

In [152]:
# Count the rows of df_merged per user id and attach the counts to the user
# frame; the values are aligned on the user-id index of df_users.
sessions_per_user = df_merged['user_programs_user_id_x'].value_counts()
df_users = df_users.assign(number_of_sessions=sessions_per_user)

In [153]:
# Add the total training time per user: the sum of `sum_of_time` over all of
# the user's rows in df_merged. (This is the summed session time, not the
# calendar span between the first and the last session.)

# The Series is named before merging. pandas only appends `_x`/`_y` suffixes
# when a column name exists on both sides, so renaming `sum_of_time_x` after
# this merge would silently match nothing and leave the column as `sum_of_time`.
duration_of_activity = (
    df_merged
    .groupby('user_programs_user_id_x')['sum_of_time']
    .sum()
    .rename('duration_of_activity')
)

# Merge the total with the user dataframe on the user-id index.
df_users = df_users.merge(duration_of_activity,
                          how='inner',
                          left_index=True,
                          right_index=True)



In [154]:
# Add the per-session durations: for every user, the list of `sum_of_time`
# values of that user's rows in df_merged.

# Name the Series up front so the merged column is `duration_of_sessions`
# whether or not df_users already carries a `sum_of_time` column. Without the
# rename, a name clash makes pandas emit `sum_of_time_x` / `sum_of_time_y`,
# and nothing renames those afterwards.
duration_of_sessions = (
    df_merged
    .groupby('user_programs_user_id_x')['sum_of_time']
    .agg(list)
    .rename('duration_of_sessions')
)

# Merge the duration of sessions with the user dataframe on the user-id index.
df_users = df_users.merge(duration_of_sessions,
                          how='inner',
                          left_index=True,
                          right_index=True)


In [156]:
# Month number -> English month name.
months = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December',
}

# Store the month number of every session on df_merged (adds a column to it).
session_timestamps = df_merged['session_executions_updated_at']
df_merged['month_of_sessions'] = session_timestamps.dt.month

# Per user: the list of session months, translated from numbers to names.
month_of_sessions = (
    df_merged
    .groupby('user_programs_user_id_x')['month_of_sessions']
    .agg(list)
    .apply(lambda month_numbers: [months[number] for number in month_numbers])
)

# Attach the month lists to the user dataframe on the user-id index.
df_users = pd.merge(
    df_users,
    month_of_sessions,
    how='inner',
    left_index=True,
    right_index=True,
)

In [157]:
# Per-user difficulty and enjoyment feedback: for each metric, the mean over the
# user's sessions and the full list of per-session values.
feedback_by_user = df_merged.groupby('user_programs_user_id_x')

difficulty_by_user = feedback_by_user['session_executions_difficulty_feedback']
session_executions_difficulty_feedback = difficulty_by_user.mean()
session_executions_difficulty_feedback_list = difficulty_by_user.agg(list)

enjoyment_by_user = feedback_by_user['session_executions_enjoyment_feedback']
session_executions_enjoyment_feedback = enjoyment_by_user.mean()
session_executions_enjoyment_feedback_list = enjoyment_by_user.agg(list)

# Merge order matters: for each metric the mean is merged first and the list
# second. The mean lands under the plain column name; the list then clashes with
# it, so pandas suffixes the pair as `_x` (mean) and `_y` (list).
for per_user_feedback in (
    session_executions_difficulty_feedback,
    session_executions_difficulty_feedback_list,
    session_executions_enjoyment_feedback,
    session_executions_enjoyment_feedback_list,
):
    df_users = df_users.merge(
        per_user_feedback,
        how='inner',
        left_index=True,
        right_index=True,
    )

In [158]:
# Give the suffixed feedback columns explicit names:
# `_x` holds the per-user mean, `_y` the per-user list of values.
feedback_column_names = {
    'session_executions_difficulty_feedback_x': 'mean_difficulty_feedback',
    'session_executions_difficulty_feedback_y': 'difficulty_feedback',
    'session_executions_enjoyment_feedback_x': 'mean_enjoyment_feedback',
    'session_executions_enjoyment_feedback_y': 'enjoyment_feedback',
}
df_users = df_users.rename(columns=feedback_column_names)


In [172]:
# Elapsed time between each user's first and last executed session.
last_session = df_users['Last_training_date']
first_session = df_users['first_training_date']
df_users['duration_from_first_to_last_session'] = last_session - first_session

In [224]:
# Inspect the per-user span between first and last session.
df_users["duration_from_first_to_last_session"]

user_programs_user_id_x
108     302 days 16:17:04.973158
112      90 days 22:29:04.250765
144       3 days 00:57:14.939427
172      12 days 20:25:21.952077
182              0 days 00:00:00
                  ...           
18127            0 days 00:00:00
18147            0 days 00:00:00
18157            0 days 00:00:00
18165     1 days 04:43:31.902157
18174            0 days 00:00:00
Name: duration_from_first_to_last_session, Length: 3191, dtype: timedelta64[ns]

In [223]:
# Small test to corroborate that every user has at least one training session.
# NOTE: the check is disabled by wrapping it in a string literal. Because that
# bare string is the last expression of the cell, it is echoed back as the
# cell's output.

'''
df2 = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')
df2.drop(columns=['session_executions_updated_at', 'session_executions_updated_at', 'user_programs_user_id', 'date', 'session_executions_summary_total_kcal', 'session_executions_summary_effort', 'session_executions_summary_points', 'session_executions_summary_value_of_session'], inplace=True)

# count the zero values for a rows
df2['zero_values'] = df2.apply(lambda x: x.value_counts().get(0, 0), axis=1)
print(df2['zero_values'].value_counts().keys().sort_values(ascending=False))
print(df2.shape)
'''
# In the previous prints we can observe that all the rows have at least 5 values, which means that at least one training session was executed.

"\ndf2 = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')\ndf2.drop(columns=['session_executions_updated_at', 'session_executions_updated_at', 'user_programs_user_id', 'date', 'session_executions_summary_total_kcal', 'session_executions_summary_effort', 'session_executions_summary_points', 'session_executions_summary_value_of_session'], inplace=True)\n\n# count the zero values for a rows\ndf2['zero_values'] = df2.apply(lambda x: x.value_counts().get(0, 0), axis=1)\nprint(df2['zero_values'].value_counts().keys().sort_values(ascending=False))\nprint(df2.shape)\n"