In [1]:
import pandas as pd
import numpy as np
from utilities import Data_cleaning

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.express.colors import sample_colorscale
from IPython.display import clear_output

import math
import copy
%matplotlib inline

# Raise pandas' display.max_rows option to 500 for the rest of the notebook.
pd.set_option('display.max_rows', 500)

In [126]:
# Load the flattened table that was merged with session executions, label its
# index, and zero-fill every missing value in place.
df_demographic = pd.read_hdf(
    '../data/flattened_database_merged_with_session_executions_v02.h5',
    key='data',
)
df_demographic.index.name = "session_execution_id"
df_demographic.replace(to_replace=np.nan, value=0, inplace=True)

# Load the filtered main dataset.
df = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')

# Cell output: the shapes of both frames.
(df_demographic.shape, df.shape)

((51355, 9186), (46024, 8677))

In [127]:
# Build the per-session demographic subset, including a derived BMI column.
#
# Only the listed columns are taken from df_demographic; the wide source frame
# is neither copied in full nor modified.
df_demographic_columns = ['user_programs_user_id',
                            'users_created_at',
                            'users_date_of_birth',
                            'users_activity_level',
                            'users_gender',
                            'users_body_type',
                            'users_weight',
                            'users_height',
                            'users_body_fat',
                            'users_goal',
                            'users_bmi',
                            'session_executions_difficulty_feedback',
                            'session_executions_enjoyment_feedback']

# bmi calculation: weight / (height / 100) ** 2
# (presumably weight in kg and height in cm -- confirm units).
# NaNs were replaced by 0 when df_demographic was loaded, so a height of 0 is
# treated as missing here; otherwise the division would produce inf.
height_scaled = (df_demographic['users_height'] / 100).replace(0, np.nan)
users_bmi = df_demographic['users_weight'] / height_scaled ** 2

# users_bmi is derived, so every other listed column is read from the source.
source_columns = [col for col in df_demographic_columns if col != 'users_bmi']

df_demographic_filtered = (
    df_demographic[source_columns]
    .assign(users_bmi=users_bmi)
    .loc[:, df_demographic_columns]  # restore the column order listed above
)

In [128]:
# Inner-join the main dataset with the demographic subset on their indexes.
# Column names present in both frames receive pandas' default _x / _y
# suffixes (left / right respectively).
df_merged = pd.merge(
    df,
    df_demographic_filtered,
    how='inner',
    left_index=True,
    right_index=True,
)

In [129]:
# Keep one row per user: the last row found for each user_programs_user_id_x.
has_later_row = df_merged.duplicated(subset=['user_programs_user_id_x'], keep='last')
df_users = df_merged[~has_later_row]

In [130]:
# Columns kept in the per-user frame.
user_columns = [
    'user_programs_user_id_x',
    'users_created_at',
    'session_executions_updated_at',
    'users_date_of_birth',
    'users_activity_level',
    'users_gender',
    'users_body_type',
    'users_weight',
    'users_height',
    'users_body_fat',
    'users_goal',
    'users_bmi',
]

# Restrict to those columns and reassign the index to user_programs_user_id_x.
df_users = df_users[user_columns].set_index('user_programs_user_id_x')
df_users

Unnamed: 0_level_0,users_created_at,session_executions_updated_at,users_date_of_birth,users_activity_level,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi
user_programs_user_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
108,2020-11-21 11:02:22.366198,2022-04-10 10:17:40.613564,1984-08-29,2,False,0,68.000000,164.00,25.0,0,25.282570
112,2020-11-22 22:24:45.117192,2022-01-07 18:08:55.844447,1978-02-18,2,True,0,73.000000,178.00,20.0,0,23.040020
144,2020-11-30 15:05:58.467008,2020-12-10 12:02:56.629641,2020-11-30,1,True,0,60.000000,160.00,25.0,0,23.437500
172,2020-12-07 16:42:33.327819,2021-11-26 06:37:04.309295,1960-12-31,1,True,1,92.000000,179.00,45.0,0,28.713211
182,2020-12-11 11:34:00.850343,2020-12-11 14:17:15.877294,2000-01-01,1,True,0,60.000000,160.00,15.0,0,23.437500
...,...,...,...,...,...,...,...,...,...,...,...
18127,2022-05-24 13:08:58.940710,2022-05-25 04:40:12.028136,1975-06-07,1,False,0,58.000000,160.00,20.0,1,22.656250
18147,2022-05-25 07:12:53.616277,2022-05-25 16:15:23.597621,1980-09-20,2,False,0,55.000000,149.00,20.0,0,24.773659
18157,2022-05-25 16:36:21.636963,2022-05-25 20:36:55.438881,2001-06-21,2,True,2,98.000000,172.00,20.0,0,33.126014
18165,2022-05-25 19:29:50.757574,2022-05-27 00:56:28.806426,1985-12-13,1,False,0,66.000000,162.00,20.0,1,25.148605


In [131]:
# Count the rows of df_merged per user and attach the counts to the user
# dataframe; the counts are aligned to df_users through its user-id index.
sessions_per_user = df_merged['user_programs_user_id_x'].value_counts()
df_users = df_users.assign(number_of_sessions=sessions_per_user)

In [132]:
# Add the per-user "duration of activity".
# NOTE(review): this is the sum of `sum_of_time` over all of a user's sessions,
# not the elapsed time between the first and the last session -- confirm which
# of the two is intended.

# Total of sum_of_time per user
duration_of_activity = (
    df_merged['sum_of_time']
    .groupby(df_merged['user_programs_user_id_x'])
    .sum()
)

# Attach it to the user dataframe by index (the new column is named sum_of_time)
df_users = pd.merge(
    df_users,
    duration_of_activity,
    how='inner',
    left_index=True,
    right_index=True,
)


In [133]:
# Add the list of individual session durations for each user.
#
# The columns are named explicitly instead of relying on the `_x` / `_y`
# suffixes pandas adds when two merged columns share a name.

# Per-user list of sum_of_time values, one entry per row of df_merged
duration_of_sessions = (
    df_merged.groupby('user_programs_user_id_x')['sum_of_time']
    .agg(list)
    .rename('duration_of_sessions')
)

df_users = (
    df_users
    # the per-user total merged earlier is still called sum_of_time
    .rename(columns={'sum_of_time': 'duration_of_activity'})
    # drop a previous copy of the list column so the cell can be re-run
    .drop(columns=['duration_of_sessions'], errors='ignore')
    .merge(duration_of_sessions,
           how='inner',
           left_index=True,
           right_index=True)
)


In [134]:
# Display df_users to inspect the columns added in the preceding cells.
df_users

Unnamed: 0_level_0,users_created_at,session_executions_updated_at,users_date_of_birth,users_activity_level,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi,number_of_sessions,duration_of_activity,duration_of_sessions
user_programs_user_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
108,2020-11-21 11:02:22.366198,2022-04-10 10:17:40.613564,1984-08-29,2,False,0,68.000000,164.00,25.0,0,25.282570,8,16234.0,"[1016.0, 1535.0, 3754.0, 942.0, 3816.0, 1754.0..."
112,2020-11-22 22:24:45.117192,2022-01-07 18:08:55.844447,1978-02-18,2,True,0,73.000000,178.00,20.0,0,23.040020,2,492.0,"[56.0, 436.0]"
144,2020-11-30 15:05:58.467008,2020-12-10 12:02:56.629641,2020-11-30,1,True,0,60.000000,160.00,25.0,0,23.437500,2,40.0,"[9.0, 31.0]"
172,2020-12-07 16:42:33.327819,2021-11-26 06:37:04.309295,1960-12-31,1,True,1,92.000000,179.00,45.0,0,28.713211,5,5757.0,"[1213.0, 1562.0, 618.0, 1471.0, 893.0]"
182,2020-12-11 11:34:00.850343,2020-12-11 14:17:15.877294,2000-01-01,1,True,0,60.000000,160.00,15.0,0,23.437500,1,200.0,[200.0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18127,2022-05-24 13:08:58.940710,2022-05-25 04:40:12.028136,1975-06-07,1,False,0,58.000000,160.00,20.0,1,22.656250,1,1310.0,[1310.0]
18147,2022-05-25 07:12:53.616277,2022-05-25 16:15:23.597621,1980-09-20,2,False,0,55.000000,149.00,20.0,0,24.773659,1,400.0,[400.0]
18157,2022-05-25 16:36:21.636963,2022-05-25 20:36:55.438881,2001-06-21,2,True,2,98.000000,172.00,20.0,0,33.126014,1,739.0,[739.0]
18165,2022-05-25 19:29:50.757574,2022-05-27 00:56:28.806426,1985-12-13,1,False,0,66.000000,162.00,20.0,1,25.148605,3,4701.0,"[308.0, 2206.0, 2187.0]"


In [135]:
# get the month number of each session from its updated-at timestamp
df_merged['month_of_sessions'] = df_merged['session_executions_updated_at'].dt.month

# month number -> month name
months = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

# Translate the numbers to names in one vectorised step. Series.map yields NaN
# for a value that is not a key of `months` (e.g. a missing timestamp) rather
# than raising KeyError.
month_names = df_merged['month_of_sessions'].map(months)

# get the list of the month names of the sessions for each user
month_of_sessions = month_names.groupby(df_merged['user_programs_user_id_x']).agg(list)

# merge the month of sessions with the user dataframe; a previous copy of the
# column is dropped first so that re-running the cell does not create
# month_of_sessions_x / month_of_sessions_y
df_users = (
    df_users
    .drop(columns=['month_of_sessions'], errors='ignore')
    .merge(month_of_sessions,
           how='inner',
           left_index=True,
           right_index=True)
)

In [136]:
# session_executions_difficulty_feedback and session_executions_enjoyment_feedback
#
# For each user, add the mean and the full list of both feedback columns.
# NOTE(review): NaNs in df_demographic were replaced by 0 when it was loaded,
# so sessions without feedback presumably enter these means as 0 -- confirm
# that this is intended.

# group each feedback column by user once and reuse the grouping
difficulty_by_user = df_merged.groupby('user_programs_user_id_x')['session_executions_difficulty_feedback']
enjoyment_by_user = df_merged.groupby('user_programs_user_id_x')['session_executions_enjoyment_feedback']

# mean and list of the session_executions_difficulty_feedback
session_executions_difficulty_feedback = difficulty_by_user.mean()
session_executions_difficulty_feedback_list = difficulty_by_user.agg(list)

# mean and list of the session_executions_enjoyment_feedback
session_executions_enjoyment_feedback = enjoyment_by_user.mean()
session_executions_enjoyment_feedback_list = enjoyment_by_user.agg(list)

# Merge the four aggregates into the user dataframe under explicit names.
# `_x` marks the mean and `_y` the list of each feedback column.
feedback_columns = {
    'session_executions_difficulty_feedback_x': session_executions_difficulty_feedback,
    'session_executions_difficulty_feedback_y': session_executions_difficulty_feedback_list,
    'session_executions_enjoyment_feedback_x': session_executions_enjoyment_feedback,
    'session_executions_enjoyment_feedback_y': session_executions_enjoyment_feedback_list,
}

for column_name, per_user_values in feedback_columns.items():
    df_users = df_users.merge(per_user_values.rename(column_name),
                              how='inner',
                              left_index=True,
                              right_index=True)

In [140]:
# Give the suffixed feedback columns descriptive names:
# `_x` columns hold the per-user mean, `_y` columns the per-user list.
feedback_column_names = {
    'session_executions_difficulty_feedback_x': 'mean_difficulty_feedback',
    'session_executions_difficulty_feedback_y': 'difficulty_feedback',
    'session_executions_enjoyment_feedback_x': 'mean_enjoyment_feedback',
    'session_executions_enjoyment_feedback_y': 'enjoyment_feedback',
}
df_users = df_users.rename(columns=feedback_column_names)


In [146]:
# Display df_users with the month and feedback columns added above.
df_users

Unnamed: 0_level_0,users_created_at,session_executions_updated_at,users_date_of_birth,users_activity_level,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi,number_of_sessions,duration_of_activity,duration_of_sessions,month_of_sessions,mean_difficulty_feedback,difficulty_feedback,mean_enjoyment_feedback,enjoyment_feedback
user_programs_user_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
108,2020-11-21 11:02:22.366198,2022-04-10 10:17:40.613564,1984-08-29,2,False,0,68.000000,164.00,25.0,0,25.282570,8,16234.0,"[1016.0, 1535.0, 3754.0, 942.0, 3816.0, 1754.0...","[June, October, December, December, January, J...",4.25,"[2.0, 3.0, 7.0, 2.0, 5.0, 5.0, 5.0, 5.0]",3.0,"[3.0, 3.0, 4.0, 2.0, 3.0, 3.0, 3.0, 3.0]"
112,2020-11-22 22:24:45.117192,2022-01-07 18:08:55.844447,1978-02-18,2,True,0,73.000000,178.00,20.0,0,23.040020,2,492.0,"[56.0, 436.0]","[October, January]",5.00,"[4.0, 6.0]",3.0,"[2.0, 4.0]"
144,2020-11-30 15:05:58.467008,2020-12-10 12:02:56.629641,2020-11-30,1,True,0,60.000000,160.00,25.0,0,23.437500,2,40.0,"[9.0, 31.0]","[December, December]",0.00,"[0.0, 0.0]",0.0,"[0.0, 0.0]"
172,2020-12-07 16:42:33.327819,2021-11-26 06:37:04.309295,1960-12-31,1,True,1,92.000000,179.00,45.0,0,28.713211,5,5757.0,"[1213.0, 1562.0, 618.0, 1471.0, 893.0]","[November, November, November, November, Novem...",5.20,"[5.0, 6.0, 6.0, 4.0, 5.0]",2.8,"[2.0, 3.0, 4.0, 3.0, 2.0]"
182,2020-12-11 11:34:00.850343,2020-12-11 14:17:15.877294,2000-01-01,1,True,0,60.000000,160.00,15.0,0,23.437500,1,200.0,[200.0],[December],0.00,[0.0],0.0,[0.0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18127,2022-05-24 13:08:58.940710,2022-05-25 04:40:12.028136,1975-06-07,1,False,0,58.000000,160.00,20.0,1,22.656250,1,1310.0,[1310.0],[May],6.00,[6.0],5.0,[5.0]
18147,2022-05-25 07:12:53.616277,2022-05-25 16:15:23.597621,1980-09-20,2,False,0,55.000000,149.00,20.0,0,24.773659,1,400.0,[400.0],[May],3.00,[3.0],3.0,[3.0]
18157,2022-05-25 16:36:21.636963,2022-05-25 20:36:55.438881,2001-06-21,2,True,2,98.000000,172.00,20.0,0,33.126014,1,739.0,[739.0],[May],5.00,[5.0],3.0,[3.0]
18165,2022-05-25 19:29:50.757574,2022-05-27 00:56:28.806426,1985-12-13,1,False,0,66.000000,162.00,20.0,1,25.148605,3,4701.0,"[308.0, 2206.0, 2187.0]","[May, May, May]",5.00,"[5.0, 5.0, 5.0]",3.0,"[3.0, 3.0, 3.0]"
