In [1]:
import pandas as pd
import numpy as np
from utilities import Data_cleaning

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.express.colors import sample_colorscale
from IPython.display import clear_output

import math
import copy
%matplotlib inline

# Allow up to 500 rows to be rendered before pandas truncates a displayed object.
pd.options.display.max_rows = 500

In [99]:
# Load the demographic table stored under the 'data' key of the HDF5 file.
df_demographic = pd.read_hdf(
    '../data/flattened_database_merged_with_session_executions_v02.h5',
    key='data',
)
# Label the index so later index-based merges are self-describing.
df_demographic.index.names = ["session_execution_id"]
# Every NaN in the frame becomes 0.
# NOTE(review): this also turns missing weights/heights into 0 — confirm that
# 0 is an acceptable "missing" sentinel for the numeric user attributes.
df_demographic = df_demographic.replace(np.nan, 0)

In [16]:
# Main dataset, read from the 'data' key of the filtered HDF5 file.
df = pd.read_hdf(
    '../data/filtered_dataset_v1.h5',
    key='data',
)


In [25]:
# Show (rows, columns) of both frames side by side before merging them.
(df_demographic.shape, df.shape)

((51355, 10), (46024, 8677))

In [23]:
# BMI calculation: weight / (height / 100) ** 2.
# The division by 100 presumably converts centimetres to metres — TODO confirm units.
#
# Missing values were replaced by 0 when the table was loaded, so a height of 0
# can occur; dividing by it would produce inf. Zero heights are therefore masked
# to NaN first, which makes the BMI NaN for those rows instead of inf.
df_demographic['users_bmi'] = (
    df_demographic['users_weight']
    / (df_demographic['users_height'].replace(0, np.nan) / 100) ** 2
)

# Keep only the user-level attributes (plus the freshly computed BMI).
columns = ['user_programs_user_id',
           'users_date_of_birth',
           'users_activity_level',
           'users_gender',
           'users_body_type', 'users_weight', 'users_height', 'users_body_fat', 'users_goal', 'users_bmi']

df_demographic = df_demographic[columns]

In [89]:
# Attach the demographic columns to the main dataset.
# Both frames are joined on their index, and only index values present in both
# are kept (inner join). Columns that exist in both frames get pandas' default
# '_x' (left) / '_y' (right) suffixes.
df_merged = pd.merge(
    df,
    df_demographic,
    how='inner',
    left_index=True,
    right_index=True,
)

In [91]:
# One row per user: keep the first row seen for each user id, drop the rest.
is_repeated_user = df_merged.duplicated(subset=['user_programs_user_id_x'], keep='first')
df_users = df_merged[~is_repeated_user]

In [92]:
# User-level attributes to keep in the per-user table.
columns = ['user_programs_user_id_x',
           'users_date_of_birth',
           'users_activity_level',
           'users_gender',
           'users_body_type', 'users_weight', 'users_height', 'users_body_fat', 'users_goal', 'users_bmi']

# Build the per-user frame straight from df_merged (first row per user) instead
# of mutating df_users in place. The cell can then be re-executed safely: the
# previous version moved 'user_programs_user_id_x' into the index in place, so a
# second run failed with a KeyError when selecting `columns`.
# Reassign the index to user_programs_user_id_x.
df_users = (
    df_merged
    .drop_duplicates(subset=['user_programs_user_id_x'], keep='first')
    .loc[:, columns]
    .set_index('user_programs_user_id_x')
)
df_users

Unnamed: 0_level_0,users_date_of_birth,users_activity_level,users_gender,users_body_type,users_weight,users_height,users_body_fat,users_goal,users_bmi
user_programs_user_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
108,1984-08-29,2,False,0,68.000000,164.00,25.0,0,25.282570
112,1978-02-18,2,True,0,73.000000,178.00,20.0,0,23.040020
144,2020-11-30,1,True,0,60.000000,160.00,25.0,0,23.437500
172,1960-12-31,1,True,1,92.000000,179.00,45.0,0,28.713211
182,2000-01-01,1,True,0,60.000000,160.00,15.0,0,23.437500
...,...,...,...,...,...,...,...,...,...
18127,1975-06-07,1,False,0,58.000000,160.00,20.0,1,22.656250
18147,1980-09-20,2,False,0,55.000000,149.00,20.0,0,24.773659
18157,2001-06-21,2,True,2,98.000000,172.00,20.0,0,33.126014
18165,1985-12-13,1,False,0,66.000000,162.00,20.0,1,25.148605


In [93]:
# Count how many rows of df_merged belong to each user and store it as a new
# column; the counts are aligned to df_users through its index.
df_users = df_users.assign(
    number_of_sessions=df_merged['user_programs_user_id_x'].value_counts()
)

In [94]:
# Add the per-user total of 'sum_of_time'.
# NOTE(review): this is the sum over all of a user's sessions, not the elapsed
# time between the first and the last session — confirm which one is intended.

# Total 'sum_of_time' per user id.
duration_of_activity = (
    df_merged
    .groupby('user_programs_user_id_x')['sum_of_time']
    .agg('sum')
)

# Join the totals onto the user frame by index (inner join).
df_users = pd.merge(
    df_users,
    duration_of_activity,
    how='inner',
    left_index=True,
    right_index=True,
)


In [95]:
# Add, for every user, the list of individual 'sum_of_time' values.

# Collect each user's 'sum_of_time' values into a Python list.
duration_of_sessions = (
    df_merged
    .groupby('user_programs_user_id_x')['sum_of_time']
    .agg(list)
)

# Join the lists onto the user frame by index (inner join). Because df_users
# already carries a 'sum_of_time' column at this point, the merge suffixes the
# two columns with '_x' / '_y'.
df_users = pd.merge(
    df_users,
    duration_of_sessions,
    how='inner',
    left_index=True,
    right_index=True,
)

# Give the two suffixed columns descriptive names.
df_users = df_users.rename(
    columns={'sum_of_time_x': 'duration_of_activity', 'sum_of_time_y': 'duration_of_sessions'}
)
