In [1]:
import pandas as pd
import numpy as np
from utilities import Data_cleaning

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.express.colors import sample_colorscale
from IPython.display import clear_output

import math
import copy
%matplotlib inline

# Show up to 500 rows when a DataFrame/Series is displayed, so longer listings are not truncated.
pd.set_option('display.max_rows', 500)

In [2]:
# Load the flattened session-execution table; the user/demographic columns are taken from it later.
df_demographic = pd.read_hdf('../data/flattened_database_merged_with_session_executions_v03_(andrea).h5',  key='data')
# Label the index.
# NOTE(review): presumably the index already holds session-execution ids — confirm against the HDF file.
df_demographic.index.names = ["session_execution_id"]
# Replace every NaN in the frame with 0, in place.
# NOTE(review): this also zero-fills measurement columns, so a 0 there may mean "missing" rather than
# a real value — keep that in mind before using such columns in ratios.
df_demographic.replace(np.nan, 0, inplace=True)

# Load the filtered session dataset that is used as the main table.
df = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')
# Display both shapes as the cell output.
df_demographic.shape, df.shape

((51355, 9190), (46024, 8677))

In [3]:
# Build the demographic frame: keep only the columns needed downstream and add the BMI.
df_demographic_columns = ['user_programs_user_id',
                            'users_created_at',
                            'users_date_of_birth',
                            'users_activity_level',
                            'users_gender',
                            'users_body_type',
                            'users_weight',
                            'users_height',
                            'users_body_fat',
                            'users_goal',
                            'users_bmi',
                            'session_executions_difficulty_feedback',
                            'session_executions_enjoyment_feedback',
                            'programs_pro',
                            'programs_name_en',
                            'programs_description_en',
                            'users_last_sign_in_at',
                            'user_programs_active',
                            'user_programs_completed',
                            'users_newsletter_subscription']

# Select the raw columns first (everything except the derived 'users_bmi') so that only the
# handful of needed columns is copied instead of the full-width table.
df_demographic_filtered = df_demographic[
    [column for column in df_demographic_columns if column != 'users_bmi']
].copy()

# BMI = weight / (height / 100) ** 2.
# NaNs were replaced by 0 when df_demographic was loaded, so a weight or height of 0 is treated
# as missing here: masking the zeros yields NaN instead of inf (x / 0) or a BMI of 0.
# The final selection restores the column order given by df_demographic_columns.
df_demographic_filtered = df_demographic_filtered.assign(
    users_bmi=lambda frame: (
        frame['users_weight'].replace(0, np.nan)
        / (frame['users_height'].replace(0, np.nan) / 100) ** 2
    )
)[df_demographic_columns]

In [4]:
# Attach the demographic columns to the main dataset. Both frames are joined on their index and
# only index values present in both are kept (inner join).
df_merged = (
    df.merge(df_demographic_filtered, how='inner', left_index=True, right_index=True)
    # restore the plain name for the '_x'-suffixed user-id column
    .rename(columns={'user_programs_user_id_x': 'user_programs_user_id'})
)

In [5]:
# One row per user: keep the last occurrence of every user id, in df_merged's row order.
is_earlier_user_row = df_merged.duplicated(subset=['user_programs_user_id'], keep='last')
df_users = df_merged[~is_earlier_user_row]

In [6]:
# Keep the demographic columns plus the session timestamp of the retained row.
# The guard keeps the cell re-runnable: an unconditional append would add the column name again
# on every execution and produce duplicated columns in the selection below.
if 'session_executions_updated_at' not in df_demographic_columns:
    df_demographic_columns.append('session_executions_updated_at')

# Select the columns, index the frame by user id and rename the timestamp of the retained
# (last) row. Non-inplace calls avoid mutating a frame that was derived from df_merged.
df_users = (
    df_users[df_demographic_columns]
    .set_index('user_programs_user_id')
    .rename(columns={'session_executions_updated_at': 'Last_training_date'})
)

In [7]:
# First row of every user (in df_merged's row order), reduced to the session timestamp and
# indexed by user id.
first_training_date = (
    df_merged
    .drop_duplicates(subset=['user_programs_user_id'], keep='first')
    .loc[:, ['session_executions_updated_at', 'user_programs_user_id']]
    .set_index('user_programs_user_id')
)

# Add that timestamp to the user frame under its final name.
df_users = (
    df_users
    .merge(first_training_date, how='inner', left_index=True, right_index=True)
    .rename(columns={'session_executions_updated_at': 'first_training_date'})
)

In [8]:
# Number of session rows per user; the assignment aligns on the user-id index of df_users.
sessions_per_user = df_merged.groupby('user_programs_user_id').size()
df_users['number_of_sessions'] = sessions_per_user

In [9]:
# Total activity time per user: the sum of 'sum_of_time' over all of the user's sessions.
duration_of_activity = df_merged.groupby('user_programs_user_id')['sum_of_time'].sum()

# Join the totals onto the user frame by user id and give the column its final name.
df_users = (
    df_users
    .merge(duration_of_activity, how='inner', left_index=True, right_index=True)
    .rename(columns={'sum_of_time': 'duration_of_activity'})
)


In [10]:
# Per-user list with the 'sum_of_time' value of every session.
duration_of_sessions = df_merged.groupby('user_programs_user_id')['sum_of_time'].agg(list)

# Join the lists onto the user frame by user id and give the column its final name.
df_users = (
    df_users
    .merge(duration_of_sessions, how='inner', left_index=True, right_index=True)
    .rename(columns={'sum_of_time': 'duration_of_activity_list'})
)


In [11]:
# Calendar month number of every session, stored on the session-level frame.
df_merged['month_of_sessions'] = df_merged['session_executions_updated_at'].dt.month

# Lookup table from month number (1-12) to English month name.
months = dict(enumerate(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    start=1,
))

# One list of month names per user, following the row order of that user's sessions.
month_of_sessions = (
    df_merged
    .groupby('user_programs_user_id')['month_of_sessions']
    .agg(list)
    .apply(lambda month_numbers: [months[number] for number in month_numbers])
)

# Join the lists onto the user frame by user id.
df_users = df_users.merge(month_of_sessions, how='inner', left_index=True, right_index=True)

In [12]:
# Per-user aggregates of the session difficulty / enjoyment feedback.
#
# df_users still carries the per-session columns 'session_executions_difficulty_feedback' and
# 'session_executions_enjoyment_feedback' (the value of the single row kept per user). Merging
# an aggregate with the same name makes pandas suffix the existing column with '_x' and the
# aggregate with '_y', so a "_x -> mean, _y -> list" rename would label the single-session value
# as the mean and the mean as the list. To keep the labels unambiguous, the single-session
# columns are dropped and every aggregate is named explicitly before it is merged.
# Previously created output columns are dropped as well so the cell can be re-run.
df_users = df_users.drop(
    columns=['session_executions_difficulty_feedback',
             'session_executions_enjoyment_feedback',
             'mean_difficulty_feedback',
             'difficulty_feedback',
             'mean_enjoyment_feedback',
             'enjoyment_feedback'],
    errors='ignore',
)

feedback_by_user = df_merged.groupby('user_programs_user_id')

# mean of the difficulty feedback per user
session_executions_difficulty_feedback = (
    feedback_by_user['session_executions_difficulty_feedback']
    .mean()
    .rename('mean_difficulty_feedback')
)

# list of the difficulty feedback of every session per user
session_executions_difficulty_feedback_list = (
    feedback_by_user['session_executions_difficulty_feedback']
    .agg(list)
    .rename('difficulty_feedback')
)

# mean of the enjoyment feedback per user
session_executions_enjoyment_feedback = (
    feedback_by_user['session_executions_enjoyment_feedback']
    .mean()
    .rename('mean_enjoyment_feedback')
)

# list of the enjoyment feedback of every session per user
session_executions_enjoyment_feedback_list = (
    feedback_by_user['session_executions_enjoyment_feedback']
    .agg(list)
    .rename('enjoyment_feedback')
)

# merge the four aggregates with the user dataframe by user id
for feedback_aggregate in (session_executions_difficulty_feedback,
                           session_executions_difficulty_feedback_list,
                           session_executions_enjoyment_feedback,
                           session_executions_enjoyment_feedback_list):
    df_users = df_users.merge(feedback_aggregate,
                              how='inner',
                              left_index=True,
                              right_index=True)

In [13]:
# Give the suffixed feedback columns their final names. Keys that are not present among the
# columns of df_users are ignored by rename.
# NOTE(review): which column carries '_x' and which '_y' depends on the columns df_users already
# held when the feedback aggregates were merged — verify that each label matches its content.
feedback_column_names = {
    'session_executions_difficulty_feedback_x': 'mean_difficulty_feedback',
    'session_executions_difficulty_feedback_y': 'difficulty_feedback',
    'session_executions_enjoyment_feedback_x': 'mean_enjoyment_feedback',
    'session_executions_enjoyment_feedback_y': 'enjoyment_feedback',
}
df_users = df_users.rename(columns=feedback_column_names)


In [14]:
# Time elapsed between the first and the last executed session of every user
df_users['duration_from_first_to_last_session'] = (
    df_users['Last_training_date'].sub(df_users['first_training_date'])
)

In [15]:
# Small sanity check (disabled: the code is kept inside a string literal) to corroborate that all the users have at least one training session

'''
df2 = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')
df2.drop(columns=['session_executions_updated_at', 'session_executions_updated_at', 'user_programs_user_id', 'date', 'session_executions_summary_total_kcal', 'session_executions_summary_effort', 'session_executions_summary_points', 'session_executions_summary_value_of_session'], inplace=True)

# count the zero values for a rows
df2['zero_values'] = df2.apply(lambda x: x.value_counts().get(0, 0), axis=1)
print(df2['zero_values'].value_counts().keys().sort_values(ascending=False))
print(df2.shape)
'''
# In the prints produced by the check above we could observe that all the rows have at least 5 values, which means that at least one training session was executed.

"\ndf2 = pd.read_hdf('../data/filtered_dataset_v1.h5', key='data')\ndf2.drop(columns=['session_executions_updated_at', 'session_executions_updated_at', 'user_programs_user_id', 'date', 'session_executions_summary_total_kcal', 'session_executions_summary_effort', 'session_executions_summary_points', 'session_executions_summary_value_of_session'], inplace=True)\n\n# count the zero values for a rows\ndf2['zero_values'] = df2.apply(lambda x: x.value_counts().get(0, 0), axis=1)\nprint(df2['zero_values'].value_counts().keys().sort_values(ascending=False))\nprint(df2.shape)\n"

In [16]:
# Active time of every session: total time minus the rest time
df_merged['sum_of_time_excluding_rest_time'] = (
    df_merged['sum_of_time'].sub(df_merged['Rest_sum_of_time_per_exercise'])
)

active_time_by_user = df_merged.groupby('user_programs_user_id')['sum_of_time_excluding_rest_time']

# Per-user mean of the active session time
sum_of_time_excluding_rest_time = active_time_by_user.mean()

# Per-user list of the active session times
sum_of_time_excluding_rest_time_list = active_time_by_user.agg(list)

# The mean is merged first and the list second. Both series share one column name, so the
# second merge suffixes the mean with '_x' and the list with '_y'; the rename relies on that.
merge_on_user_id = dict(how='inner', left_index=True, right_index=True)
df_users = (
    df_users
    .merge(sum_of_time_excluding_rest_time, **merge_on_user_id)
    .merge(sum_of_time_excluding_rest_time_list, **merge_on_user_id)
    .rename(columns={'sum_of_time_excluding_rest_time_x': 'mean_sum_of_time_excluding_rest_time',
                     'sum_of_time_excluding_rest_time_y': 'sum_of_time_excluding_rest_time_list'})
)


In [17]:
from datetime import datetime

# Per-user list of session timestamps, then the same data as a two-column frame
# ('user_programs_user_id', 'session_executions_updated_at').
dates_by_user = (
    df_merged
    .groupby('user_programs_user_id')['session_executions_updated_at']
    .agg(list)
)
df_dates_by_user = dates_by_user.reset_index()

def get_weeks_month_and_time_range(dates):
    """Summarise, per user, when their sessions took place.

    Parameters
    ----------
    dates : pandas.DataFrame
        One row per user with the columns ``'user_programs_user_id'`` and
        ``'session_executions_updated_at'``; the latter holds the list of that user's
        session timestamps.

    Returns
    -------
    tuple of three pandas.Series, each keyed by user id
        * dict ``{'week_<k>': count}``: sessions per 7-day block counted from the user's
          earliest session,
        * dict ``{'month_<k>': count}``: sessions per 30-day block counted from the user's
          earliest session,
        * list with one label per session (``'Morning'``, ``'Afternoon'`` or ``'Night'``),
          in the order of the input list.
    """
    # Boundaries of the day parts, parsed once instead of on every iteration.
    morning_start = datetime.strptime('05:30', '%H:%M').time()
    morning_end = datetime.strptime('12:30', '%H:%M').time()
    afternoon_end = datetime.strptime('20:00', '%H:%M').time()

    all_weeks = {}
    all_months = {}
    all_time_ranges = {}

    for user_id, session_dates in zip(dates['user_programs_user_id'],
                                      dates['session_executions_updated_at']):
        week_counts = {}
        month_counts = {}
        time_ranges = []

        # A user without sessions gets empty summaries instead of raising on the lookup below.
        if len(session_dates) > 0:
            # Offsets are measured from the earliest session, so a list that is not in
            # chronological order cannot produce negative week/month numbers.
            start_date = min(session_dates)

            for date in session_dates:
                elapsed_days = (date - start_date).days
                week_key = f'week_{elapsed_days // 7}'
                month_key = f'month_{elapsed_days // 30}'

                # count the number of sessions per week and per month
                week_counts[week_key] = week_counts.get(week_key, 0) + 1
                month_counts[month_key] = month_counts.get(month_key, 0) + 1

                # Day part of the session. The afternoon starts right after 12:30:00, so
                # times such as 12:30:30 are no longer classified as 'Night'.
                session_time = date.time()
                if morning_start <= session_time <= morning_end:
                    time_ranges.append('Morning')
                elif morning_end < session_time <= afternoon_end:
                    time_ranges.append('Afternoon')
                else:
                    time_ranges.append('Night')

        all_weeks[user_id] = week_counts
        all_months[user_id] = month_counts
        all_time_ranges[user_id] = time_ranges

    return pd.Series(all_weeks), pd.Series(all_months), pd.Series(all_time_ranges)

# Per-user week counts, month counts and day-part labels of the sessions
weeks_frequency, months_frequency, time_ranges_series = get_weeks_month_and_time_range(df_dates_by_user)

# Name every series; the name becomes the column name in the user dataframe
weeks_frequency = weeks_frequency.rename('weeks_frequency')
months_frequency = months_frequency.rename('months_frequency')
time_ranges_series = time_ranges_series.rename('time_ranges')

# Merge the three series with the user dataframe by user id, one after the other
for session_summary in (weeks_frequency, months_frequency, time_ranges_series):
    df_users = df_users.merge(session_summary,
                              how='inner',
                              left_index=True,
                              right_index=True)


In [18]:
# Approximate age in whole years: days elapsed between the date of birth and the moment the
# cell runs, integer-divided by 365 (leap days are not accounted for).
days_since_birth = (datetime.now() - df_users['users_date_of_birth']).dt.days
df_users['users_age'] = days_since_birth // 365

In [19]:

# Write the per-user dataframe (indexed by user id) to disk as CSV
df_users.to_csv(path_or_buf='../data/user_dataframe.csv')

In [25]:
# Read the motivational questionnaire data and keep a single row per id
# (the last occurrence in file order).
df_motivational = (
    pd.read_csv('../data/cuestionario_motivacional_after_filtering.csv')
    .drop_duplicates(subset=['id'], keep='last')
)

In [26]:
# Keep only the questionnaire rows whose id appears in the index of the user dataframe
has_user_record = df_motivational['id'].isin(df_users.index)
df_motivational = df_motivational.loc[has_user_record]

In [27]:
# Columns kept from the questionnaire: the id plus the 16 statement columns.
# The names must match the CSV header exactly, including the trailing space in two of them.
motivational_columns = [
    'id',
    'Cuando hago una sesión de alta intensidad y acabo cansado, esto me anima a seguir realizando sesiones posteriores',
    'El programa de ejercicio físico me ayuda a mejorar mi aspecto físico',
    'Disfruto alcanzando los retos y metas que me propone el programa de ejercicio físico ',
    'El programa de ejercicio físico me permite prevenir futuras lesiones',
    'Disfruto cada momento de mi tiempo dedicado al ejercicio físico ',
    'No soy capaz de imaginar mi vida sin practicar deporte',
    'El programa de ejercicio físico me permite mantener un peso adecuado',
    'El programa de ejercicio físico me proporciona una mejor imagen hacia los demás',
    'El programa de ejercicio físico me permite mejorar mi salud',
    'La actividad deportiva me permitirá progresar en otros ámbitos de mi vida',
    'El programa de ejercicio físico evitará que tenga problemas de salud en el futuro',
    'El deporte me ayuda a desconectar de los problemas diarios',
    'Con el programa de ejercicio físico estoy mejorando mis marcas personales',
    'El programa de ejercicio físico me permitirá mejorar mi vida social',
    'Compito mucho mejor siguiendo el programa de ejercicio físico',
    'Siento una gran satisfacción al comprobar que he integrado un programa de actividad física en mi vida',
]
df_motivational = df_motivational[motivational_columns]

In [31]:
# Save the matched motivational questionnaire data as CSV
df_motivational.to_csv(path_or_buf='../data/motivational_cuestionary_users_dataframe_matched.csv')

In [30]:
# Display (rows, columns) of the questionnaire frame as the cell output.
df_motivational.shape

(325, 17)