In [1]:
# libraries

import pandas as pd
import numpy as np
import bz2
pd.set_option('display.max_columns', None)
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pingouin as pg
import statsmodels.api as sm
import matplotlib as mpl
from scipy import stats
from distfit import distfit
import scikit_posthocs as sp
import datetime as dt

# Render floats with two decimals in every DataFrame / Series repr.
pd.set_option('display.float_format', '{:.2f}'.format)

# Install a catch-all "ignore" filter: no warning of any category is shown
# from this point on.
import warnings
warnings.simplefilter('ignore')

# Default colormap for image-like matplotlib artists.
plt.rcParams.update({"image.cmap": "Pastel2"})


# planting seed

import random

# random.seed() only affects Python's own generator; NumPy keeps a separate
# global generator, so it is seeded with the same value to make NumPy-based
# random draws repeatable as well.
random.seed(10)
np.random.seed(10)

# importing data

df = pd.read_csv(
    "/home/evida-monika/mhunters/final_data_5.csv.bz2",
    sep=",",
    compression="bz2",
)

# The file is the already-cleaned dataset. One more user is removed here:
# id_users == 235 logged 274 sessions in 4 weeks, which is not plausible.

df = df.loc[df['id_users'] != 235]

# changing types of data

cols = [
    'name_en_exercises', 'discarded_session_execution', 'code_name_sessions',
    'name_en_sessions', 'active_user_programs', 'completed_user_programs',
    'pro_programs', 'available_programs', 'name_en_programs',
    'gender',
    'activity_level', 'goal', 'body_type', 'newsletter_subscription',
    'notifications_setting', 'language', 'scientific_data_usage',
    'BMI_category', 'name_en_implements', 'cluster',
]

# Cast every listed column to the pandas "category" dtype in one call.
df = df.astype({name: 'category' for name in cols})

col_date = [
    'updated_at_ex_ex', 'created_at_exercises', 'updated_at_exercises',
    'updated_at_session_execution', 'created_at_user_programs',
    'updated_at_user_programs', 'created_at_programs', 'updated_at_programs',
    'created_at_users', 'updated_at_users', 'date_of_birth',
]

# Parse every timestamp / date column into datetime64 values.
df = df.assign(**{name: pd.to_datetime(df[name]) for name in col_date})

# Numeric code -> readable label, per column.
cat_names = {
    'gender': {1: 'male', 0: 'female'},
    'activity_level': {0: 'very active', 1: 'active', 2: 'sedentary'},
    'goal': {0: 'lose', 1: 'gain', 2: 'antiaging'},
    'body_type': {0: 'thin', 1: 'mid', 2: 'strong'}
}

df = df.replace(to_replace=cat_names)


# Correlation plots and table definition

def corr_heatmap_p(df):
    """Plot Spearman correlations of ``df`` and return them flagged by significance.

    Draws an annotated heatmap of the Spearman correlation matrix, then a
    lower-triangle pair grid (histograms on the diagonal, scatter plots below)
    whose panel titles show the pairwise rho, and finally calls ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to correlate. The frame is passed unchanged to
        ``scipy.stats.spearmanr`` and ``DataFrame.corr``; it presumably holds
        numeric columns only (verify against callers).

    Returns
    -------
    pandas.DataFrame
        Correlation matrix indexed and labelled by ``df.columns``. Off-diagonal
        cells with p <= 0.05 / p <= 0.01 / p <= 0.001 become strings: rho
        rounded to two decimals followed by ``*`` / ``**`` / ``***``. All other
        cells keep their unrounded float value.
    """
    rho, p = stats.spearmanr(df)

    # With exactly two columns spearmanr returns two scalars instead of two
    # matrices; expand them to the 2x2 form so the labelling below works for
    # any number of columns (the diagonal is never starred, so its p is moot).
    if np.ndim(rho) == 0:
        rho = np.array([[1.0, rho], [rho, 1.0]])
        p = np.array([[0.0, p], [p, 0.0]])

    rho = pd.DataFrame(rho, index=pd.Index(df.columns), columns=df.columns)
    p = pd.DataFrame(p, index=pd.Index(df.columns), columns=df.columns)

    plt.figure(figsize=(12,7))
    sns.heatmap(rho, vmin=-1, vmax=1, annot=True)
    plt.title("Spearman Correlation")

    rho2 = df.corr(method="spearman")
    g = sns.PairGrid(df,corner=True)
    g.map_diag(plt.hist,color="yellow")
    g.map_lower(sns.scatterplot,color="magenta")

    # corner=True leaves the upper-triangle slots of g.axes as None, so only
    # the axes that exist get a title.
    for i, axes_row in enumerate(g.axes):
        for j, ax in enumerate(axes_row):
            if ax is not None:
                ax.title.set_text('$\\rho$ = ' + "{:.2f}".format(rho2.iloc[i, j]))

    plt.subplots_adjust(hspace = 0.6)
    plt.show()

    for col in rho.columns:
        p_col = p[col]
        rounded = rho[col].round(2).astype('str')
        # True everywhere except on the diagonal cell of this column.
        off_diagonal = col != rho.index

        one_star = (p_col <= 0.05) & (p_col > 0.01) & off_diagonal
        two_stars = (p_col <= 0.01) & (p_col > 0.001) & off_diagonal
        three_stars = (p_col <= 0.001) & off_diagonal

        rho[col] = np.where(one_star, rounded + '*',
                            np.where(two_stars, rounded + '**',
                                     np.where(three_stars, rounded + '***',
                                              rho[col])))

    return(rho)

# User-level table: one row per user (the first row found for each id_users),
# restricted to the columns that describe the user.

user_level_columns = [
    'id_users', 'created_at_users', 'updated_at_users', 'gender',
    'date_of_birth', 'height', 'weight', 'activity_level', 'goal',
    'body_type', 'body_fat', 'newsletter_subscription',
    'notifications_setting', 'training_days_setting', 'language', 'points',
    'scientific_data_usage', 'best_weekly_streak_users', 'BMI', 'BMI_category',
    'total_sessions_users2', 'total_time_users2', 'total_reps_users2',
    'reps_per_session_users2', 'total_calories_users2',
    'kcal_per_session_users2', 'exercises_per_user',
    'exercises_without_rest_per_user', 'cluster',
]

df_users_only_once = df.drop_duplicates(subset=['id_users'], keep='first')
df_users_only = df_users_only_once[user_level_columns]

# Session-execution-level table: one row per id_session_execution (first row
# found), restricted to the columns that describe the executed session, its
# program and its user.

session_level_columns = [
    'id_session_execution',
    'difficulty_feedback_session_execution',
    'enjoyment_feedback_session_execution',
    'reps_executed_session_execution', 'updated_at_session_execution',
    'discarded_session_execution', 'id_sessions', 'order_sessions',
    'time_duration_sessions', 'code_name_sessions', 'name_en_sessions',
    'id_user_programs', 'created_at_user_programs',
    'updated_at_user_programs', 'active_user_programs',
    'current_session_id_user_programs', 'completed_user_programs',
    'id_programs', 'created_at_programs', 'updated_at_programs',
    'pro_programs', 'available_programs', 'strength_programs',
    'endurance_programs', 'technique_programs', 'flexibility_programs',
    'intensity_programs', 'name_en_programs', 'description_en_programs',
    'id_users', 'created_at_users', 'updated_at_users', 'gender',
    'date_of_birth', 'height', 'weight', 'activity_level', 'goal',
    'body_type', 'body_fat', 'newsletter_subscription',
    'notifications_setting', 'training_days_setting', 'language', 'points',
    'scientific_data_usage', 'best_weekly_streak_users', 'BMI', 'BMI_category',
    'total_sessions_users2',
    'total_time_session_execution', 'total_reps_session_execution',
    'total_time_users2', 'total_reps_users2', 'reps_per_session_users2',
    'total_time_session_execution_min', 'reps_per_min_session_execution',
    'exercise_execution_time_min', 'calories_session_execution',
    'total_calories_users2', 'kcal_per_session_users2',
    'YYYY/WW', 'number_exercises_in_session',
    'number_exercises_without_rest_in_session',
]

df_session_exe_only_once = df.drop_duplicates(subset=['id_session_execution'], keep='first')
df_session_exe = df_session_exe_only_once[session_level_columns]

  return warn(


In [3]:
# Independent deep copy, so later edits to it leave df_session_exe untouched.
df_session_exe_dropped_columns = df_session_exe.copy(deep=True)

In [7]:
# Build the reduced table from df_session_exe instead of dropping in place:
# an in-place drop raises KeyError when the cell is run a second time, because
# the columns are already gone. DataFrame.drop returns a new frame, so
# df_session_exe itself is not modified.
df_session_exe_dropped_columns = df_session_exe.drop(
    columns=[
        'points', 'best_weekly_streak_users',
        'total_sessions_users2', 'total_time_users2',
        'total_reps_users2', 'reps_per_session_users2',
        'exercise_execution_time_min',
        'total_calories_users2', 'kcal_per_session_users2',
    ]
)

In [11]:
# Preview the first rows of the reduced table.
# NOTE(review): the stored output below already lists month_session_execution,
# a column that is only created in the following cell, and the execution counts
# (In [11] here, In [10] there) are out of order — the cells were not run top
# to bottom when this output was saved.
df_session_exe_dropped_columns.head()

Unnamed: 0,id_session_execution,difficulty_feedback_session_execution,enjoyment_feedback_session_execution,reps_executed_session_execution,updated_at_session_execution,discarded_session_execution,id_sessions,order_sessions,time_duration_sessions,code_name_sessions,name_en_sessions,id_user_programs,created_at_user_programs,updated_at_user_programs,active_user_programs,current_session_id_user_programs,completed_user_programs,id_programs,created_at_programs,updated_at_programs,pro_programs,available_programs,strength_programs,endurance_programs,technique_programs,flexibility_programs,intensity_programs,name_en_programs,description_en_programs,id_users,created_at_users,updated_at_users,gender,date_of_birth,height,weight,activity_level,goal,body_type,body_fat,newsletter_subscription,notifications_setting,training_days_setting,language,scientific_data_usage,BMI,BMI_category,total_time_session_execution,total_reps_session_execution,total_time_session_execution_min,reps_per_min_session_execution,calories_session_execution,YYYY/WW,number_exercises_in_session,number_exercises_without_rest_in_session,month_session_execution
0,6132.0,4.0,3.0,368.0,2021-11-01 12:05:40.078024,False,710.0,2.0,1105.0,PM1.2_V2,Session 2,7181.0,2021-10-25 11:04:40.657070,2021-11-28 11:26:22.131395,False,713.0,True,10.0,2020-11-23 13:41:46.587265,2021-09-29 14:53:34.897432,False,True,1.0,3.0,1.0,2.0,2.0,Get motivated!,The ultimate beginner’s program designed to he...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,988.0,368.0,16.47,22.35,50.95,2021/44,55,32,11
55,7495.0,5.0,3.0,150.0,2021-11-06 13:52:37.632519,False,711.0,3.0,386.0,PM1.3_V2,Session 3,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,635.0,150.0,10.58,14.17,38.32,2021/44,15,15,11
70,9800.0,8.0,4.0,286.0,2021-11-14 13:44:04.327400,False,320.0,7.0,962.0,MuHu7.1,Session 6,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,1322.0,286.0,22.03,12.98,72.86,2021/45,57,34,11
127,8444.0,7.0,4.0,72.0,2021-11-09 18:47:45.480112,False,713.0,5.0,662.0,PM1.5_V2,Session 5,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,685.0,80.0,11.42,7.01,37.74,2021/45,17,17,11
144,8075.0,6.0,4.0,150.0,2021-11-08 19:29:14.727155,False,634.0,1.0,1140.0,Descanso Activo 1,Regenerative,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,1269.0,155.0,21.15,7.33,50.15,2021/45,48,25,11


In [10]:
# Calendar month number of each session, taken from updated_at_session_execution.
df_session_exe_dropped_columns['month_session_execution'] = (
    pd.to_datetime(df_session_exe_dropped_columns['updated_at_session_execution']).dt.month
)


In [48]:
# Sessions executed in November 2021. The year is checked as well: the month
# number alone would also match November of any other year present in the data.
sessions_executed_11_2021 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 11)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2021)
]

In [49]:
# Sessions executed in December 2021. The year is checked as well: the month
# number alone would also match December of any other year present in the data.
sessions_executed_12_2021 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 12)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2021)
]

In [50]:
# Sessions executed in January 2022. The year is checked as well: the month
# number alone would also match January of any other year present in the data.
sessions_executed_01_2022 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 1)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2022)
]

In [51]:
# Sessions executed in February 2022. The year is checked as well: the month
# number alone would also match February of any other year present in the data.
sessions_executed_02_2022 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 2)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2022)
]

In [52]:
# Sessions executed in March 2022. The year is checked as well: the month
# number alone would also match March of any other year present in the data.
sessions_executed_03_2022 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 3)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2022)
]

In [53]:
# Sessions executed in April 2022. The year is checked as well: the month
# number alone would also match April of any other year present in the data.
sessions_executed_04_2022 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 4)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2022)
]

In [54]:
# Sessions executed in May 2022. The year is checked as well: the month
# number alone would also match May of any other year present in the data.
sessions_executed_05_2022 = df_session_exe_dropped_columns[
    (df_session_exe_dropped_columns['month_session_execution'] == 5)
    & (df_session_exe_dropped_columns['updated_at_session_execution'].dt.year == 2022)
]

In [55]:
# Preview the first rows of the November subset (month_session_execution == 11).
sessions_executed_11_2021.head()

Unnamed: 0,id_session_execution,difficulty_feedback_session_execution,enjoyment_feedback_session_execution,reps_executed_session_execution,updated_at_session_execution,discarded_session_execution,id_sessions,order_sessions,time_duration_sessions,code_name_sessions,name_en_sessions,id_user_programs,created_at_user_programs,updated_at_user_programs,active_user_programs,current_session_id_user_programs,completed_user_programs,id_programs,created_at_programs,updated_at_programs,pro_programs,available_programs,strength_programs,endurance_programs,technique_programs,flexibility_programs,intensity_programs,name_en_programs,description_en_programs,id_users,created_at_users,updated_at_users,gender,date_of_birth,height,weight,activity_level,goal,body_type,body_fat,newsletter_subscription,notifications_setting,training_days_setting,language,scientific_data_usage,BMI,BMI_category,total_time_session_execution,total_reps_session_execution,total_time_session_execution_min,reps_per_min_session_execution,calories_session_execution,YYYY/WW,number_exercises_in_session,number_exercises_without_rest_in_session,month_session_execution
0,6132.0,4.0,3.0,368.0,2021-11-01 12:05:40.078024,False,710.0,2.0,1105.0,PM1.2_V2,Session 2,7181.0,2021-10-25 11:04:40.657070,2021-11-28 11:26:22.131395,False,713.0,True,10.0,2020-11-23 13:41:46.587265,2021-09-29 14:53:34.897432,False,True,1.0,3.0,1.0,2.0,2.0,Get motivated!,The ultimate beginner’s program designed to he...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,988.0,368.0,16.47,22.35,50.95,2021/44,55,32,11
55,7495.0,5.0,3.0,150.0,2021-11-06 13:52:37.632519,False,711.0,3.0,386.0,PM1.3_V2,Session 3,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,635.0,150.0,10.58,14.17,38.32,2021/44,15,15,11
70,9800.0,8.0,4.0,286.0,2021-11-14 13:44:04.327400,False,320.0,7.0,962.0,MuHu7.1,Session 6,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,1322.0,286.0,22.03,12.98,72.86,2021/45,57,34,11
127,8444.0,7.0,4.0,72.0,2021-11-09 18:47:45.480112,False,713.0,5.0,662.0,PM1.5_V2,Session 5,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,685.0,80.0,11.42,7.01,37.74,2021/45,17,17,11
144,8075.0,6.0,4.0,150.0,2021-11-08 19:29:14.727155,False,634.0,1.0,1140.0,Descanso Activo 1,Regenerative,35844.0,2021-11-03 10:24:44.496283,2022-07-11 12:39:18.024392,True,315.0,False,34.0,2020-11-23 14:15:43.775009,2021-09-29 14:54:09.125824,True,True,5.0,4.0,3.0,3.0,5.0,Muscle Hunters Initiation 1,Muscle Hunters is a program created for you to...,2273.0,2021-10-25 11:04:40.560589,2022-09-07 12:39:46.135605,female,1981-12-31,163.0,66.0,active,lose,mid,15.0,False,True,3.0,es,True,24.84,Normal,1269.0,155.0,21.15,7.33,50.15,2021/45,48,25,11


In [34]:
# Trial aggregation: sum of total_time_session_execution per user for the
# November subset, as a one-column frame indexed by id_users.
trial = (
    sessions_executed_11_2021
    .groupby(['id_users'])['total_time_session_execution']
    .sum()
    .to_frame('total_time_users')
)

In [35]:
# Display the per-user totals computed in the previous cell.
trial

Unnamed: 0_level_0,total_time_users
id_users,Unnamed: 1_level_1
172.00,5681.00
219.00,1338.00
514.00,4572.00
549.00,576.00
590.00,2189.00
...,...
12022.00,384.00
12027.00,1230.00
12036.00,749.00
12037.00,279.00


In [36]:
# Attach each user's total back onto every one of that user's session rows.
# NOTE(review): this rebinds sessions_executed_11_2021 with a total_time_users
# column, the same column name change_tables() adds. The execution counts
# (In [36] here, In [48] for the November filter cell) indicate the filter cell
# was run again afterwards, which rebuilt the frame without that column.
sessions_executed_11_2021 = sessions_executed_11_2021.merge(trial, how='left', on='id_users')


In [56]:
def change_tables(df):
    """Append per-user totals and per-session averages to a session-level table.

    Rows are grouped by ``id_users``; every row of a user receives the same
    values for the added columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per executed session. Must contain ``id_users``,
        ``id_session_execution``, ``total_time_session_execution``,
        ``total_reps_session_execution``, ``calories_session_execution``,
        ``number_exercises_in_session`` and
        ``number_exercises_without_rest_in_session``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex; ``df`` is not modified. The input
        columns are followed, in this order, by:

        - ``total_time_users``: sum of ``total_time_session_execution``
        - ``total_reps_users``: sum of ``total_reps_session_execution``
        - ``total_time_min``: ``total_time_users / 60``
        - ``total_calories_users``: sum of ``calories_session_execution``
        - ``total_sessions_users``: distinct ``id_session_execution`` count
        - ``reps_per_session_users``: ``total_reps_users / total_sessions_users``
        - ``calories_per_session_users``:
          ``total_calories_users / total_sessions_users``
        - ``total_exercises_users``: sum of ``number_exercises_in_session``
        - ``total_exercises_without_rest_users``: sum of
          ``number_exercises_without_rest_in_session``

    Notes
    -----
    Columns in ``df`` that already carry one of the added names are replaced.
    Without this, the merge would suffix the clashing names with ``_x``/``_y``
    and the later lookups by plain name would raise ``KeyError``.
    """
    added_columns = [
        'total_time_users', 'total_reps_users', 'total_time_min',
        'total_calories_users', 'total_sessions_users',
        'reps_per_session_users', 'calories_per_session_users',
        'total_exercises_users', 'total_exercises_without_rest_users',
    ]

    # Discard results of an earlier run so the function can be re-applied.
    stale = [name for name in added_columns if name in df.columns]
    base = df.drop(columns=stale)

    # One pass over the groups instead of one groupby + merge per statistic.
    per_user = base.groupby('id_users').agg(
        total_time_users=('total_time_session_execution', 'sum'),
        total_reps_users=('total_reps_session_execution', 'sum'),
        total_calories_users=('calories_session_execution', 'sum'),
        total_sessions_users=('id_session_execution', 'nunique'),
        total_exercises_users=('number_exercises_in_session', 'sum'),
        total_exercises_without_rest_users=(
            'number_exercises_without_rest_in_session', 'sum'),
    )
    per_user['total_time_min'] = per_user['total_time_users'] / 60
    per_user['reps_per_session_users'] = (
        per_user['total_reps_users'] / per_user['total_sessions_users'])
    per_user['calories_per_session_users'] = (
        per_user['total_calories_users'] / per_user['total_sessions_users'])

    # Column-on-column left merge: keeps every session row and yields a fresh
    # RangeIndex; selecting added_columns fixes the output column order.
    return base.merge(per_user[added_columns].reset_index(),
                      how='left', on='id_users')
    
    

In [60]:
# Add the per-user columns produced by change_tables() to the November subset.
df_sessions_executed_11_2021 = change_tables(sessions_executed_11_2021)

In [63]:
# Write the November 2021 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_11_2021.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_11_2021.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [61]:
# Apply change_tables() to each remaining monthly subset, in calendar order
# (December 2021 through May 2022).
(
    df_sessions_executed_12_2021,
    df_sessions_executed_1_2022,
    df_sessions_executed_2_2022,
    df_sessions_executed_3_2022,
    df_sessions_executed_4_2022,
    df_sessions_executed_5_2022,
) = [
    change_tables(monthly_sessions)
    for monthly_sessions in (
        sessions_executed_12_2021,
        sessions_executed_01_2022,
        sessions_executed_02_2022,
        sessions_executed_03_2022,
        sessions_executed_04_2022,
        sessions_executed_05_2022,
    )
]

In [62]:
# Write the December 2021 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_12_2021.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_12_2021.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [65]:
# Write the January 2022 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_1_2022.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_01_2022.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [66]:
# Write the February 2022 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_2_2022.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_02_2022.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [67]:
# Write the March 2022 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_3_2022.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_03_2022.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [68]:
# Write the April 2022 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_4_2022.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_04_2022.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)


In [69]:
# Write the May 2022 table as a bz2-compressed CSV, header row included,
# index column omitted.
df_sessions_executed_5_2022.to_csv(
    r'/home/evida-monika/mhunters/df_sessions_executed_05_2022.csv'+ '.bz2',
    index=False,
    header=True,
    compression='bz2',
)
