# Data cleaning - stage 1 (Cleaning individual tables) 

## Overview

This document apply a first data cleaning for indiviual tables, the aspects considered to clean the data are:


**Author**: Oscar Javier Bastidas Jossa. 

**Email**: oscar.jossa@deusto.es.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Table of users
users = pd.read_csv('data/users.csv', low_memory=False)
users2 = users.drop(['email', 'encrypted_password', 
                     'reset_password_token','reset_password_sent_at',
                     'remember_created_at','is_admin','names', 'last_name',
                     'current_sign_in_ip', 'last_sign_in_ip', 
                     'recover_password_code','recover_password_attempts', 
                     'facebook_uid','workout_setting_voice_coach', 'workout_setting_sound',
                     'workout_setting_vibration', 'workout_setting_mobility',
                     'workout_setting_cardio_warmup', 'workout_setting_countdown',
                     'google_uid','t1_push','t1_core', 
                     't1_legs', 't1_full', 't1_push_exercise', 
                     't1_pull_up','t2_reps', 't2_steps', 
                     't2_reps_push', 't2_reps_core', 't2_reps_legs',
                     't2_reps_full', 't2_time_push', 't2_time_core',
                     't2_time_legs', 't2_time_full', 't1_full_exercise', 
                     't1_pull_up_exercise','warmup_setting', 
                     'warmup_session_id', 'stripe_id', 'provider', 'uid',
                     'affiliate_code', 'moengage_id', 'mix_panel_id',
                     'apple_id_token','platform', 'login_token',
                     'login_token_generated_at', 'imported',
                     'current_sign_in_at', 'last_sign_in_at', 'sign_in_count',
                     'current_weekly_streak'], 
                      axis = 1)

# Loading data
session_executions = pd.read_csv('data/session_executions.csv', 
                                 on_bad_lines='skip', # skip bad lines without raising or warning when they are encountered. 
                                 header = None)

# assigning the headers
session_executions.columns = ['id', 'scheduled_at', 'user_program_id', 
                              'difficulty_feedback', 'enjoyment_feedback',
                              'feedback_comment', 'reps_executed',
                              'execution_time', 'order', 'created_at',
                              'updated_at', 'front_end_id', 'session_id',
                              'discarded', 'discard_reason', 'imported']

session_executions2 = session_executions.drop(['scheduled_at', 'feedback_comment',
                                              'order', 'front_end_id'], axis = 1)


# Table of blocks of session executions
session_block_executions = pd.read_csv('data/session_block_executions.csv', on_bad_lines='skip', low_memory=False)
session_block_executions2 = session_block_executions.drop(['block_type', 
                                                           'reps_executed',
                                                           'execution_time', 
                                                           'created_at', 
                                                           'updated_at'], axis = 1)

# Table of sets of executions
session_set_executions = pd.read_csv('data/session_set_executions.csv', on_bad_lines='skip', low_memory=False)
session_set_executions2 = session_set_executions.drop(['reps_executed', 'execution_time',
                                                       'created_at', 'updated_at'], axis = 1)

# Table of user programs
user_programs = pd.read_csv('data/user_programs.csv', on_bad_lines='skip', low_memory=False)
user_programs2 = user_programs.drop(['enjoyment_notes'], axis = 1)

# Table of sessions 
sessions = pd.read_csv('data/sessions.csv', on_bad_lines='skip', low_memory=False)
sessions2 = sessions.drop(['level', 'reps', 'created_at', 'updated_at', 'strength', 
                           'endurance', 'technique', 'flexibility', 'intensity',
                           'name_es','description_en', 'description_es'], axis = 1)

# Table of exercise executions 
exercise_executions = pd.read_csv('data/exercise_executions.csv', on_bad_lines='skip', low_memory=False, header = None)
exercise_executions.columns = ['id', 'exercise_id', 'session_set_execution_id', 
                              'reps_executed', 'execution_time',
                              'order', 'created_at', 'updated_at']

# Table of programs
programs = pd.read_csv('data/programs.csv', on_bad_lines='skip', low_memory=False)
programs2 = programs.drop(['user_id', 'code_name', 'name_es', 
                           'description_es', 'auto_generated', 'priority_order', 
                           'next_program_id'], axis = 1)

# Table of exercises
exercises = pd.read_csv('data/exercises.csv', sep = ';', on_bad_lines='skip', low_memory=False)
exercises2 = exercises.drop(['video','reps', 'time','legacy_id','deprecated', 
                             'replacement_legacy_id', 'family', 'sub_family',
                             'video_female', 'video_male', 'harder_variation_id',
                             'easier_variation_id', 'name_es', 'description_en',
       'description_es', 'implement_variation_id', 'test_correction',
       'thumbnail', 'thumbnail_male', 'thumbnail_female', 'notes_en',
       'notes_es', 'execution_time', 'thumbnail_400', 'thumbnail_400_male',
       'thumbnail_400_female', 'coach_id', 'test_equivalent_id', 't1_min',
       't1_max', 'excluded'], axis = 1)

# Table of implements
implements = pd.read_csv('data/implements.csv', on_bad_lines='skip', low_memory=False)
implements2 = implements.drop(['created_at', 'updated_at', 'name_es'], axis = 1)

# Table of program implements
program_implements = pd.read_csv('data/program_implements.csv', on_bad_lines='skip', low_memory=False)
program_implements2 = program_implements.drop(['created_at', 'updated_at'], axis = 1)

# Table of user implements
user_implements = pd.read_csv('data/user_implements.csv', on_bad_lines='skip', low_memory=False)
user_implements2 = user_implements.drop(['created_at', 'updated_at'], axis = 1)

# Table of exercise implements 
exercise_implements = pd.read_csv('data/exercise_implements.csv', on_bad_lines='skip', low_memory=False)
exercise_implements2 = exercise_implements.drop(['created_at', 'updated_at'], axis = 1)

# Table of profiles
profiles = pd.read_csv('data/profiles.csv')
profiles2 = profiles.drop(['fat_level', 'name', 'created_at', 'updated_at'], axis = 1)

# Table of suscriptions
subscriptions = pd.read_csv('data/subscriptions.csv', on_bad_lines='skip', low_memory=False)
subscriptions2 = subscriptions.drop(['platform', 'transaction_body', 'start_date', 
                                     'end_date', 'subscription_type', 'cancelled_at',
                                     'cancelled', 'store_metadata','offer_code',
       'cancellation_reason', 'receipt_data'], axis = 1)

# Table of exercise sets
exercise_sets = pd.read_csv('data/exercise_sets.csv', on_bad_lines='skip', low_memory=False)
exercise_sets2 = exercise_sets.drop(['intensity_modificator', 'track_reps'], axis = 1)


session_blocks = pd.read_csv('data/session_blocks.csv', on_bad_lines='skip', low_memory=False, header = None)
session_blocks.columns = ['id', 'session_id', 'time_duration', 
                              'created_at', 'updated_at',
                              'order', 'block_type', 'loop']

# Table of program sessions
program_sessions = pd.read_csv('data/program_sessions.csv', on_bad_lines='skip', low_memory=False)

# Table of profiles
program_profiles = pd.read_csv('data/program_profiles.csv', on_bad_lines='skip', low_memory=False)

# Table of program characteristics
program_characteristics = pd.read_csv('data/program_characteristics.csv', on_bad_lines='skip', low_memory=False)
program_characteristics2 = program_characteristics.drop(['created_at', 'updated_at', 
                                                         'objective', 'value_en',
                                                         'value_es'], axis = 1)

# Table of session sets
session_sets = pd.read_csv('data/session_sets.csv', on_bad_lines='skip', low_memory=False)
session_sets2 = session_sets.drop(['level', 'time_duration', 'reps', 
                                   'session_set_type','created_at', 'updated_at',
                                   'loop'], axis = 1)

  session_executions = pd.read_csv('data/session_executions.csv',


In [5]:
users2['height[m]'] = users2['height']/100


BMI = []

for i in range(len(users2)):
    if users2['height[m]'][i] < 1.0:
        BMI.append(np.nan)
    else:
        BMI.append((users2['weight'][i]/(users2['height[m]'][i]**2)).round(2))

users2['BMI'] = BMI
users2['BMI'] = users2['BMI'].astype(float)
        
BMI_cat = []

for i in range(len(users2['BMI'])):
    if np.isnan(users2['BMI'][i]) == True:
        BMI_cat.append(np.nan)
    elif (users2['BMI'][i] > 0 and users2['BMI'][i] < 18.5):
        BMI_cat.append('Underweight')
    elif (users2['BMI'][i] >= 18.5 and users2['BMI'][i] < 25):
        BMI_cat.append('Normal')
    elif (users2['BMI'][i] >= 25 and users2['BMI'][i] < 30):
        BMI_cat.append('Overweight')
    elif users2['BMI'][i] >= 30:
        if users2['height[m]'][i] < 1.0:
            BMI_cat.append(np.nan)
        else: 
            BMI_cat.append('Obesity')
    else: BMI_cat.append(np.nan)
        
        

users2['BMI_category'] = BMI_cat

In [6]:
users2['country'].replace('Argentina', 'AR', inplace = True)
users2['country'].replace('Spain', 'ES', inplace = True)

cat_names = {
    'gender': {1: 'male', 0: 'female'},
    'activity_level': {0: 'very active', 1: 'active', 2: 'sedentary'},
    'goal': {0: 'lose', 1: 'gain', 2: 'antiaging'},
    'body_type': {0: 'thin', 1: 'mid', 2: 'strong'}
}

users2 = users2.replace(cat_names)

In [7]:
session_executions2.replace('\\N', np.NaN, inplace = True)
exercise_executions.replace('\\N', np.NaN, inplace = True)

In [12]:
merge_1 = exercise_executions.merge(session_set_executions2, how = 'left', 
                                    left_on = 'session_set_execution_id',
                                    right_on = 'id')

merge_1.columns
merge_1.rename(columns = {'id_x': 'id_ex_ex', 'order_x': 'order_ex_ex', 
                          'id_y': 'id_session_set_ex', 'order_y': 'order_session_set_ex',
                          'exercise_id': 'exercise_id_ex_ex', 'reps_executed': 'reps_executed_ex_ex',
                          'execution_time': 'execution_time_ex_ex', 
                          'created_at': 'created_at_ex_ex', 'updated_at': 'updated_at_ex_ex'}, 
               inplace = True)

merge_1

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,session_block_execution_id
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,7077.0
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,7077.0
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,7077.0
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,7077.0
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,7077.0
...,...,...,...,...,...,...,...,...,...,...,...
2190284,3116554,5968,669974,0,31,2,2022-05-27 07:41:53.378292,2022-05-27 07:41:53.378292,669974.0,1.0,160098.0
2190285,3116555,5870,669975,6,31,1,2022-05-27 07:41:53.386738,2022-05-27 07:41:53.386738,669975.0,2.0,160098.0
2190286,3116556,5968,669975,0,31,2,2022-05-27 07:41:53.390694,2022-05-27 07:41:53.390694,669975.0,2.0,160098.0
2190287,3116557,5870,669976,6,24,1,2022-05-27 07:41:53.399237,2022-05-27 07:41:53.399237,669976.0,3.0,160098.0


In [19]:
exercise_executions.shape

(2190289, 8)

In [20]:
merge_1.shape

(2190289, 11)

In [21]:
merge_1_1 = merge_1.merge(exercises2, how = 'left', left_on = 'exercise_id_ex_ex', right_on = 'id')

merge_1_1.columns

merge_1_1.rename(columns = {'id': 'id_exercises',
                          'created_at': 'created_at_exercises', 'updated_at': 'updated_at_exercises',
                           'body_parts_focused': 'body_parts_focused_exercises', 
                           'muscles': 'muscles_exercises',
                           'joints': 'joints_exercises',
                           'met_multiplier': 'met_multiplier_exercises', 
                           'name_en': 'name_en_exercises'}, 
               inplace = True)


In [22]:
merge_1_1.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,session_block_execution_id,id_exercises,created_at_exercises,updated_at_exercises,body_parts_focused_exercises,muscles_exercises,joints_exercises,met_multiplier_exercises,name_en_exercises
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,7077.0,5236,2020-10-15 12:37:15.906971,2022-01-11 08:02:28.410085,"{""Todo el cuerpo""}","{pectorales,"" tríceps"","" cuádriceps""}","{hombros,"" cadera""}",5.0,Burpee
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,7077.0,5968,2020-10-16 09:33:15.172657,2022-01-21 19:15:56.401181,{},{0},{0},1.0,Rest
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,7077.0,5317,2020-10-15 12:37:16.978807,2021-10-13 09:46:29.459619,{Core},"{""recto mayor del abdomen"","" abdominales oblic...","{hombros,"" cadera"","" rodillas"","" tobillos""}",4.2,Mountain runner
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,7077.0,5968,2020-10-16 09:33:15.172657,2022-01-21 19:15:56.401181,{},{0},{0},1.0,Rest
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,7077.0,5222,2020-10-15 12:37:15.765512,2021-10-13 09:46:28.727847,{Core},"{tríceps,"" dorsales""}","{muñecas,"" codos"","" cadera""}",2.5,Mckenzie


In [23]:
merge_1_1.shape

(2190289, 19)

In [24]:
merge_2 = merge_1_1.merge(session_block_executions2, how = 'left', left_on = 'session_block_execution_id',
                       right_on = 'id')

merge_2.columns
merge_2.rename(columns = {'order': 'order_session_block_ex', 'id': 'id_session_block_ex'}, inplace = True)

In [25]:
merge_2.tail()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,created_at_exercises,updated_at_exercises,body_parts_focused_exercises,muscles_exercises,joints_exercises,met_multiplier_exercises,name_en_exercises,id_session_block_ex,session_execution_id,order_session_block_ex
2190284,3116554,5968,669974,0,31,2,2022-05-27 07:41:53.378292,2022-05-27 07:41:53.378292,669974.0,1.0,...,2020-10-16 09:33:15.172657,2022-01-21 19:15:56.401181,{},{0},{0},1.0,Rest,160098.0,746515.0,3.0
2190285,3116555,5870,669975,6,31,1,2022-05-27 07:41:53.386738,2022-05-27 07:41:53.386738,669975.0,2.0,...,2020-10-15 12:37:23.297018,2021-10-13 09:46:31.224065,{Piernas},{isquiotibiales},"{rodillas,"" hombros""}",3.8,Nordic curl,160098.0,746515.0,3.0
2190286,3116556,5968,669975,0,31,2,2022-05-27 07:41:53.390694,2022-05-27 07:41:53.390694,669975.0,2.0,...,2020-10-16 09:33:15.172657,2022-01-21 19:15:56.401181,{},{0},{0},1.0,Rest,160098.0,746515.0,3.0
2190287,3116557,5870,669976,6,24,1,2022-05-27 07:41:53.399237,2022-05-27 07:41:53.399237,669976.0,3.0,...,2020-10-15 12:37:23.297018,2021-10-13 09:46:31.224065,{Piernas},{isquiotibiales},"{rodillas,"" hombros""}",3.8,Nordic curl,160098.0,746515.0,3.0
2190288,3116558,5968,669976,0,31,2,2022-05-27 07:41:53.403198,2022-05-27 07:41:53.403198,669976.0,3.0,...,2020-10-16 09:33:15.172657,2022-01-21 19:15:56.401181,{},{0},{0},1.0,Rest,160098.0,746515.0,3.0


In [26]:
merge_2.shape

(2190289, 22)

In [27]:
merge_3 = merge_2.merge(session_executions2, how = 'left', left_on = 'session_execution_id',
                       right_on = 'id')

merge_3.rename(columns = {'id': 'id_session_execution', 
                          'difficulty_feedback': 'difficulty_feedback_session_execution',
                          'enjoyment_feedback': 'enjoyment_feedback_session_execution',
                          'reps_executed': 'reps_executed_session_execution',
                          'execution_time': 'execution_time_session_execution',
                          'discarded': 'discarded_session_execution',
                          'discard_reason': 'discard_reason_session_execution',
                          'updated_at': 'updated_at_session_execution'}, 
               inplace = True)

merge_3.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,id_session_execution,user_program_id,difficulty_feedback_session_execution,enjoyment_feedback_session_execution,reps_executed_session_execution,execution_time_session_execution,updated_at_session_execution,session_id,discarded_session_execution,discard_reason_session_execution
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,5564.0,5462.0,5,4,244,,2021-10-29 13:07:02.352803,754.0,f,
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,5564.0,5462.0,5,4,244,,2021-10-29 13:07:02.352803,754.0,f,
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,...,5564.0,5462.0,5,4,244,,2021-10-29 13:07:02.352803,754.0,f,
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,...,5564.0,5462.0,5,4,244,,2021-10-29 13:07:02.352803,754.0,f,
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,...,5564.0,5462.0,5,4,244,,2021-10-29 13:07:02.352803,754.0,f,


In [35]:
merge_3.shape

(2190289, 31)

In [28]:
merge_4 = merge_3.merge(sessions2, how = 'left', left_on = 'session_id', right_on = 'id')

merge_4.rename(columns = {'id': 'id_sessions', 
                          'order': 'order_sessions',
                          'session_type': 'session_type_sessions',
                          'time_duration': 'time_duration_sessions',
                          'code_name': 'code_name_sessions',
                          'name_en': 'name_en_sessions',
                          'calories': 'calories_sessions',
                          'warmup_id': 'warmup_id_sessions',
                          'cooldown_id': 'cooldown_id_sessions'}, 
               inplace = True)

merge_4.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,discard_reason_session_execution,id_sessions,order_sessions,session_type_sessions,time_duration_sessions,code_name_sessions,name_en_sessions,calories_sessions,warmup_id_sessions,cooldown_id_sessions
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,,754.0,1.0,,1059.0,PH7.1_V2,Session 1,,,
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,,754.0,1.0,,1059.0,PH7.1_V2,Session 1,,,
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,...,,754.0,1.0,,1059.0,PH7.1_V2,Session 1,,,
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,...,,754.0,1.0,,1059.0,PH7.1_V2,Session 1,,,
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,...,,754.0,1.0,,1059.0,PH7.1_V2,Session 1,,,


In [16]:
merge_4.shape

(2190289, 40)

In [29]:
merge_5 = merge_4.merge(user_programs2, how = 'left', left_on = 'user_program_id', right_on = 'id')

merge_5.rename(columns = {'id': 'id_user_programs', 
                          'user_id': 'user_id_user_programs',
                          'program_id': 'program_id_user_programs',
                          'created_at': 'created_at_user_programs',
                          'updated_at': 'updated_at_user_programs',
                          'active': 'active_user_programs',
                          'current_session_id': 'current_session_id_user_programs',
                          'completed': 'completed_user_programs', 
                          'enjoyment': 'enjoyment_user_programs'}, 
               inplace = True)

merge_5.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,cooldown_id_sessions,id_user_programs,user_id_user_programs,program_id_user_programs,created_at_user_programs,updated_at_user_programs,active_user_programs,current_session_id_user_programs,completed_user_programs,enjoyment_user_programs
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,,5462.0,1718.0,22.0,2021-10-25 11:02:12.143768,2022-07-16 16:39:20.966637,f,763.0,t,4.0
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,,5462.0,1718.0,22.0,2021-10-25 11:02:12.143768,2022-07-16 16:39:20.966637,f,763.0,t,4.0
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,...,,5462.0,1718.0,22.0,2021-10-25 11:02:12.143768,2022-07-16 16:39:20.966637,f,763.0,t,4.0
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,...,,5462.0,1718.0,22.0,2021-10-25 11:02:12.143768,2022-07-16 16:39:20.966637,f,763.0,t,4.0
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,...,,5462.0,1718.0,22.0,2021-10-25 11:02:12.143768,2022-07-16 16:39:20.966637,f,763.0,t,4.0


In [18]:
merge_5.shape

(2190289, 49)

In [30]:
merge_6 = merge_5.merge(programs2, how = 'left', left_on = 'program_id_user_programs', right_on = 'id')

merge_6.rename(columns = {'id': 'id_programs', 
                          'created_at': 'created_at_programs',
                          'updated_at': 'updated_at_programs',
                          'pro': 'pro_programs',
                          'available': 'available_programs',
                          'strength': 'strength_programs',
                          'endurance': 'endurance_programs',
                          'technique': 'technique_programs',
                          'flexibility': 'flexibility_programs',
                          'intensity': 'intensity_programs',
                          'name_en': 'name_en_programs',
                          'description_en': 'description_en_programs'}, 
               inplace = True)

merge_6.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,updated_at_programs,pro_programs,available_programs,strength_programs,endurance_programs,technique_programs,flexibility_programs,intensity_programs,name_en_programs,description_en_programs
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,2021-09-29 14:53:34.268573,f,t,5.0,4.0,3.0,3.0,4.0,Smash your goals,The ultimate program that prepares your body t...
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,2021-09-29 14:53:34.268573,f,t,5.0,4.0,3.0,3.0,4.0,Smash your goals,The ultimate program that prepares your body t...
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,...,2021-09-29 14:53:34.268573,f,t,5.0,4.0,3.0,3.0,4.0,Smash your goals,The ultimate program that prepares your body t...
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,...,2021-09-29 14:53:34.268573,f,t,5.0,4.0,3.0,3.0,4.0,Smash your goals,The ultimate program that prepares your body t...
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,...,2021-09-29 14:53:34.268573,f,t,5.0,4.0,3.0,3.0,4.0,Smash your goals,The ultimate program that prepares your body t...


In [20]:
merge_6.shape

(2190289, 61)

In [31]:
merge_7 = merge_6.merge(users2, how = 'left', left_on = 'user_id_user_programs', right_on = 'id')

merge_7.rename(columns = {'id': 'id_users', 
                          'created_at': 'created_at_users',
                          'updated_at': 'updated_at_users',
                          'best_weekly_streak': 'best_weekly_streak_users',
                          'total_sessions': 'total_sessions_users',
                          'total_time': 'total_time_users',
                          'kcal_per_session': 'kcal_per_session_users',
                          'reps_per_session': 'reps_per_session_users'}, 
               inplace = True)

merge_7.replace('t', True, inplace = True)
merge_7.replace('f', False, inplace = True)


merge_7.head()


Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,scientific_data_usage,best_weekly_streak_users,affiliate_code_signup,total_sessions_users,total_time_users,kcal_per_session_users,reps_per_session_users,height[m],BMI,BMI_category
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,False,44.0,,475.0,622509.0,0.350213,2.0,1.84,24.52,Normal
1,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,False,44.0,,475.0,622509.0,0.350213,2.0,1.84,24.52,Normal
2,1279662,5317,47654,10,11,3,2021-10-29 13:07:01.92973,2021-10-29 13:07:01.92973,47654.0,1.0,...,False,44.0,,475.0,622509.0,0.350213,2.0,1.84,24.52,Normal
3,1279663,5968,47654,0,15,4,2021-10-29 13:07:01.934805,2021-10-29 13:07:01.934805,47654.0,1.0,...,False,44.0,,475.0,622509.0,0.350213,2.0,1.84,24.52,Normal
4,1279664,5222,47654,10,19,5,2021-10-29 13:07:01.940808,2021-10-29 13:07:01.940808,47654.0,1.0,...,False,44.0,,475.0,622509.0,0.350213,2.0,1.84,24.52,Normal


In [22]:
merge_7.shape

(2190289, 88)

In [23]:
merge_7.columns

Index(['id_ex_ex', 'exercise_id_ex_ex', 'session_set_execution_id',
       'reps_executed_ex_ex', 'execution_time_ex_ex', 'order_ex_ex',
       'created_at_ex_ex', 'updated_at_ex_ex', 'id_session_set_ex',
       'order_session_set_ex', 'session_block_execution_id', 'id_exercises',
       'created_at_exercises', 'updated_at_exercises',
       'body_parts_focused_exercises', 'muscles_exercises', 'joints_exercises',
       'met_multiplier_exercises', 'name_en_exercises', 'id_session_block_ex',
       'session_execution_id', 'order_session_block_ex',
       'id_session_execution', 'user_program_id',
       'difficulty_feedback_session_execution',
       'enjoyment_feedback_session_execution',
       'reps_executed_session_execution', 'execution_time_session_execution',
       'session_id', 'discarded_session_execution',
       'discard_reason_session_execution', 'id_sessions', 'order_sessions',
       'session_type_sessions', 'time_duration_sessions', 'code_name_sessions',
       'name_en_

In [24]:
session_blocks2['session_id'].nunique()

1562

In [36]:
session_blocks2['session_id'].shape

(2986,)

There are duplicates in session_blocks2

In [25]:
session_blocks2.head(10)

Unnamed: 0,id,session_id,order,block_type
0,62,41,1,0
1,3239,1849,1,19
2,3240,1850,1,19
3,3241,1851,1,19
4,5,8,1,0
5,6,9,1,0
6,7,12,1,0
7,8,13,1,0
8,9,13,3,0
9,10,14,1,0


NameError: name 'merge_9' is not defined

In [14]:
'''

merge_8 = merge_7.merge(session_blocks2, how = 'left', left_on = 'id_sessions', right_on = 'session_id')


merge_8.rename(columns = {'id': 'id_session_blocks', 
                          'session_id_y': 'session_id_session_blocks',
                          'order': 'order_session_blocks',
                        'block_type': 'block_type_session_blocks'}, 
               inplace = True)
'''


In [27]:
merge_8.shape

(6707127, 92)

In [28]:
merge_8.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,total_time_users,kcal_per_session_users,reps_per_session_users,height[m],BMI,BMI_category,id_session_blocks,session_id_session_blocks,order_session_blocks,block_type_session_blocks
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,622509.0,0.350213,2.0,1.84,24.52,Normal,1849.0,754.0,1,18.0
1,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,622509.0,0.350213,2.0,1.84,24.52,Normal,1850.0,754.0,2,16.0
2,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,622509.0,0.350213,2.0,1.84,24.52,Normal,1851.0,754.0,3,18.0
3,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,622509.0,0.350213,2.0,1.84,24.52,Normal,1852.0,754.0,4,18.0
4,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,622509.0,0.350213,2.0,1.84,24.52,Normal,1849.0,754.0,1,18.0


In [29]:
merge_8.columns

Index(['id_ex_ex', 'exercise_id_ex_ex', 'session_set_execution_id',
       'reps_executed_ex_ex', 'execution_time_ex_ex', 'order_ex_ex',
       'created_at_ex_ex', 'updated_at_ex_ex', 'id_session_set_ex',
       'order_session_set_ex', 'session_block_execution_id', 'id_exercises',
       'created_at_exercises', 'updated_at_exercises',
       'body_parts_focused_exercises', 'muscles_exercises', 'joints_exercises',
       'met_multiplier_exercises', 'name_en_exercises', 'id_session_block_ex',
       'session_execution_id', 'order_session_block_ex',
       'id_session_execution', 'user_program_id',
       'difficulty_feedback_session_execution',
       'enjoyment_feedback_session_execution',
       'reps_executed_session_execution', 'execution_time_session_execution',
       'session_id_x', 'discarded_session_execution',
       'discard_reason_session_execution', 'id_sessions', 'order_sessions',
       'session_type_sessions', 'time_duration_sessions', 'code_name_sessions',
       'name_e

ID columns:

* id_ex_ex
* **exercise_id_ex_ex**
* **session_set_execution_id**
* id_session_set_ex (the same as session_set_execution_id)
* **session_block_execution_id**
* id_exercises (the same as exercise_id_ex_ex)
* id_session_block_ex (the same as session_block_execution_id)
* **session_execution_id**
* id_session_execution (the same as session_execution_id)
* **user_program_id**
* **session_id_x**
* id_sessions (the same as session_id_x)
* warmup_id_sessions
* cooldown_id_sessions
* id_user_programs (the same as user_program_id)
* **user_id_user_programs**
* **program_id_user_programs**
* current_session_id_user_programs
* id_programs (the same as program_id_user_programs)
* id_users (the same as user_id_user_programs)
* id_session_blocks
* session_id_session_blocks (the same as session_id_x)


In [33]:
merge_8 = merge_8.drop(['exercise_id_ex_ex',
             'session_set_execution_id',
             'session_block_execution_id',
             'session_execution_id',
             'user_program_id',
             'session_id_x',
             'user_id_user_programs',
             'program_id_user_programs'],
           axis = 1)

NameError: name 'merge_8' is not defined

In [40]:
merge_8.shape

(6707127, 84)

In [41]:
exercise_implements2.shape

(239, 3)

In [34]:
merge_9 = merge_7.merge(exercise_implements2, how = 'left', left_on = 'id_exercises', right_on = 'exercise_id')


merge_9.rename(columns = {'id': 'id_exercise_implements',
                            'exercise_id': 'exercise_id_exercise_implements',
                            'implement_id': 'implement_id_exercise_implements'},
                inplace = True)

merge_9.shape


(2211535, 92)

In [32]:
merge_9.columns

Index(['id_ex_ex', 'reps_executed_ex_ex', 'execution_time_ex_ex',
       'order_ex_ex', 'created_at_ex_ex', 'updated_at_ex_ex',
       'id_session_set_ex', 'order_session_set_ex', 'id_exercises',
       'created_at_exercises', 'updated_at_exercises',
       'body_parts_focused_exercises', 'muscles_exercises', 'joints_exercises',
       'met_multiplier_exercises', 'name_en_exercises', 'id_session_block_ex',
       'order_session_block_ex', 'id_session_execution',
       'difficulty_feedback_session_execution',
       'enjoyment_feedback_session_execution',
       'reps_executed_session_execution', 'execution_time_session_execution',
       'discarded_session_execution', 'discard_reason_session_execution',
       'id_sessions', 'order_sessions', 'session_type_sessions',
       'time_duration_sessions', 'code_name_sessions', 'name_en_sessions',
       'calories_sessions', 'warmup_id_sessions', 'cooldown_id_sessions',
       'id_user_programs', 'created_at_user_programs',
       'updated_a

In [33]:
merge_9.head()

Unnamed: 0,id_ex_ex,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,id_exercises,created_at_exercises,...,height[m],BMI,BMI_category,id_session_blocks,session_id_session_blocks,order_session_blocks,block_type_session_blocks,id_exercise_implements,exercise_id_exercise_implements,implement_id_exercise_implements
0,1279660,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,5236,2020-10-15 12:37:15.906971,...,1.84,24.52,Normal,1849.0,754.0,1,18.0,,,
1,1279660,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,5236,2020-10-15 12:37:15.906971,...,1.84,24.52,Normal,1850.0,754.0,2,16.0,,,
2,1279660,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,5236,2020-10-15 12:37:15.906971,...,1.84,24.52,Normal,1851.0,754.0,3,18.0,,,
3,1279660,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,5236,2020-10-15 12:37:15.906971,...,1.84,24.52,Normal,1852.0,754.0,4,18.0,,,
4,1279661,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,5968,2020-10-16 09:33:15.172657,...,1.84,24.52,Normal,1849.0,754.0,1,18.0,,,


In [35]:
merge_10 = merge_9.merge(implements2, how = 'left', left_on = 'implement_id_exercise_implements', right_on = 'id')


merge_10.rename(columns = {'id': 'id_implements',
                            'name_en': 'name_en_implements'},
                inplace = True)

merge_10.shape

(2211535, 94)

In [36]:
merge_10.columns

Index(['id_ex_ex', 'exercise_id_ex_ex', 'session_set_execution_id',
       'reps_executed_ex_ex', 'execution_time_ex_ex', 'order_ex_ex',
       'created_at_ex_ex', 'updated_at_ex_ex', 'id_session_set_ex',
       'order_session_set_ex', 'session_block_execution_id', 'id_exercises',
       'created_at_exercises', 'updated_at_exercises',
       'body_parts_focused_exercises', 'muscles_exercises', 'joints_exercises',
       'met_multiplier_exercises', 'name_en_exercises', 'id_session_block_ex',
       'session_execution_id', 'order_session_block_ex',
       'id_session_execution', 'user_program_id',
       'difficulty_feedback_session_execution',
       'enjoyment_feedback_session_execution',
       'reps_executed_session_execution', 'execution_time_session_execution',
       'updated_at_session_execution', 'session_id',
       'discarded_session_execution', 'discard_reason_session_execution',
       'id_sessions', 'order_sessions', 'session_type_sessions',
       'time_duration_sessions',

In [37]:
merge_10 = merge_10.drop(['exercise_id_exercise_implements',
                         'implement_id_exercise_implements'],
                       axis = 1)

In [39]:
# merge_10.to_csv(r'data/final_data_1.csv'+ '.bz2', header=True, index=False,compression='bz2')


In [17]:
merge_10.head()

Unnamed: 0,id_ex_ex,exercise_id_ex_ex,session_set_execution_id,reps_executed_ex_ex,execution_time_ex_ex,order_ex_ex,created_at_ex_ex,updated_at_ex_ex,id_session_set_ex,order_session_set_ex,...,BMI_category,id_session_blocks,session_id_session_blocks,order_session_blocks,block_type_session_blocks,id_exercise_implements,exercise_id_exercise_implements,implement_id_exercise_implements,id_implements,name_en_implements
0,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,Normal,1849.0,754.0,1,18.0,,,,,
1,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,Normal,1850.0,754.0,2,16.0,,,,,
2,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,Normal,1851.0,754.0,3,18.0,,,,,
3,1279660,5236,47654,10,66,1,2021-10-29 13:07:01.918992,2021-10-29 13:07:01.918992,47654.0,1.0,...,Normal,1852.0,754.0,4,18.0,,,,,
4,1279661,5968,47654,0,15,2,2021-10-29 13:07:01.924569,2021-10-29 13:07:01.924569,47654.0,1.0,...,Normal,1849.0,754.0,1,18.0,,,,,


In [38]:
session_executions2.head()

Unnamed: 0,id,user_program_id,difficulty_feedback,enjoyment_feedback,reps_executed,execution_time,session_id,discarded,discard_reason
0,4201,2016,7.0,4.0,600.0,,659,f,
1,4283,2272,,,,,536,t,4.0
2,4399,2393,4.0,3.0,144.0,,692,f,
3,4850,2791,7.0,5.0,576.0,,644,f,
4,4672,2447,5.0,3.0,174.0,,691,f,


In [30]:
pd.set_option('display.max_columns', None)

In [31]:
merge_10

MemoryError: 

MemoryError: 