##### This notebook preprocesses the initial training set and creates a cleaned version of the training set to be provided to the clustering algorithm.
In more detail it contains the following steps:
1. It encodes features (e.g., with one-hot encoding and binary-encoding) that contain series as values
2. It explores the encoded dataframe to remove unnecessary rows and columns
3. It runs dataprep and identifies basic preprocessing
4. It implements basic preprocessing actions on the corresponding features
5. It implements preprocessing on the different features' granularity
6. It runs dataprep, identifies and reports the preprocessing actions need per each feature individually
7. It drops features that exceed a specific NaN threshold and creates some representatives
8. It drops records that exceed a specific NaN threshold
9. It implements the preprocessing actions identified on step 4
10. It implements additional preprocessing and engineering actions


In [33]:
import time
import warnings
import numpy as np
import pandas as pd
from dataprep.eda import create_report
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.append('../')
from functions import training_set_preprocessing

warnings.filterwarnings("ignore")

Load the initial training set

In [28]:
training_df = pd.read_pickle('../data/preprocessing_final/training_df_hourly_unprocessed.pkl')
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,badge_type,badge_value,calories,distance,...,minutes_after_wakeup,time_in_bed,sleep_efficiency,main_sleep,consciousness_raising_category,counterconditioning_category,helping_relationships_category,stimulus_control_category,step_goal,place
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,,,27.35,19650.0,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,,,2.35,0.0,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,,,44.50,26880.0,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,,,46.06,32050.0,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,,,2.35,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,WORK/SCHOOL
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,HOME
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,TRANSIT
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,HOME


##### 1. Encode features that contain series as values

Find the features that contain series and arrays as values

In [30]:
preprocessing_features = training_set_preprocessing.features_to_encode(training_df)
preprocessing_features

The features that need encoding are: 19


['badge_type',
 'badge_value',
 'exercise',
 'exercise_calories',
 'exercise_avg_hr',
 'exercise_duration',
 'exercise_steps',
 'exercise_sedentary_minutes',
 'exercise_lightly_minutes',
 'exercise_fairly_minutes',
 'exercise_very_minutes',
 'exercise_Out of Range_zone_minutes',
 'exercise_Out of Range_zone_calories',
 'exercise_Fat Burn_zone_minutes',
 'exercise_Fat Burn_zone_calories',
 'exercise_Cardio_zone_minutes',
 'exercise_Cardio_zone_calories',
 'exercise_Peak_zone_minutes',
 'exercise_Peak_zone_calories']

Preprocessing for badge_type and badge_value

In [8]:
# encode a pair of categorical-numerical features
df_badges = training_df[['badge_type', 'badge_value']]
new_df = pd.concat([training_set_preprocessing.encode_row(row, list(df_badges.columns)) for _, row in df_badges.iterrows()], axis=1).T

# merge with the final dataframe and drop unnecessary columns
training_df = pd.concat([training_df, new_df], axis=1)
training_df.drop(columns=['badge_type', 'badge_value'], inplace=True)
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,helping_relationships_category,stimulus_control_category,step_goal,place,LIFETIME_DISTANCE,LIFETIME_FLOORS,DAILY_STEPS,DAILY_FLOORS,LIFETIME_WEIGHT_GOAL_SETUP,GOAL_BASED_WEIGHT_LOSS
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,WORK/SCHOOL,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,HOME,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,TRANSIT,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,HOME,,,,,,


Preprocessing for exercise features (17)

In [9]:
# calculate the sum and avg of specific features
exercise_df = training_df.loc[:, 'exercise':'exercise_Peak_zone_calories']
sum_features = list(exercise_df.loc[:, 'exercise_steps': 'exercise_Peak_zone_calories'].columns)
sum_features.append('exercise_calories')
for feature in sum_features:
    exercise_df[feature] = exercise_df[feature].apply(lambda d: training_set_preprocessing.replace_with_sum(d))
exercise_df['exercise_avg_hr'] = exercise_df['exercise_avg_hr'].apply(lambda d: training_set_preprocessing.replace_with_avg(d))

# encode a pair of categorical-numerical features
encoding_df = exercise_df[['exercise', 'exercise_duration']]
new_df = pd.concat([training_set_preprocessing.encode_row(row, list(encoding_df.columns)) for _, row in encoding_df.iterrows()], axis=1).T

# merge with the final dataframe and drop unnecessary columns
exercise_df = pd.concat([exercise_df, new_df], axis=1)
exercise_df.drop(columns=['exercise', 'exercise_duration'], inplace=True)
delete_features = list(training_df.loc[:, 'exercise':'exercise_Peak_zone_calories'].columns)
training_df.drop(columns=delete_features, inplace=True)
training_df = pd.concat([training_df, exercise_df], axis=1)
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [10]:
training_df.to_pickle('../data/preprocessing_temps/encoded_training_df.pkl')

##### 2. Explore the encoded dataframe to remove unnecessary rows and columns
- Does the encoded dataframe contains duplicates?
- How many users do they participate in the training process based on the encoded dataframe?
- Does the encoded dataframe contains rows only with NaNs?
- Does the encoded dataframe contains columns only with NaNs?
- How many records derived out of official experiment days and do they affect this work?
- Concatenate the date and hour columns into one datetime object column.

In [2]:
training_df = pd.read_pickle('../data/preprocessing_temps/encoded_training_df.pkl')
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [3]:
# check for duplicates based on all features
# Finding: 1 exact duplicate has been found - when all features were present there were not duplicates, but after features splitting this one duplicate has appeared.
# Approach: Drop them
training_df.drop_duplicates(inplace=True)
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [4]:
# find the users that are participating in the training
print(training_df['id'].nunique(), "users are participating in the training process.")

71 users are participating in the training process.


In [5]:
# find rows with no information - 5164 such rows have been found
columns = list(training_df.loc[:, 'sleep_points':].columns)
cleaned_training_df = training_df[~training_df[columns].isnull().all(1)]
cleaned_training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [6]:
# find columns with no information - as expected 0 such columns have been found
for col in columns:
    if cleaned_training_df[col].isnull().all():
        cleaned_training_df.drop(columns=col, inplace=True)
cleaned_training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [7]:
# find how many records derived out of the official experiment dates and if affect this work
training_set_preprocessing.experiments_dates(cleaned_training_df)

There are 45225 rows out of experiment dates from 69 unique users with 'normal distribution'.


In [48]:
# concatenate the date and hour columns into one datetime object column
cleaned_training_df['date'] = pd.to_datetime(cleaned_training_df['date']) + cleaned_training_df['hour'] * pd.to_timedelta('1H')
cleaned_training_df.drop(columns=['hour'], inplace=True)
cleaned_training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
164878,621e375b67b776a240290cdc,2021-01-06 15:00:00,,,,,,,,,...,,,,,,,,,,
164960,621e2f6167b776a240e082a9,2021-01-06 20:00:00,,,,,,,,,...,,,,,,,,,,
164856,621e301367b776a24057738e,2021-01-06 23:00:00,,,,,,,,,...,,,,,,,,,,
164874,621e362467b776a2404ad513,2021-01-08 13:00:00,,,,,,,,,...,,,,,,,,,,
164858,621e30c867b776a240d4aa6c,2021-01-08 22:00:00,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56523,621e301e67b776a240608a72,2022-01-22 00:00:00,,,,0.9,,0.0,,0.0,...,,,,,,,,,,
164964,621e310d67b776a24003096d,2022-01-24 14:00:00,,,,,,,,,...,,,,,,,,,,
164870,621e346f67b776a24081744f,2022-01-24 10:00:00,,,,,,,,,...,,,,,,,,,,
164855,621e300767b776a2404dc717,2022-01-24 12:00:00,,,,,,,,,...,,,,,,,,,,


In [49]:
cleaned_training_df.to_pickle('../data/preprocessing_temps/cleaned_encoded_training_df.pkl')

##### 3. Run dataprep and identify basic preprocessing

In [2]:
training_df = pd.read_pickle('../data/preprocessing_temps/cleaned_encoded_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
164878,621e375b67b776a240290cdc,2021-01-06 15:00:00,,,,,,,,,...,,,,,,,,,,
164960,621e2f6167b776a240e082a9,2021-01-06 20:00:00,,,,,,,,,...,,,,,,,,,,
164856,621e301367b776a24057738e,2021-01-06 23:00:00,,,,,,,,,...,,,,,,,,,,
164874,621e362467b776a2404ad513,2021-01-08 13:00:00,,,,,,,,,...,,,,,,,,,,
164858,621e30c867b776a240d4aa6c,2021-01-08 22:00:00,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56523,621e301e67b776a240608a72,2022-01-22 00:00:00,,,,0.9,,0.0,,0.0,...,,,,,,,,,,
164964,621e310d67b776a24003096d,2022-01-24 14:00:00,,,,,,,,,...,,,,,,,,,,
164870,621e346f67b776a24081744f,2022-01-24 10:00:00,,,,,,,,,...,,,,,,,,,,
164855,621e300767b776a2404dc717,2022-01-24 12:00:00,,,,,,,,,...,,,,,,,,,,


In [3]:
report = create_report(training_df, title='Training Dataframe')
report.show_browser()

  0%|          | 0/17571 [00:00<?, ?it/s]

##### 4. Implement basic preprocessing

step_goal preprocessing

In [58]:
# convert from categorical to numerical
training_df['step_goal'].replace(to_replace=['NO_GOAL'], value=[0], inplace=True)
training_df['step_goal'] = training_df['step_goal'].astype(float)

main_sleep preprocecssing

In [59]:
# binary encoding
training_df['main_sleep'].replace(to_replace=[True], value=[1], inplace=True)

consciousness_raising, counterconditioning, helping_relationships and stimulus_control preprocessing

In [60]:
# replace with 0,1,2 (ordinal relationship)
for feature in ['consciousness_raising_category', 'counterconditioning_category', 'helping_relationships_category', 'stimulus_control_category']:
    training_df[feature].replace(to_replace=['Below average', 'Average', 'Above average'], value=[0, 1 ,2], inplace=True)

place preprocessing

In [61]:
# one-hot-encoding
s = training_df['place']
dum = pd.get_dummies(s.apply(pd.Series).stack()).sum(level=0)
df = pd.concat([s, dum], axis=1)
training_df = pd.concat([training_df, df], axis=1)
training_df = training_df.drop(columns='place')

In [62]:
training_df.to_pickle('../data/preprocessing_temps/basic_preprocessed_training_df.pkl')

##### 5. Preprocess the different features' granularity

In [11]:
start = time.time()
print("Loading and sorting ... ")
training_df = pd.read_pickle('../data/preprocessing_temps/basic_preprocessed_training_df.pkl')
training_df = training_df.sort_values(['id', 'date'], ignore_index=True)
print("finished after", time.time() - start)
training_df

Loading and sorting ... 
finished after 0.22036361694335938


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,,,,2.29,0.0,,,,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,,,,1.09,,,,,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,,,,7.75,1100.0,,,,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,,,,2.51,0.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,,,,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,,,,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,,5.0,,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,,,,...,,,,,,,,,,


Group together features that need the same granularity preprocessing

In [12]:
# if the feature has value for any time in the day copy this values for the whole day
copy_value_for_whole_day = ['exertion_points', 'lightly_active_minutes', 'moderately_active_minutes', 'sedentary_minutes', 'very_active_minutes', 'step_goal', 'minutes_below_zone_1', 'minutes_in_zone_1', 'minutes_in_zone_2', 'minutes_in_zone_3', 'sleep_points', 'water_amount', 'mindfulness_goal'] # 13

In [13]:
start = time.time()
print("Copy value for the whole day ... ")
for _, user_df in training_df.groupby(['id', 'date']):
    for feature in copy_value_for_whole_day:
        non_null_feature = user_df[feature].notnull().any() # if it has value
        if non_null_feature: # if it has value in a day
            training_df.loc[(training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'].dt.date == user_df['date'].iloc[0].date()), feature] = user_df[feature].iloc[0] # copy for the whole day
print("finished after", time.time() - start)
training_df

Copy value for the whole day ... 
finished after 9077.134395360947


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [14]:
training_df.to_pickle('../data/granularities_temps/after_whole_day_training_df.pkl')

In [15]:
# if the feature has value for any time in the day copy this values for the rest duration of the day
copy_value_for_rest_day = ['altitude', 'sleep_duration', 'minutes_to_fall_asleep', 'minutes_asleep', 'minutes_awake', 'minutes_after_wakeup', 'time_in_bed', 'sleep_efficiency', 'main_sleep', 'LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'DAILY_STEPS', 'DAILY_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP', 'GOAL_BASED_WEIGHT_LOSS'] # 15

In [16]:
start = time.time()
print("Copy value for the rest day ... ")
for _, user_df in training_df.groupby(['id', 'date']):
    for feature in copy_value_for_rest_day:
        non_null_feature = user_df[feature].notnull().any()
        if non_null_feature: # if it has value
            training_df.loc[(training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'].dt.date == user_df['date'].iloc[0].date()) & (training_df['date'].dt.hour > user_df['date'].iloc[0].hour), feature] = user_df[feature].iloc[0] # copy for the rest day
print("finished after", time.time() - start)
training_df

Copy value for the rest day ... 
finished after 5413.36278629303


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [17]:
training_df.to_pickle('../data/granularities_temps/after_rest_day_training_df.pkl')

In [18]:
# if the user has answered only once, copy this value for all the records of the user, else if the user has answered twice, copy for the first month the first answer and for the rest of its records copy the second answer
behavior_granularities = ['consciousness_raising_category', 'counterconditioning_category', 'helping_relationships_category', 'stimulus_control_category'] # 4

In [19]:
start = time.time()
print("Copy value for behavior features ... ")
for _, user_df in training_df.groupby(['id']):
    for feature in behavior_granularities:
        non_null_feature = user_df[feature].notnull().any()
        if non_null_feature: # if it has value
            unique_values = user_df[feature].dropna().unique() # find the number of unique values

            if len(unique_values) == 1: # if it has only one value copy the value in all of its records
                training_df.loc[(training_df['id'] == user_df['id'].iloc[0]), feature] = unique_values[0]
            elif len(unique_values) >= 2: # if it has more than one values

                # find the earliest answer
                earliest_non_nan_date = user_df['date'].loc[user_df[feature].notna()].min()
                earliest_value = user_df[feature].loc[user_df['date'] == earliest_non_nan_date].iloc[0]

                if len(unique_values) > 2: # if accidentally user has more than 2 answers, keep the first and the last
                    # find the latest answer
                    latest_non_nan_date = user_df['date'].loc[user_df[feature].notna()].max()
                    latest_value = user_df[feature].loc[user_df['date'] == latest_non_nan_date].iloc[0]
                    unique_values = [earliest_value, latest_value]

                # create a mask for the next 30 days that the first answer will be copied
                next_30_days_mask = (training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'] < earliest_non_nan_date + pd.DateOffset(days=30))
                training_df.loc[next_30_days_mask, feature] = earliest_value

                # copy the second answer in all the other records
                training_df.loc[(training_df['id'] == user_df['id'].iloc[0]), feature] = training_df.loc[(training_df['id'] == user_df['id'].iloc[0]), feature].fillna(unique_values[1])
print("finished after", time.time() - start)
training_df

Copy value for behavior features ... 
finished after 27.345142126083374


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [20]:
training_df.to_pickle('../data/granularities_temps/after_behavior_granularities_df.pkl')

In [21]:
# if the feature has value for any time in the day copy this values for the past duration of the day
copy_value_for_past_day = ['ENTERTAINMENT', 'GYM', 'HOME', 'HOME_OFFICE', 'OTHER', 'OUTDOORS', 'TRANSIT', 'WORK/SCHOOL'] # 8

In [22]:
start = time.time()
print("Copy value for the past day ... ")
for _, user_df in training_df.groupby(['id', 'date']):
    for feature in copy_value_for_past_day:
        non_null_feature = user_df[feature].notnull().any()
        if non_null_feature: # if it has value
            training_df.loc[(training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'].dt.date == user_df['date'].iloc[0].date()) & (training_df['date'].dt.hour < user_df['date'].iloc[0].hour), feature] = user_df[feature].iloc[0] # copy for the past day
print("finished after", time.time() - start)
training_df

Copy value for the past day ... 
finished after 6974.341124296188


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [23]:
training_df.to_pickle('../data/granularities_temps/after_past_day_training_df.pkl')

In [24]:
# if the feature has value copy this values for the duration of the feature
copy_value_for_duration = ['Circuit Training', 'Walk', 'Run', 'Workout', 'Bike', 'Hike', 'Martial Arts', 'Yoga/Pilates', 'Swim', 'Aerobic Workout', 'Tennis', 'Sport', 'Interval Workout', 'Spinning', 'Weights', 'Elliptical', 'Treadmill', 'Bootcamp'] # 18

In [25]:
# if the feature has value copy this values for the duration of the feature
copy_value_when_exercise_has = ['exercise_calories', 'exercise_steps', 'exercise_sedentary_minutes', 'exercise_lightly_minutes', 'exercise_fairly_minutes', 'exercise_very_minutes', 'exercise_avg_hr', 'exercise_Out of Range_zone_minutes', 'exercise_Out of Range_zone_calories', 'exercise_Fat Burn_zone_minutes', 'exercise_Fat Burn_zone_calories', 'exercise_Cardio_zone_minutes', 'exercise_Cardio_zone_calories', 'exercise_Peak_zone_minutes', 'exercise_Peak_zone_calories'] # 15

In [26]:
# divide by 3600000 to convert milliseconds to hours
for feature in copy_value_for_duration:
    training_df[feature] = training_df[feature].apply(lambda d: d/3600000)

In [27]:
start = time.time()
print("Copy value for exercise features ... ")
for _, user_df in training_df.groupby(['id', 'date']):
    for feature in copy_value_for_duration:
        non_null_feature = user_df[feature].notnull().any()
        if non_null_feature: # if it has value
            duration = user_df[feature].values[0]
            integer_part = int(duration)
            decimal_part = round(float(duration % 1),2)

            if decimal_part >= 0.5: # if the decimal part of the hour is more than 0.5 then add one more hour/record
                # create a mask for the previous hours that the value will be copied
                previous_hours_mask = (training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'] >= user_df['date'].iloc[0] - pd.DateOffset(hours=integer_part+1)) & (training_df['date'] < user_df['date'].iloc[0]) # copy for the duration
            else:
                # create a mask for the previous hours that the value will be copied
                previous_hours_mask = (training_df['id'] == user_df['id'].iloc[0]) & (training_df['date'] >= user_df['date'].iloc[0] - pd.DateOffset(hours=integer_part)) & (training_df['date'] < user_df['date'].iloc[0]) # copy for the duration

            training_df.loc[previous_hours_mask, feature] = duration

            for f in copy_value_when_exercise_has: # copy the value of all the other exercise information based on the above mask
                training_df.loc[previous_hours_mask, f] = user_df[f].iloc[0]
print("finished after", time.time() - start)
training_df

Copy value for exercise features ... 
finished after 932.230384349823


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [28]:
training_df.to_pickle('../data/granularities_temps/after_duration_exercise_training_df.pkl')

##### 6. Run dataprep, identify and report the final preprocessing actions

In [4]:
training_df = pd.read_pickle('../data/granularities_temps/after_duration_exercise_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [5]:
report = create_report(training_df, title='Granularity preprocessed')
report.show_browser()

  0%|          | 0/18495 [00:00<?, ?it/s]

##### 7. Drop features that exceed a specific threshold for NaN values and create some representative ones

In [60]:
training_df = pd.read_pickle('../data/granularities_temps/after_duration_exercise_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [61]:
nan_percentages = training_df.isna().mean()
drop_columns = nan_percentages[nan_percentages > 0.8].index.tolist()
if not drop_columns:
    print("No columns have more than 80% NaN values.")
else:
    print("The columns with more than 80% NaN values are:", drop_columns)

The columns with more than 80% NaN values are: ['mindfulness_goal', 'water_amount', 'deep', 'light', 'rem', 'wake', 'sleep_duration', 'minutes_to_fall_asleep', 'minutes_asleep', 'minutes_awake', 'minutes_after_wakeup', 'time_in_bed', 'sleep_efficiency', 'main_sleep', 'LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'DAILY_STEPS', 'DAILY_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP', 'GOAL_BASED_WEIGHT_LOSS', 'exercise_calories', 'exercise_avg_hr', 'exercise_steps', 'exercise_sedentary_minutes', 'exercise_lightly_minutes', 'exercise_fairly_minutes', 'exercise_very_minutes', 'exercise_Out of Range_zone_minutes', 'exercise_Out of Range_zone_calories', 'exercise_Fat Burn_zone_minutes', 'exercise_Fat Burn_zone_calories', 'exercise_Cardio_zone_minutes', 'exercise_Cardio_zone_calories', 'exercise_Peak_zone_minutes', 'exercise_Peak_zone_calories', 'Circuit Training', 'Walk', 'Run', 'Workout', 'Bike', 'Hike', 'Martial Arts', 'Yoga/Pilates', 'Swim', 'Aerobic Workout', 'Tennis', 'Sport', 'Interval Workout', 'Spin

Create the sleep features

In [62]:
""" This function helps at correcting the inconsistencies in sleep features, by removing the values of the sleep_duration, deep, rem, light, wake column if they are between 22:00-23:00. It returns the dataframe where the sleep sessions all start at 00:00:00. """
def remove_sleep_records(group):
    mask = group['date'].dt.hour.between(21, 23)
    group.loc[mask, features] = np.nan
    return group

features = ['deep', 'light', 'rem', 'wake', 'sleep_duration']
training_df = training_df.groupby('id').apply(remove_sleep_records)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


In [63]:
# sleep_duration: how long the user slept last night
""" This function copies the value of the sleep features within the whole day. It returns the dataframe with the sleep feature values for the whole day. """
def copy_day(group):
    first_values = group['sleep_duration'].iloc[0]
    group['sleep_duration'] = first_values
    return group

training_df = training_df.groupby(['id', training_df['date'].dt.date]).apply(copy_day)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,


Create the badges features

In [64]:
# badges: how many badge types the user get this day
badges_df = training_df[['LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'DAILY_STEPS', 'DAILY_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP', 'GOAL_BASED_WEIGHT_LOSS']]
training_df['badges'] = badges_df.count(axis=1)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,,,0
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,,,0
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,,,0
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,,,0
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,0
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,0
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,0
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,,,0


Create the exercise features

In [65]:
# exercises: how many exercise sessions types the user did this day
exercises_df = training_df[['Circuit Training', 'Walk', 'Run', 'Workout', 'Bike', 'Hike', 'Martial Arts', 'Yoga/Pilates', 'Swim', 'Aerobic Workout', 'Tennis', 'Sport', 'Interval Workout', 'Spinning', 'Weights', 'Elliptical', 'Treadmill', 'Bootcamp']]
# find per hour
training_df['exercises'] = exercises_df.count(axis=1)
# find per day
training_df['exercises'] = training_df.groupby(['id', training_df['date'].dt.date])['exercises'].transform('sum')

# exercise_duration: how long the user did exercise this day
# find per hour
training_df['exercise_duration'] = exercises_df.sum(axis=1)
# find per day
training_df['exercise_duration'] = training_df.groupby(['id', training_df['date'].dt.date])['exercise_duration'].transform(lambda x: x.unique().sum())
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,moderately_active_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,,24.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,,24.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,,24.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,,24.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,,24.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,5.0,0.0,...,,,,,,,,0,0,0.000000


Drop the features

In [66]:
drop_columns.remove('sleep_duration')
training_df.drop(columns=drop_columns, inplace=True)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000


In [67]:
training_df.to_pickle('../data/preprocessing_temps/dropped_columns_training_df.pkl')

##### 8. Drop records that exceed a specific threshold for NaN values

In [68]:
training_df = pd.read_pickle('../data/preprocessing_temps/dropped_columns_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000


In [69]:
threshold = int(0.8 * len(training_df.columns))
dropped_records_training_df = training_df[training_df.isnull().sum(axis=1) <= threshold]
dropped_records_training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000


In [70]:
dropped_records_training_df.to_pickle('../data/preprocessing_temps/dropped_rows_training_df.pkl')

##### 9. Implement the final preprocessing

In [71]:
training_df = pd.read_pickle('../data/preprocessing_temps/dropped_rows_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.0,27.0,,16.82,7260.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.0,27.0,,2.29,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.0,27.0,,1.09,,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.0,27.0,,7.75,1100.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.0,27.0,,2.51,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,,,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000


For the features below replace all the NaN and 0 values with the user mean if available, otherwise with the general mean

In [72]:
nan_0_with_mean = ['exertion_points', 'sleep_points'] # 2

In [73]:
training_df = training_set_preprocessing.replace_nan_0_with_mean(training_df, nan_0_with_mean)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,,16.82,7260.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,,2.29,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,,1.09,,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,,7.75,1100.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,,2.51,0.0,149.0,24.0,713.0,...,,,,,,,,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,,0.90,,0.0,0.0,706.0,...,,,,,,,,0,0,0.000000


For the features below replace all the NaN values with 0

In [76]:
nan_with_0 = ['altitude', 'step_goal', 'ENTERTAINMENT', 'GYM', 'HOME', 'HOME_OFFICE', 'OTHER', 'OUTDOORS', 'TRANSIT', 'WORK/SCHOOL', 'badges', 'exercises', 'exercise_duration', 'sleep_duration'] # 14

In [77]:
training_df = training_set_preprocessing.replace_nan_with_0(training_df, nan_with_0)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.0,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.0,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.0,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.0,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000


For the features below replace all the NaN values with the user mean if available, otherwise with the general mean

In [78]:
nan_with_mean = ['distance', 'lightly_active_minutes', 'moderately_active_minutes', 'sedentary_minutes', 'steps', 'very_active_minutes', 'minutes_below_zone_1', 'minutes_in_zone_1', 'minutes_in_zone_2', 'minutes_in_zone_3', 'calories'] # 11

In [79]:
training_df = training_set_preprocessing.replace_nan_with_mean(training_df, nan_with_mean)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,19184.302326,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000


For the features below replace all the NaN values with the most common value of all the other users

In [80]:
nan_with_common = ['consciousness_raising_category', 'counterconditioning_category', 'helping_relationships_category', 'stimulus_control_category'] # 4

In [81]:
training_df = training_set_preprocessing.replace_nan_with_common(training_df, nan_with_common)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,19184.302326,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000


Convert the categorical into numerical features

In [82]:
categoricals = ['consciousness_raising_category', 'counterconditioning_category', 'helping_relationships_category', 'stimulus_control_category', 'ENTERTAINMENT', 'GYM', 'HOME', 'HOME_OFFICE', 'OTHER', 'OUTDOORS', 'TRANSIT', 'WORK/SCHOOL']

In [83]:
for categorical in categoricals:
    training_df[categorical] = training_df[categorical].astype(float)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,19184.302326,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000


In [84]:
training_df.to_pickle('../data/preprocessing_temps/final_preprocessed_training_df.pkl')

In [85]:
report = create_report(training_df, title='Final preprocessed')
report.show_browser()

  0%|          | 0/4934 [00:00<?, ?it/s]

Manual check for NaN values and categorical features in the whole dataframe

In [86]:
# check for NaN values
if training_df.isna().any().any():
    print("There are some NaN values in the dataset.")
else:
    print("There are no NaN values in the whole dataset!")

There are no NaN values in the whole dataset!


In [87]:
# check for categorical features and values
if training_df.applymap(lambda x: isinstance(x, float)).all().all():
    print("There are some categorical values (and probably features) in the dataset.")
else:
    print("There are no categorical values and features in the whole dataset!")

There are no categorical values and features in the whole dataset!


##### 10. Implement the additional preprocessing and engineering actions

In [2]:
training_df = pd.read_pickle('../data/preprocessing_temps/final_preprocessed_training_df.pkl')
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,badges,exercises,exercise_duration
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,19184.302326,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.000000,149.0,24.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.966944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.000000


Date engineering (calculate day and hour sin and cos transformations, weekdays and weekends, and holidays)

In [3]:
training_df = training_set_preprocessing.date_engineering(training_df)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,WORK/SCHOOL,badges,exercises,exercise_duration,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,25.000000,27.000000,0.0,16.82,7260.000000,149.0,24.0,713.0,...,0.0,0,2,0.966944,0.0,0.0,-0.988468,0.000000,0.151428,1.000000
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,25.000000,27.000000,0.0,2.29,0.000000,149.0,24.0,713.0,...,0.0,0,2,0.966944,0.0,0.0,-0.988468,0.258819,0.151428,0.965926
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,25.000000,27.000000,0.0,1.09,19184.302326,149.0,24.0,713.0,...,0.0,0,2,0.966944,0.0,0.0,-0.988468,0.500000,0.151428,0.866025
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,25.000000,27.000000,0.0,7.75,1100.000000,149.0,24.0,713.0,...,0.0,0,2,0.966944,0.0,0.0,-0.988468,0.707107,0.151428,0.707107
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,25.000000,27.000000,0.0,2.51,0.000000,149.0,24.0,713.0,...,0.0,0,2,0.966944,0.0,0.0,-0.988468,0.866025,0.151428,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0,0,0.000000,0.0,0.0,-0.299363,0.965926,-0.954139,-0.258819
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0,0,0.000000,0.0,0.0,-0.299363,0.866025,-0.954139,-0.500000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0,0,0.000000,0.0,0.0,-0.299363,0.707107,-0.954139,-0.707107
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,21.560606,30.378788,0.0,0.90,24201.239049,0.0,0.0,706.0,...,0.0,0,0,0.000000,0.0,0.0,-0.299363,0.500000,-0.954139,-0.866025


In [4]:
training_df.to_pickle('../data/preprocessing_temps/date_engineered_training_df.pkl')

Normalize the dataframe

In [5]:
training_df = pd.read_pickle('../data/preprocessing_temps/date_engineered_training_df.pkl')

In [6]:
min_max_scaler = MinMaxScaler()
features = list(training_df.columns)
features.remove('id')
features.remove('date')
for feature in features:
    training_df[[feature]] = min_max_scaler.fit_transform(training_df[[feature]])
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,WORK/SCHOOL,badges,exercises,exercise_duration,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,0.810469,0.622928,0.0,0.029382,0.008276,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.500000,0.574623,1.000000
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,0.810469,0.622928,0.0,0.002914,0.000000,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.629410,0.574623,0.982963
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,0.810469,0.622928,0.0,0.000729,0.021869,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.750000,0.574623,0.933013
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,0.810469,0.622928,0.0,0.012860,0.001254,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.853553,0.574623,0.853553
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,0.810469,0.622928,0.0,0.003315,0.000000,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.933013,0.574623,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.982963,0.020417,0.370590
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.933013,0.020417,0.250000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.853553,0.020417,0.146447
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.750000,0.020417,0.066987


In [7]:
training_df.to_pickle('../data/preprocessing_temps/preprocessed_training_df_hourly.pkl')