##### This notebook preprocesses the initial training set and creates a cleaned version of the training set to be provided to the clustering algorithm.
In more detail it contains the following steps:
1. It encodes features (e.g., with one-hot encoding and binary-encoding) that contain series as values
2. It runs dataprep and identifies basic preprocessing
3. It implements basic preprocessing actions on the corresponding features
4. It implements preprocessing on the different features' granularity
5. It identifies and reports the preprocessing actions needed for each feature individually
6. It drops features that exceed a specific NaN threshold
7. It drops users that exceed a specific NaN threshold
8. It implements the preprocessing actions identified in step 2


In [57]:
import warnings
import numpy as np
import pandas as pd
from dataprep.eda import create_report
from functions import training_set_preprocessing

# Silence every warning category for the rest of the session. Note that this also
# hides any pandas deprecation / chained-assignment warnings that later cells may raise.
warnings.filterwarnings("ignore")

Load the initial training set

In [63]:
# Read the pickled initial training set and display it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
training_df = pd.read_pickle(filepath_or_buffer='../data/training_df.pkl')
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,badge_type,badge_value,calories,distance,...,minutes_after_wakeup,time_in_bed,sleep_efficiency,main_sleep,consciousness_raising_category,counterconditioning_category,helping_relationships_category,stimulus_control_category,step_goal,place
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,,,27.35,19650.0,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,,,2.35,0.0,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,,,44.50,26880.0,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,,,46.06,32050.0,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,,,2.35,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,WORK/SCHOOL
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,HOME
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,TRANSIT
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,HOME


##### 1. Encode features that contain series as values

Find the features that contain series and arrays as values

In [64]:
# Candidate features are every column except the first three
# (id, date and hour in the frame displayed above).
features = list(training_df.columns)[3:]

# Keep the features in which at least one cell holds a numpy array or a pandas Series.
preprocessing_features = []
for feature in features:
    holds_sequence = training_df[feature].apply(
        lambda value: isinstance(value, (np.ndarray, pd.Series))
    )
    if holds_sequence.any():
        preprocessing_features.append(feature)

print(len(preprocessing_features))
preprocessing_features

19


['badge_type',
 'badge_value',
 'exercise',
 'exercise_calories',
 'exercise_avg_hr',
 'exercise_duration',
 'exercise_steps',
 'exercise_sedentary_minutes',
 'exercise_lightly_minutes',
 'exercise_fairly_minutes',
 'exercise_very_minutes',
 'exercise_Out of Range_zone_minutes',
 'exercise_Out of Range_zone_calories',
 'exercise_Fat Burn_zone_minutes',
 'exercise_Fat Burn_zone_calories',
 'exercise_Cardio_zone_minutes',
 'exercise_Cardio_zone_calories',
 'exercise_Peak_zone_minutes',
 'exercise_Peak_zone_calories']

Preprocessing for badge_type and badge_value

In [65]:
# Encode the categorical/numerical pair (badge_type, badge_value): every row is
# passed to encode_row and the per-row results are stacked back into a frame.
badge_columns = ['badge_type', 'badge_value']
df_badges = training_df[badge_columns]
encoded_rows = [
    training_set_preprocessing.encode_row(row, list(badge_columns))
    for _, row in df_badges.iterrows()
]
new_df = pd.concat(encoded_rows, axis=1).T

# Attach the encoded columns to the training frame and discard the two raw badge columns.
training_df = pd.concat([training_df, new_df], axis=1).drop(columns=badge_columns)
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,helping_relationships_category,stimulus_control_category,step_goal,place,LIFETIME_DISTANCE,LIFETIME_FLOORS,DAILY_STEPS,DAILY_FLOORS,LIFETIME_WEIGHT_GOAL_SETUP,GOAL_BASED_WEIGHT_LOSS
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,WORK/SCHOOL,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,HOME,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,TRANSIT,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,HOME,,,,,,


Preprocessing for exercise features (17)

In [67]:
# Work on an independent copy of the exercise columns: the original assigned new
# columns into a .loc slice of training_df, which is chained assignment
# (SettingWithCopy) and may or may not write through to the parent frame.
exercise_df = training_df.loc[:, 'exercise':'exercise_Peak_zone_calories'].copy()

# Remember the raw exercise columns so they can be removed from training_df later.
delete_features = list(exercise_df.columns)

# calculate the sum and avg of specific features:
# every column from exercise_steps onwards plus exercise_calories is reduced with
# replace_with_sum, while exercise_avg_hr is reduced with replace_with_avg.
sum_features = list(exercise_df.loc[:, 'exercise_steps':'exercise_Peak_zone_calories'].columns)
sum_features.append('exercise_calories')
for feature in sum_features:
    exercise_df[feature] = exercise_df[feature].apply(training_set_preprocessing.replace_with_sum)
exercise_df['exercise_avg_hr'] = exercise_df['exercise_avg_hr'].apply(training_set_preprocessing.replace_with_avg)

# encode a pair of categorical-numerical features (exercise, exercise_duration)
encoding_df = exercise_df[['exercise', 'exercise_duration']]
new_df = pd.concat(
    [training_set_preprocessing.encode_row(row, list(encoding_df.columns)) for _, row in encoding_df.iterrows()],
    axis=1,
).T

# merge with the final dataframe and drop unnecessary columns
exercise_df = pd.concat([exercise_df, new_df], axis=1).drop(columns=['exercise', 'exercise_duration'])
training_df = training_df.drop(columns=delete_features)
training_df = pd.concat([training_df, exercise_df], axis=1)
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [68]:
# Persist the encoded training frame so the following sections can start from it.
training_df.to_pickle(path='../data/encoded_training_df.pkl')

##### 2. Run dataprep and identify basic preprocessing

In [58]:
# Reload the encoded training frame written at the end of section 1 and display it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
training_df = pd.read_pickle(filepath_or_buffer='../data/encoded_training_df.pkl')
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Swim,Aerobic Workout,Tennis,Sport,Interval Workout,Spinning,Weights,Elliptical,Treadmill,Bootcamp
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,


In [4]:
# Build the dataprep EDA report for the encoded training frame, titled
# 'Training Dataframe', and show it via the report's browser view.
report = create_report(training_df, title='Training Dataframe')
report.show_browser()

  0%|          | 0/17968 [00:00<?, ?it/s]

##### Tests

In [12]:
# Order the rows chronologically and parse the date column into datetimes.
training_df = training_df.sort_values(by='date', ascending=True)
training_df['date'] = pd.to_datetime(training_df['date'].astype("str"), format='%Y-%m-%d')

# Rows that fall strictly inside either of the two date windows (bounds are exclusive).
dates = training_df['date']
in_experiment = ((dates > '2021-05-23') & (dates < '2021-07-27')) | (
        (dates > '2021-11-14') & (dates < '2022-01-18'))
experiments_df = training_df.loc[in_experiment].reset_index(drop=True)

# Rows outside the windows are simply the complement of the mask. The previous
# concat + drop_duplicates(keep=False) anti-join also discarded out-of-window rows
# that happened to be exact duplicates of each other, under-counting them.
extra = training_df.loc[~in_experiment]
print("There are", len(extra), "rows out of experiment dates from", extra['id'].nunique(), "unique users.")

There are 48230 rows out of experiment dates from 71 unique users.


##### 3. Implement basic preprocessing

step_goal preprocessing

In [59]:
# convert from categorical to numerical: 'NO_GOAL' becomes 0 and the column is cast to float.
# The result is assigned back to the column; calling replace(inplace=True) on the
# selected column is chained in-place mutation, which pandas does not guarantee
# will reach training_df.
training_df['step_goal'] = (
    training_df['step_goal']
    .replace(to_replace=['NO_GOAL'], value=[0])
    .astype(float)
)

main_sleep preprocessing

In [60]:
# binary encoding: map True -> 1.
# The result is assigned back to the column; calling replace(inplace=True) on the
# selected column is chained in-place mutation, which pandas does not guarantee
# will reach training_df.
# NOTE(review): only True is replaced here; confirm whether False should also be mapped to 0.
training_df['main_sleep'] = training_df['main_sleep'].replace(to_replace=[True], value=[1])

consciousness_raising, counterconditioning, helping_relationships and stimulus_control preprocessing

In [61]:
# replace with 0,1,2 (ordinal relationship): Below average < Average < Above average.
# Each encoded column is assigned back explicitly; calling replace(inplace=True) on
# the selected column is chained in-place mutation, which pandas does not guarantee
# will reach training_df.
for feature in ['consciousness_raising_category', 'counterconditioning_category', 'helping_relationships_category', 'stimulus_control_category']:
    training_df[feature] = training_df[feature].replace(
        to_replace=['Below average', 'Average', 'Above average'], value=[0, 1, 2]
    )

place preprocessing

In [62]:
# one-hot-encoding of `place`.
# explode() turns list-like cells into one row per element (scalars pass through) and
# dropna() removes missing places, so rows without a place get no dummy row at all.
# groupby(level=0).sum() collapses the dummies back to one row per original index;
# it replaces DataFrame.sum(level=0), which was removed in pandas 2.0.
place_dummies = (
    pd.get_dummies(training_df['place'].explode().dropna())
    .groupby(level=0)
    .sum()
    # Align to the full frame: rows with no place become NaN in every dummy column.
    .reindex(training_df.index)
)

# Swap the raw `place` column for its dummy columns.
training_df = pd.concat([training_df.drop(columns='place'), place_dummies], axis=1)

In [63]:
# Persist the frame produced by the basic preprocessing steps of section 3.
training_df.to_pickle(path='../data/basic_preprocessed_training_df.pkl')

##### 4. Preprocess the different features' granularity

In [64]:
# Reload the basic-preprocessed training frame written at the end of section 3 and display it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
training_df = pd.read_pickle(filepath_or_buffer='../data/basic_preprocessed_training_df.pkl')
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,mindfulness_goal,...,Treadmill,Bootcamp,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,27.35,19650.0,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,2.35,0.0,314.0,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,44.50,26880.0,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,46.06,32050.0,272.0,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,2.35,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Group together features that need the same granularity preprocessing

In [None]:
# Features grouped for the same granularity preprocessing.
# NOTE(review): the name suggests daily values that get copied to finer-grained rows - confirm.
# Stray whitespace inside two column names and an empty-string placeholder were removed,
# since none of them can match a real column of training_df.
copy_daily = ['exertion_points', 'lightly_active_minutes', 'moderately_active_minutes', 'sedentary_minutes', 'very_active_minutes', 'step_goal']