In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
import datetime
import seaborn

# Loading the dataset

In [2]:
# Read the raw Garmin daily-data export (comma-separated) into a DataFrame.
garmin_csv_df = pd.read_csv(
    '../../data/client_data/raw/garmin/garmin-daily-data-1697722717275.csv',
    sep=',',
)

In [3]:
# Preview the first rows of the raw data.
garmin_csv_df.head()

Unnamed: 0,id,sleep_value,sleep_target,steps,calories_value,calories_target,createdAt,updatedAt,intensity,min_heart_rate,max_heart_rate,daily_step_goal
0,1,254,480,4210,1731,2500,2023-05-16T17:30:00.456Z,2023-05-16T23:55:00.165Z,,,,
1,2,254,480,4210,1731,2500,2023-05-16T17:30:00.665Z,2023-05-16T23:55:00.346Z,,,,
2,3,254,480,4210,1731,2500,2023-05-16T17:30:00.858Z,2023-05-16T23:55:00.530Z,,,,
3,4,254,480,4210,1731,2500,2023-05-17T00:00:00.236Z,2023-05-17T18:10:00.504Z,,,,
4,5,254,480,4210,1731,2500,2023-05-17T00:00:00.419Z,2023-05-17T23:55:00.194Z,,,,


In [4]:
# Show each column's dtype and non-null count for the raw data.
garmin_csv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               17 non-null     int64  
 1   sleep_value      17 non-null     int64  
 2   sleep_target     17 non-null     int64  
 3   steps            17 non-null     int64  
 4   calories_value   17 non-null     int64  
 5   calories_target  17 non-null     int64  
 6   createdAt        17 non-null     object 
 7   updatedAt        17 non-null     object 
 8   intensity        0 non-null      float64
 9   min_heart_rate   0 non-null      float64
 10  max_heart_rate   0 non-null      float64
 11  daily_step_goal  0 non-null      float64
dtypes: float64(4), int64(6), object(2)
memory usage: 1.7+ KB


# Handling NaNs

In [5]:
# Drop every column that contains only NaN values.
# The columns are detected with dropna(axis=1, how='all') rather than listed by
# name: a hard-coded drop(columns=[...]) raises KeyError when this cell is run a
# second time (the columns are already gone) and would silently discard real
# data if a later export populated one of those fields.
garmin_csv_df = garmin_csv_df.dropna(axis=1, how='all')
garmin_csv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               17 non-null     int64 
 1   sleep_value      17 non-null     int64 
 2   sleep_target     17 non-null     int64 
 3   steps            17 non-null     int64 
 4   calories_value   17 non-null     int64 
 5   calories_target  17 non-null     int64 
 6   createdAt        17 non-null     object
 7   updatedAt        17 non-null     object
dtypes: int64(6), object(2)
memory usage: 1.2+ KB


# Convert date time fields to datetime data type

In [6]:
# Parse both timestamp columns from strings into pandas datetimes.
for _ts_col in ('createdAt', 'updatedAt'):
    garmin_csv_df[_ts_col] = pd.to_datetime(garmin_csv_df[_ts_col])

In [7]:
# Re-inspect the dtypes to confirm createdAt and updatedAt are now datetimes.
garmin_csv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   id               17 non-null     int64              
 1   sleep_value      17 non-null     int64              
 2   sleep_target     17 non-null     int64              
 3   steps            17 non-null     int64              
 4   calories_value   17 non-null     int64              
 5   calories_target  17 non-null     int64              
 6   createdAt        17 non-null     datetime64[ns, UTC]
 7   updatedAt        17 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(6)
memory usage: 1.2 KB


# Exploring data values

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataset.
garmin_csv_df.describe()

Unnamed: 0,id,sleep_value,sleep_target,steps,calories_value,calories_target
count,17.0,17.0,17.0,17.0,17.0,17.0
mean,16.058824,254.0,480.0,4210.0,1731.0,2500.0
std,17.177276,0.0,0.0,0.0,0.0,0.0
min,1.0,254.0,480.0,4210.0,1731.0,2500.0
25%,5.0,254.0,480.0,4210.0,1731.0,2500.0
50%,9.0,254.0,480.0,4210.0,1731.0,2500.0
75%,13.0,254.0,480.0,4210.0,1731.0,2500.0
max,47.0,254.0,480.0,4210.0,1731.0,2500.0


### ^ A std of 0 shows that every record holds identical values for each quantitative field; only `id` varies

### Only the `id` of each record is given; there is no userId, email, etc. to identify the user

# Exploring the datetime values

In [9]:
# Gap between each record's createdAt and that of the row before it.
garmin_csv_df.loc[:, 'createdAt'].diff(periods=1)

0                       NaT
1    0 days 00:00:00.209000
2    0 days 00:00:00.193000
3    0 days 06:29:59.378000
4    0 days 00:00:00.183000
5    0 days 00:00:00.193000
6    0 days 19:00:00.096000
7    0 days 04:59:59.576000
8    0 days 00:00:00.154000
9    0 days 00:00:00.181000
10   0 days 23:59:59.592000
11   0 days 00:00:00.188000
12   0 days 00:00:00.198000
13   0 days 16:00:00.167000
14   0 days 07:59:59.475000
15   0 days 23:59:59.984000
16   1 days 00:00:00.079000
Name: createdAt, dtype: timedelta64[ns]

### ^ Many of the differences are very small (well under a second); perhaps this is linked to syncing?

In [10]:
# Elapsed time between creation and last update, per record.
garmin_csv_df['updatedAt'].sub(garmin_csv_df['createdAt'])

0    0 days 06:24:59.709000
1    0 days 06:24:59.681000
2    0 days 06:24:59.672000
3    0 days 18:10:00.268000
4    0 days 23:54:59.775000
5    0 days 23:54:59.732000
6    0 days 04:54:59.800000
7    0 days 23:54:59.971000
8    0 days 23:54:59.992000
9    0 days 23:54:59.992000
10   0 days 16:30:00.059000
11   0 days 16:30:00.066000
12   0 days 16:30:00.109000
13   0 days 07:54:59.765000
14   0 days 23:55:00.265000
15   0 days 23:54:59.958000
16   0 days 07:00:00.014000
dtype: timedelta64[ns]

In [11]:
# Mean elapsed time between createdAt and updatedAt across all records.
garmin_csv_df['updatedAt'].sub(garmin_csv_df['createdAt']).mean()

Timedelta('0 days 16:07:38.754588235')

# Checking that the JSON file provides the same information

In [12]:
# NOTE: before this cell can run, the 'data' part of the JSON file has to be
# removed by hand so that only the records remain in the file.
json_df = pd.read_json(
    '../../data/client_data/raw/garmin/garmin-daily-data-1697722712474 (1).json',
    orient='records',
)
json_df.head()

Unnamed: 0,id,sleep_value,sleep_target,steps,calories_value,calories_target,createdAt,updatedAt,intensity,min_heart_rate,max_heart_rate,daily_step_goal,user
0,46,254,480,4210,1731,2500,2023-05-21T00:00:00.223Z,2023-05-21T23:55:00.181Z,,,,,
1,1,254,480,4210,1731,2500,2023-05-16T17:30:00.456Z,2023-05-16T23:55:00.165Z,,,,,
2,2,254,480,4210,1731,2500,2023-05-16T17:30:00.665Z,2023-05-16T23:55:00.346Z,,,,,
3,3,254,480,4210,1731,2500,2023-05-16T17:30:00.858Z,2023-05-16T23:55:00.530Z,,,,,
4,5,254,480,4210,1731,2500,2023-05-17T00:00:00.419Z,2023-05-17T23:55:00.194Z,,,,,


In [13]:
# Load a fresh, unmodified copy of the raw CSV export.
garmin_csv_df_original = pd.read_csv(
    '../../data/client_data/raw/garmin/garmin-daily-data-1697722717275.csv',
    sep=',',
)


In [14]:
# Bring both frames into the same shape: order rows by id, use id as the
# index, and discard columns that contain only NaN values.
json_df = (
    json_df
    .sort_values(by=['id'])
    .set_index('id')
    .dropna(axis=1, how='all')
)
garmin_csv_df_original = (
    garmin_csv_df_original
    .sort_values(by=['id'])
    .set_index('id')
    .dropna(axis=1, how='all')
)

In [15]:
# Compare the JSON export with the raw CSV frame that was normalised the same
# way (sorted by id, indexed by id, all-NaN columns dropped).
# The cleaned `garmin_csv_df` is the wrong operand here: it still has a
# RangeIndex with `id` as a regular column and its timestamp columns were
# converted to datetimes, so comparing against it is False regardless of
# whether the two files hold the same records.
json_df.equals(garmin_csv_df_original)

False

# Saving cleaned and preprocessed dataset

In [16]:
# Write the cleaned frame to disk; the DataFrame index is not written.
garmin_csv_df.to_csv(
    '../../data/client_data/cleaned/garmin/cleaned_garmin.csv',
    index=False,
)
