In [1]:
import pandas as pd
import pandas_profiling
import glob


In [5]:


def get_json_to_df(file_list=None):
    """Read every JSON file in ``file_list`` and stack the results row-wise.

    Parameters
    ----------
    file_list : iterable of str, optional
        Paths passed one by one to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames, in the order the paths were
        given. Each file keeps its own index, so index labels can repeat
        across files.

    Raises
    ------
    ValueError
        If ``file_list`` is omitted or contains no paths.
    """
    # Materialise once so generators work and no mutable default is shared.
    paths = list(file_list) if file_list is not None else []
    if not paths:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not hint at the usual cause (a glob pattern that matched nothing).
        raise ValueError(
            "get_json_to_df received no files to read; "
            "check that the file pattern matched at least one file"
        )
    return pd.concat([pd.read_json(path) for path in paths])

def merge_dataframes(df1, df2):
    """Outer-join ``df1`` and ``df2`` on their ``dateTime`` column.

    Rows whose ``dateTime`` appears in only one of the frames are kept, with
    the other frame's columns left missing.
    """
    return pd.merge(left=df1, right=df2, on='dateTime', how='outer')

def make_new_df_value(x='', column_name='', default=0.0):
    """Return ``x[column_name]``, or ``default`` when ``x`` cannot supply it.

    Parameters
    ----------
    x : object
        Value to subscript, e.g. a nested record taken from a column.
    column_name : hashable
        Key to look up in ``x``.
    default : object, optional
        Value returned when the lookup fails. It stays ``0.0`` for backward
        compatibility; pass ``float('nan')`` to keep missing readings from
        being counted as real zeros in later statistics.

    Returns
    -------
    object
        ``x[column_name]`` on success, otherwise ``default``.
    """
    try:
        return x[column_name]
    # Only the errors subscripting itself produces are treated as "missing":
    # TypeError for non-subscriptable values (float NaN, None), LookupError
    # for an absent key or index. Anything else is left to propagate.
    except (TypeError, LookupError) as e:
        print(e)
        return default


In [6]:
## Build one list of export files per metric.
## glob.glob returns matches in arbitrary, filesystem-dependent order, so each
## list is sorted to make the order in which files are read reproducible.
heart_rate_file_list = sorted(glob.glob('data/user-site-export/heart_rate-*'))
steps_file_list = sorted(glob.glob('data/user-site-export/steps-*'))
altitude_file_list = sorted(glob.glob('data/user-site-export/altitude-*'))
calories_file_list = sorted(glob.glob('data/user-site-export/calories-*'))

In [7]:
## Load each metric's export files into its own dataframe.
## Heart-rate records carry a nested 'value' object, so its 'bpm' and
## 'confidence' entries are lifted into columns of their own and the nested
## column is then discarded.
heart_rate_df = get_json_to_df(file_list=heart_rate_file_list).reset_index(drop=True)
heart_rate_df['bpm'] = heart_rate_df['value'].map(lambda record: make_new_df_value(record, 'bpm'))
heart_rate_df['confidence'] = heart_rate_df['value'].map(lambda record: make_new_df_value(record, 'confidence'))
heart_rate_df = heart_rate_df.drop('value', axis=1)

## For the remaining metrics the 'value' column is renamed after the metric.
steps_df = get_json_to_df(file_list=steps_file_list)
steps_df = steps_df.rename(columns={'value': 'steps'})

altitude_df = get_json_to_df(file_list=altitude_file_list)
altitude_df = altitude_df.rename(columns={'value': 'altitude'})

calories_df = get_json_to_df(file_list=calories_file_list)
calories_df = calories_df.rename(columns={'value': 'calories'})


'float' object is not subscriptable
'float' object is not subscriptable


In [8]:
## Join the four metric frames on 'dateTime', starting from heart rate and
## adding steps, altitude and calories in that order.
merged = merge_dataframes(
    merge_dataframes(
        merge_dataframes(heart_rate_df, steps_df),
        altitude_df,
    ),
    calories_df,
)

## Save the combined table (index included) for later use.
merged.to_csv('merged_export_data.csv')


# Exploratory Data Analysis

In [None]:
# Build a profiling report for the merged dataframe. The report object is the
# cell's last expression, so the notebook renders it as the cell output.
pandas_profiling.ProfileReport(merged)

0,1
Number of variables,6
Number of observations,1151504
Total Missing (%),50.2%
Total size in memory,61.5 MiB
Average record size in memory,56.0 B

0,1
Numeric,5
Categorical,0
Boolean,0
Date,1
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),99.9%
Missing (n),1150127
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,14.771
Minimum,10
Maximum,70
Zeros (%),0.0%

0,1
Minimum,10
5-th percentile,10
Q1,10
Median,10
Q3,20
95-th percentile,30
Maximum,70
Range,60
Interquartile range,10

0,1
Standard deviation,8.148
Coef of variation,0.55161
Kurtosis,6.416
Mean,14.771
MAD,6.3824
Skewness,2.2009
Sum,20340
Variance,66.39
Memory size,17.6 MiB

Value,Count,Frequency (%),Unnamed: 3
10.0,921,0.1%,
20.0,311,0.0%,
30.0,109,0.0%,
40.0,23,0.0%,
50.0,7,0.0%,
60.0,5,0.0%,
70.0,1,0.0%,
(Missing),1150127,99.9%,

Value,Count,Frequency (%),Unnamed: 3
10.0,921,0.1%,
20.0,311,0.0%,
30.0,109,0.0%,
40.0,23,0.0%,
50.0,7,0.0%,

Value,Count,Frequency (%),Unnamed: 3
30.0,109,0.0%,
40.0,23,0.0%,
50.0,7,0.0%,
60.0,5,0.0%,
70.0,1,0.0%,

0,1
Distinct count,154
Unique (%),0.0%
Missing (%),12.8%
Missing (n),147360
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,73.322
Minimum,0
Maximum,194
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,54
Q1,60
Median,67
Q3,79
95-th percentile,115
Maximum,194
Range,194
Interquartile range,19

0,1
Standard deviation,19.262
Coef of variation,0.2627
Kurtosis,1.6722
Mean,73.322
MAD,14.68
Skewness,1.4667
Sum,73625000
Variance,371.01
Memory size,17.6 MiB

Value,Count,Frequency (%),Unnamed: 3
59.0,41594,3.6%,
58.0,40967,3.6%,
60.0,40685,3.5%,
61.0,39332,3.4%,
57.0,38797,3.4%,
62.0,38543,3.3%,
63.0,38336,3.3%,
64.0,36457,3.2%,
65.0,34259,3.0%,
56.0,33678,2.9%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1,0.0%,
36.0,2,0.0%,
37.0,24,0.0%,
38.0,70,0.0%,
39.0,53,0.0%,

Value,Count,Frequency (%),Unnamed: 3
187.0,1,0.0%,
190.0,1,0.0%,
191.0,2,0.0%,
193.0,1,0.0%,
194.0,1,0.0%,

0,1
Distinct count,166
Unique (%),0.0%
Missing (%),85.8%
Missing (n),988073
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.0931
Minimum,1.16
Maximum,15.72
Zeros (%),0.0%

0,1
Minimum,1.16
5-th percentile,1.16
Q1,1.16
Median,1.27
Q3,1.5
95-th percentile,6.47
Maximum,15.72
Range,14.56
Interquartile range,0.34

0,1
Standard deviation,1.7967
Coef of variation,0.85841
Kurtosis,3.9207
Mean,2.0931
MAD,1.3099
Skewness,2.1591
Sum,342070
Variance,3.2283
Memory size,17.6 MiB

Value,Count,Frequency (%),Unnamed: 3
1.16,74949,6.5%,
1.27,17061,1.5%,
1.39,15160,1.3%,
1.5,13239,1.1%,
1.28,2683,0.2%,
3.0,2183,0.2%,
1.51,1957,0.2%,
1.73,1926,0.2%,
3.24,1911,0.2%,
3.7,1598,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1.16,74949,6.5%,
1.27,17061,1.5%,
1.28,2683,0.2%,
1.39,15160,1.3%,
1.5,13239,1.1%,

Value,Count,Frequency (%),Unnamed: 3
13.64,1,0.0%,
13.87,1,0.0%,
14.33,1,0.0%,
14.79,1,0.0%,
15.72,1,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),12.8%
Missing (n),147360
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.8589
Minimum,0
Maximum,3
Zeros (%),3.2%

0,1
Minimum,0
5-th percentile,1
Q1,1
Median,2
Q3,3
95-th percentile,3
Maximum,3
Range,3
Interquartile range,2

0,1
Standard deviation,0.92028
Coef of variation,0.49506
Kurtosis,-1.3233
Mean,1.8589
MAD,0.81351
Skewness,0.0026754
Sum,1866600
Variance,0.84691
Memory size,17.6 MiB

Value,Count,Frequency (%),Unnamed: 3
1.0,396537,34.4%,
3.0,327867,28.5%,
2.0,243240,21.1%,
0.0,36500,3.2%,
(Missing),147360,12.8%,

Value,Count,Frequency (%),Unnamed: 3
0.0,36500,3.2%,
1.0,396537,34.4%,
2.0,243240,21.1%,
3.0,327867,28.5%,

Value,Count,Frequency (%),Unnamed: 3
0.0,36500,3.2%,
1.0,396537,34.4%,
2.0,243240,21.1%,
3.0,327867,28.5%,

0,1
Distinct count,1151504
Unique (%),100.0%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2018-10-17 00:00:00
Maximum,2019-02-07 11:50:00

0,1
Distinct count,134
Unique (%),0.0%
Missing (%),89.7%
Missing (n),1033430
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.5991
Minimum,0
Maximum,161
Zeros (%),8.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,45
Maximum,161
Range,161
Interquartile range,0

0,1
Standard deviation,19.825
Coef of variation,3.0042
Kurtosis,15.037
Mean,6.5991
MAD,10.829
Skewness,3.8271
Sum,779180
Variance,393.03
Memory size,17.6 MiB

Value,Count,Frequency (%),Unnamed: 3
0.0,96349,8.4%,
7.0,951,0.1%,
8.0,935,0.1%,
6.0,808,0.1%,
9.0,771,0.1%,
10.0,685,0.1%,
11.0,576,0.1%,
12.0,554,0.0%,
14.0,533,0.0%,
4.0,518,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,96349,8.4%,
1.0,97,0.0%,
2.0,77,0.0%,
3.0,46,0.0%,
4.0,518,0.0%,

Value,Count,Frequency (%),Unnamed: 3
142.0,1,0.0%,
146.0,2,0.0%,
159.0,2,0.0%,
160.0,2,0.0%,
161.0,1,0.0%,

Unnamed: 0,dateTime,bpm,confidence,steps,altitude,calories
0,2018-12-26 23:00:01,64.0,3.0,,,
1,2018-12-26 23:00:11,65.0,3.0,,,
2,2018-12-26 23:00:16,64.0,3.0,,,
3,2018-12-26 23:00:21,66.0,3.0,,,
4,2018-12-26 23:00:26,67.0,3.0,,,


In [None]:
# The report can also be written to an .html file.
# The report is built again here and bound to `profile`, because the inline
# report above was displayed without being stored in a variable.
profile = pandas_profiling.ProfileReport(merged)
# NOTE(review): `outputfile` is the keyword as spelled for this call; the name
# may differ between pandas-profiling releases — confirm against the installed version.
profile.to_file(outputfile="fitbit_data_export_analysis.html")