## L'utilisation quotidienne d'une montre connectée

In [45]:
# Standard library
import datetime as dt

# Third-party: numerical / tabular tooling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'ggplot' style sheet to the matplotlib figures drawn from here on.
plt.style.use('ggplot')

# Disabled display tweak kept for reference. NOTE(review): as written the call
# passes an option name without a value, so it would need one before being enabled.
# pd.set_option('max_column')

In [4]:
# Load the daily activity CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
ACTIVITY_CSV = 'dailyActivity_merged.csv'
df = pd.read_csv(ACTIVITY_CSV)

In [7]:
# Data exploration: (number of rows, number of columns) of the loaded frame

df.shape

(940, 15)

In [9]:
# Column names of the loaded frame

df.columns

Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
       'LoggedActivitiesDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
      dtype='object')

In [11]:
# First 5 rows of the data

df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [13]:
# Data type of each column, before any conversion is applied
df.dtypes

Id                            int64
ActivityDate                 object
TotalSteps                    int64
TotalDistance               float64
TrackerDistance             float64
LoggedActivitiesDistance    float64
VeryActiveDistance          float64
ModeratelyActiveDistance    float64
LightActiveDistance         float64
SedentaryActiveDistance     float64
VeryActiveMinutes             int64
FairlyActiveMinutes           int64
LightlyActiveMinutes          int64
SedentaryMinutes              int64
Calories                      int64
dtype: object

In [17]:
# Clean-up of column types.
# - Id is an identifier: no calculation is ever done on it, so an integer
#   type is not appropriate and it is stored as text instead.
# - ActivityDate is month/day/year text and is parsed into real datetimes.
id_as_text = df['Id'].astype(str)
parsed_dates = pd.to_datetime(df['ActivityDate'], format='%m/%d/%Y')

df['Id'] = id_as_text
df['ActivityDate'] = parsed_dates

# Display the dtypes again to confirm both conversions.
df.dtypes

Id                                  object
ActivityDate                datetime64[ns]
TotalSteps                           int64
TotalDistance                      float64
TrackerDistance                    float64
LoggedActivitiesDistance           float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
dtype: object

In [25]:
# TotalDistance and TrackerDistance look like they hold the same values.
# Store their row-wise difference in a new column: if every difference is 0,
# the two columns are identical.
df['distanceDiff'] = df['TotalDistance'].sub(df['TrackerDistance'])

In [29]:
# The two columns are different: count how often each difference value occurs

df['distanceDiff'].value_counts()

distanceDiff
0.000000    925
1.830000      1
0.190001      1
0.040000      1
0.810000      1
1.049999      1
0.760000      1
1.070000      1
0.980000      1
0.900001      1
1.140000      1
1.160000      1
0.880000      1
0.460000      1
1.160000      1
1.060000      1
Name: count, dtype: int64

In [33]:
# Lower-case every column label in place, then display the resulting labels.
df.columns = df.columns.map(str.lower)
df.columns

Index(['id', 'activitydate', 'totalsteps', 'totaldistance', 'trackerdistance',
       'loggedactivitiesdistance', 'veryactivedistance',
       'moderatelyactivedistance', 'lightactivedistance',
       'sedentaryactivedistance', 'veryactiveminutes', 'fairlyactiveminutes',
       'lightlyactiveminutes', 'sedentaryminutes', 'calories', 'distancediff',
       'distancediff'],
      dtype='object')

In [43]:
# Rename the lower-cased labels to snake_case.
# Labels that are not keys of the mapping (e.g. 'id', 'calories') are left as they are.
snake_case_names = {
    'activitydate': 'activity_date',
    'totalsteps': 'total_steps',
    'totaldistance': 'total_distance',
    'trackerdistance': 'tracker_distance',
    'loggedactivitiesdistance': 'logged_activities_distance',
    'veryactivedistance': 'very_active_distance',
    'moderatelyactivedistance': 'moderately_active_distance',
    'lightactivedistance': 'light_active_distance',
    'sedentaryactivedistance': 'sedentary_active_distance',
    'veryactiveminutes': 'very_active_minutes',
    'fairlyactiveminutes': 'fairly_active_minutes',
    'lightlyactiveminutes': 'lightly_active_minutes',
    'sedentaryminutes': 'sedentary_minutes',
}
df = df.rename(columns=snake_case_names)

df.columns

Index(['id', 'activity_date', 'total_steps', 'total_distance',
       'tracker_distance', 'logged_activities_distance',
       'very_active_distance', 'moderately_active_distance',
       'light_active_distance', 'sedentary_active_distance',
       'very_active_minutes', 'fairly_active_minutes',
       'lightly_active_minutes', 'sedentary_minutes', 'calories',
       'distancediff', 'distancediff'],
      dtype='object')

In [51]:
# Derive two weekday columns from the activity date:
#   day_of_week   -> name of the day (e.g. 'Tuesday')
#   n_day_of_week -> weekday number, 0 = Monday ... 6 = Sunday
activity_dates = df['activity_date']
day_of_week = activity_dates.dt.day_name()

df['day_of_week'] = day_of_week
df['n_day_of_week'] = activity_dates.dt.weekday

In [53]:
# Preview the first rows, now including day_of_week and n_day_of_week
df.head()

Unnamed: 0,id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories,distancediff,distancediff.1,day_of_week,n_day_of_week
0,1503960366,2016-04-12,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985,0.0,0.0,Tuesday,1
1,1503960366,2016-04-13,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797,0.0,0.0,Wednesday,2
2,1503960366,2016-04-14,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776,0.0,0.0,Thursday,3
3,1503960366,2016-04-15,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745,0.0,0.0,Friday,4
4,1503960366,2016-04-16,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863,0.0,0.0,Saturday,5


In [55]:
# Check for null values: element-wise True/False mask of missing entries
# NOTE(review): this renders the whole boolean frame; the per-column summary
# (df.isna().sum()) is usually the more readable check.

df.isna()

Unnamed: 0,id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories,distancediff,distancediff.1,day_of_week,n_day_of_week
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
936,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
937,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
938,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [57]:
# Check for null values: number of missing entries in each column

df.isna().sum()


id                            0
activity_date                 0
total_steps                   0
total_distance                0
tracker_distance              0
logged_activities_distance    0
very_active_distance          0
moderately_active_distance    0
light_active_distance         0
sedentary_active_distance     0
very_active_minutes           0
fairly_active_minutes         0
lightly_active_minutes        0
sedentary_minutes             0
calories                      0
distancediff                  0
distancediff                  0
day_of_week                   0
n_day_of_week                 0
dtype: int64

In [59]:
# Check for duplicates: number of rows that repeat an earlier row

df.duplicated().sum()

0

In [61]:
# Keep only a few columns for the analysis. The selection is copied so that
# df_sub is an independent frame rather than a view onto df.
analysis_columns = [
    'id', 'activity_date', 'total_steps', 'sedentary_active_distance',
    'very_active_minutes', 'fairly_active_minutes',
    'lightly_active_minutes', 'sedentary_minutes', 'calories',
]
df_sub = df.loc[:, analysis_columns].copy()

df_sub.head(3)

Unnamed: 0,id,activity_date,total_steps,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories
0,1503960366,2016-04-12,13162,0.0,25,13,328,728,1985
1,1503960366,2016-04-13,10735,0.0,21,19,217,776,1797
2,1503960366,2016-04-14,10460,0.0,30,11,181,1218,1776


In [65]:
# Analysis
# Sedentary if the average number of steps is < 5000
# Regularly active if steps = 8000, and very active if steps >= 10000 on average
# NOTE(review): averages in the 5000-8000 and 8000-10000 ranges are not assigned
# a category by the rule above; confirm the intended bins.
# List the distinct ids present in the subset.

df_sub['id'].unique()


array(['1503960366', '1624580081', '1644430081', '1844505072',
       '1927972279', '2022484408', '2026352035', '2320127002',
       '2347167796', '2873212765', '3372868164', '3977333714',
       '4020332650', '4057192912', '4319703577', '4388161847',
       '4445114986', '4558609924', '4702921684', '5553957443',
       '5577150313', '6117666160', '6290855005', '6775888955',
       '6962181067', '7007744171', '7086361926', '8053475328',
       '8253242879', '8378563200', '8583815059', '8792009665',
       '8877689391'], dtype=object)

In [67]:
# Mean of total_steps for each id, sorted from highest to lowest and
# returned as a one-column DataFrame indexed by id.
id_grp = df.groupby(['id'])
id_avg_step = (
    id_grp['total_steps']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
id_avg_step

Unnamed: 0_level_0,total_steps
id,Unnamed: 1_level_1
8877689391,16040.032258
8053475328,14763.290323
1503960366,12116.741935
2022484408,11370.645161
7007744171,11323.423077
3977333714,10984.566667
4388161847,10813.935484
6962181067,9794.806452
2347167796,9519.666667
7086361926,9371.774194
