In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [39]:
# Read the combined sensor CSV into a DataFrame and preview its first rows.
data_df = pd.read_csv(
    filepath_or_buffer="data/combined_sensor_data_with_features.csv"
)
data_df.head(n=5)

Unnamed: 0,timestamp,Watch_battery,Watch_pressure,Watch_step_counter,Watch_step_detector,Watch_HEART_RATE_X,Watch_HEART_RATE_Y,Watch_ACC_X,Watch_ACC_Y,Watch_ACC_Z,...,Phone_AUDIO_W,Phone_AUDIO_X,Phone_AUDIO_Y,Phone_AUDIO_Z,Phone_ROTATION_VECTOR_A,Phone_ROTATION_VECTOR_B,Phone_ROTATION_VECTOR_C,Phone_ROTATION_VECTOR_D,Phone_ROTATION_VECTOR_E,activity
0,2017-06-29 09:57:00,,,,1.0,,,,,,...,8513.0,7546.236,32767.0,45.0,,,,,,Eat
1,2017-06-29 09:57:03,,,,,79.0,1.0,,,,...,13602.0,7547.238,32767.0,45.0,,,,,,Eat
2,2017-06-29 09:57:08,82.0,,14.0,1.0,,,-10.173126,-1.471237,1.924393,...,5198.0,7548.5195,32767.0,45.0,,,,,,Eat
3,2017-06-29 09:57:09,,964.2831,,,,,,,,...,4686.0,7548.158,32767.0,45.0,,,,,,Eat
4,2017-06-29 09:57:10,,,,,76.0,1.0,,,,...,5239.0,7547.8667,32767.0,45.0,,,,,,Eat


In [40]:
# Parse the 'timestamp' strings into datetimes and promote that column to the
# index, as a single chained expression that yields a new frame.
data_df = (
    data_df
    .assign(timestamp=pd.to_datetime(data_df["timestamp"]))
    .set_index("timestamp")
)

In [41]:
# Show the column labels left now that 'timestamp' has become the index.
data_df.columns

Index(['Watch_battery', 'Watch_pressure', 'Watch_step_counter',
       'Watch_step_detector', 'Watch_HEART_RATE_X', 'Watch_HEART_RATE_Y',
       'Watch_ACC_X', 'Watch_ACC_Y', 'Watch_ACC_Z', 'Watch_GRAVITY_X',
       'Watch_GRAVITY_Y', 'Watch_GRAVITY_Z', 'Watch_GYRO_X', 'Watch_GYRO_Y',
       'Watch_GYRO_Z', 'Watch_LA_X', 'Watch_LA_Y', 'Watch_LA_Z', 'Watch_MAG_X',
       'Watch_MAG_Y', 'Watch_MAG_Z', 'Watch_ORI_X', 'Watch_ORI_Y',
       'Watch_ORI_Z', 'Watch_ROTATION_VECTOR_A', 'Watch_ROTATION_VECTOR_B',
       'Watch_ROTATION_VECTOR_C', 'Watch_ROTATION_VECTOR_D',
       'Watch_ROTATION_VECTOR_E', 'Glass_ACC_X', 'Glass_ACC_Y', 'Glass_ACC_Z',
       'Glass_GYRO_X', 'Glass_GYRO_Y', 'Glass_GYRO_Z', 'Glass_EOG_L',
       'Glass_EOG_R', 'Glass_EOG_H', 'Glass_EOG_V', 'Phone_battery',
       'Phone_light', 'Phone_pressure', 'Phone_proximity',
       'Phone_step_counter', 'Phone_step_detector', 'Phone_ACC_X',
       'Phone_ACC_Y', 'Phone_ACC_Z', 'Phone_GRAVITY_X', 'Phone_GRAVITY_Y',
       'Pho

In [42]:
# Number of missing entries per column, displayed as a one-column table.
data_df.isna().sum().to_frame()

Unnamed: 0,0
Watch_battery,284301
Watch_pressure,283161
Watch_step_counter,289648
Watch_step_detector,287108
Watch_HEART_RATE_X,266388
...,...
Phone_ROTATION_VECTOR_B,241105
Phone_ROTATION_VECTOR_C,241105
Phone_ROTATION_VECTOR_D,241105
Phone_ROTATION_VECTOR_E,241105


In [43]:
# Every column except the 'activity' label is a sensor feature. Deriving the
# list from the frame keeps it in sync with the CSV header instead of relying
# on a hand-copied list of 72 column names that can silently drift.
features = [column for column in data_df.columns if column != "activity"]

In [44]:
# Bridge short gaps in every sensor column: carry the last observation
# forward, then pull the next observation backward for whatever is still
# missing.
# `limit=60` caps each fill at 60 consecutive ROWS; it is not a time window,
# so with irregularly spaced timestamps it does not correspond to 60 seconds.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form and
# operate column-wise on the whole feature block, so no Python loop is needed.
# NOTE(review): the fill is applied across the whole frame, so values can be
# carried across a change of 'activity' label - confirm that this is intended.
data_df[features] = data_df[features].ffill(limit=60).bfill(limit=60)

In [45]:
# Percentage of missing values per column

In [46]:
# Share of missing entries in each column, expressed as a percentage.
missing_values = 100 * data_df.isna().mean()
missing_values

Watch_battery              87.815149
Watch_pressure             87.967219
Watch_step_counter         92.933571
Watch_step_detector        92.041795
Watch_HEART_RATE_X         87.815837
                             ...    
Phone_ROTATION_VECTOR_B    13.814612
Phone_ROTATION_VECTOR_C    13.814612
Phone_ROTATION_VECTOR_D    13.814612
Phone_ROTATION_VECTOR_E    13.814612
activity                    0.000000
Length: 73, dtype: float64

In [47]:
# Columns in which more than 30% of the rows are missing.
missing_values.loc[missing_values.gt(30)]

Watch_battery              87.815149
Watch_pressure             87.967219
Watch_step_counter         92.933571
Watch_step_detector        92.041795
Watch_HEART_RATE_X         87.815837
Watch_HEART_RATE_Y         87.815837
Watch_ACC_X                87.967219
Watch_ACC_Y                87.967219
Watch_ACC_Z                87.967219
Watch_GRAVITY_X            94.371697
Watch_GRAVITY_Y            94.371697
Watch_GRAVITY_Z            94.371697
Watch_GYRO_X               87.967219
Watch_GYRO_Y               87.967219
Watch_GYRO_Z               87.967219
Watch_LA_X                 94.371353
Watch_LA_Y                 94.371353
Watch_LA_Z                 94.371353
Watch_MAG_X                87.967219
Watch_MAG_Y                87.967219
Watch_MAG_Z                87.967219
Watch_ORI_X                87.967563
Watch_ORI_Y                87.967563
Watch_ORI_Z                87.967563
Watch_ROTATION_VECTOR_A    94.371353
Watch_ROTATION_VECTOR_B    94.371353
Watch_ROTATION_VECTOR_C    94.371353
W

In [48]:
# Names of the columns whose missing share exceeds 30%.
columns_to_drop = list(missing_values.index[missing_values > 30])
columns_to_drop

['Watch_battery',
 'Watch_pressure',
 'Watch_step_counter',
 'Watch_step_detector',
 'Watch_HEART_RATE_X',
 'Watch_HEART_RATE_Y',
 'Watch_ACC_X',
 'Watch_ACC_Y',
 'Watch_ACC_Z',
 'Watch_GRAVITY_X',
 'Watch_GRAVITY_Y',
 'Watch_GRAVITY_Z',
 'Watch_GYRO_X',
 'Watch_GYRO_Y',
 'Watch_GYRO_Z',
 'Watch_LA_X',
 'Watch_LA_Y',
 'Watch_LA_Z',
 'Watch_MAG_X',
 'Watch_MAG_Y',
 'Watch_MAG_Z',
 'Watch_ORI_X',
 'Watch_ORI_Y',
 'Watch_ORI_Z',
 'Watch_ROTATION_VECTOR_A',
 'Watch_ROTATION_VECTOR_B',
 'Watch_ROTATION_VECTOR_C',
 'Watch_ROTATION_VECTOR_D',
 'Watch_ROTATION_VECTOR_E',
 'Glass_ACC_X',
 'Glass_ACC_Y',
 'Glass_ACC_Z',
 'Glass_GYRO_X',
 'Glass_GYRO_Y',
 'Glass_GYRO_Z',
 'Glass_EOG_L',
 'Glass_EOG_R',
 'Glass_EOG_H',
 'Glass_EOG_V',
 'Phone_battery',
 'Phone_light',
 'Phone_proximity',
 'Phone_step_counter',
 'Phone_step_detector']

In [49]:
# Remove the sparse columns, rebinding the name to the reduced frame.
data_df = data_df.drop(columns=columns_to_drop)

In [50]:
# Show which columns remain after dropping those listed in `columns_to_drop`.
data_df.columns

Index(['Phone_pressure', 'Phone_ACC_X', 'Phone_ACC_Y', 'Phone_ACC_Z',
       'Phone_GRAVITY_X', 'Phone_GRAVITY_Y', 'Phone_GRAVITY_Z', 'Phone_GYRO_X',
       'Phone_GYRO_Y', 'Phone_GYRO_Z', 'Phone_LA_X', 'Phone_LA_Y',
       'Phone_LA_Z', 'Phone_MAG_X', 'Phone_MAG_Y', 'Phone_MAG_Z',
       'Phone_ORI_X', 'Phone_ORI_Y', 'Phone_ORI_Z', 'Phone_AUDIO_W',
       'Phone_AUDIO_X', 'Phone_AUDIO_Y', 'Phone_AUDIO_Z',
       'Phone_ROTATION_VECTOR_A', 'Phone_ROTATION_VECTOR_B',
       'Phone_ROTATION_VECTOR_C', 'Phone_ROTATION_VECTOR_D',
       'Phone_ROTATION_VECTOR_E', 'activity'],
      dtype='object')

# Sensor data count per activity

In [51]:
# For each activity label, count the non-missing readings in every column.
activity_data_count = data_df.groupby(by="activity").count()
activity_data_count

Unnamed: 0_level_0,Phone_pressure,Phone_ACC_X,Phone_ACC_Y,Phone_ACC_Z,Phone_GRAVITY_X,Phone_GRAVITY_Y,Phone_GRAVITY_Z,Phone_GYRO_X,Phone_GYRO_Y,Phone_GYRO_Z,...,Phone_ORI_Z,Phone_AUDIO_W,Phone_AUDIO_X,Phone_AUDIO_Y,Phone_AUDIO_Z,Phone_ROTATION_VECTOR_A,Phone_ROTATION_VECTOR_B,Phone_ROTATION_VECTOR_C,Phone_ROTATION_VECTOR_D,Phone_ROTATION_VECTOR_E
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
At home,56604,56588,56588,56588,56616,56616,56616,56598,56598,56598,...,56609,56633,56633,56633,56633,56610,56610,56610,56610,56610
At home/Eat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cooking,7264,7264,7264,7264,7264,7264,7264,7264,7264,7264,...,7264,7264,7264,7264,7264,7264,7264,7264,7264,7264
Eat,8709,8709,8709,8709,8709,8709,8709,8709,8709,8709,...,8709,8709,8709,8709,8709,8709,8709,8709,8709,8709
Eat/At home,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,...,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863
Eat/In computer,901,901,901,901,901,901,901,901,901,901,...,901,901,901,901,901,901,901,901,901,901
Eat/Phone was out of the pocket (forgot)/At home,1201,1201,1201,1201,1201,1201,1201,1201,1201,1201,...,1201,1201,1201,1201,1201,1201,1201,1201,1201,1201
Football,60,55,55,55,59,59,59,58,58,58,...,45,60,60,60,60,50,50,50,50,50
In bus,13670,13673,13673,13673,13673,13673,13673,13673,13673,13673,...,13663,13681,13681,13681,13681,13673,13673,13673,13673,13673
In computer,20417,20418,20418,20418,20425,20425,20425,20425,20425,20425,...,20417,20426,20426,20426,20426,20426,20426,20426,20426,20426


In [52]:
# Persist the per-activity counts to a CSV in the working directory so they
# can be inspected outside the notebook.
activity_data_count.to_csv(path_or_buf="Activity_vs_SensorDataCount.csv")

In [53]:
# Number of distinct activity labels present in the data.
data_df["activity"].nunique()

34

In [54]:
# (rows, columns) of the frame before any activity labels are removed.
data_df.shape

(290656, 29)

In [55]:
# Activity labels to exclude from the dataset.
labels_to_drop = ['At home/Eat', 'Football', 'Running', 'Watching TV/At home']

# Keep only the rows whose activity is not in `labels_to_drop`.
# A vectorised `isin` mask replaces the temporary 'is_label_to_drop' helper
# column, and the explicit `.copy()` makes the result an independent frame so
# later column assignments do not act on a slice of the original
# (the usual trigger for SettingWithCopyWarning).
data_df = data_df.loc[~data_df["activity"].isin(labels_to_drop)].copy()

print(data_df.shape)

(288371, 29)


In [56]:
# Row count per activity label after removing the excluded labels.
data_df["activity"].value_counts()

At home                                             67202
In computer                                         23791
Walk                                                20703
Picnic                                              18601
In bus                                              15607
Walking&party                                       14677
Video games/At home                                 12523
Pause                                               12382
In computer/Work                                     9722
Train                                                9576
In vehicle                                           9549
Sleep                                                9207
Eat                                                  8709
Meeting                                              8353
Cooking                                              7264
In computer/At home                                  6669
Movie                                                6422
Shopping& wear

In [57]:
# Mapping from an original activity label to the label it is merged into.
# Labels that do not appear as keys are meant to stay unchanged.
relabel_dict = {
    "On bus stop": "On bus stop/Walk",
    "Took off glasses/Shop/Walk": "Shop/Walk",
    # Both orderings of the computer + eating label collapse to "In computer".
    "In computer/Eat": "In computer",
    "Eat/In computer": "In computer",
}

In [58]:
# Apply the relabelling: labels found in `relabel_dict` are replaced by their
# mapped value, every other label is passed through untouched.
data_df["activity"] = data_df["activity"].map(
    lambda label: relabel_dict[label] if label in relabel_dict else label
)

In [59]:
# Row count per activity label after relabelling.
data_df["activity"].value_counts()

At home                                             67202
In computer                                         26793
Walk                                                20703
Picnic                                              18601
In bus                                              15607
Walking&party                                       14677
Video games/At home                                 12523
Pause                                               12382
In computer/Work                                     9722
Train                                                9576
In vehicle                                           9549
Sleep                                                9207
Eat                                                  8709
Meeting                                              8353
Cooking                                              7264
In computer/At home                                  6669
Movie                                                6422
Shopping& wear

In [60]:
# Count of missing entries still present in each remaining column.
data_df.isnull().sum()

Phone_pressure             38004
Phone_ACC_X                38005
Phone_ACC_Y                38005
Phone_ACC_Z                38005
Phone_GRAVITY_X            37981
Phone_GRAVITY_Y            37981
Phone_GRAVITY_Z            37981
Phone_GYRO_X               37988
Phone_GYRO_Y               37988
Phone_GYRO_Z               37988
Phone_LA_X                 37981
Phone_LA_Y                 37981
Phone_LA_Z                 37981
Phone_MAG_X                37969
Phone_MAG_Y                37969
Phone_MAG_Z                37969
Phone_ORI_X                38006
Phone_ORI_Y                38006
Phone_ORI_Z                38006
Phone_AUDIO_W              37910
Phone_AUDIO_X              37910
Phone_AUDIO_Y              37910
Phone_AUDIO_Z              37910
Phone_ROTATION_VECTOR_A    37988
Phone_ROTATION_VECTOR_B    37988
Phone_ROTATION_VECTOR_C    37988
Phone_ROTATION_VECTOR_D    37988
Phone_ROTATION_VECTOR_E    37988
activity                       0
dtype: int64

In [61]:
# Show the current column labels before the feature list is rebuilt.
data_df.columns

Index(['Phone_pressure', 'Phone_ACC_X', 'Phone_ACC_Y', 'Phone_ACC_Z',
       'Phone_GRAVITY_X', 'Phone_GRAVITY_Y', 'Phone_GRAVITY_Z', 'Phone_GYRO_X',
       'Phone_GYRO_Y', 'Phone_GYRO_Z', 'Phone_LA_X', 'Phone_LA_Y',
       'Phone_LA_Z', 'Phone_MAG_X', 'Phone_MAG_Y', 'Phone_MAG_Z',
       'Phone_ORI_X', 'Phone_ORI_Y', 'Phone_ORI_Z', 'Phone_AUDIO_W',
       'Phone_AUDIO_X', 'Phone_AUDIO_Y', 'Phone_AUDIO_Z',
       'Phone_ROTATION_VECTOR_A', 'Phone_ROTATION_VECTOR_B',
       'Phone_ROTATION_VECTOR_C', 'Phone_ROTATION_VECTOR_D',
       'Phone_ROTATION_VECTOR_E', 'activity'],
      dtype='object')

In [62]:
# Rebuild the feature list from the columns that survived the sparse-column
# drop: everything except the 'activity' label. Deriving it from the frame
# avoids a hand-copied list that has to be kept in sync with `data_df.columns`.
features = [column for column in data_df.columns if column != "activity"]

In [63]:
# Spot-check the median of a single feature column.
data_df["Phone_ACC_X"].median()

3.3817677

In [64]:
# Impute the remaining gaps with each column's median.
# The original `data_df[feature].fillna(median, inplace=True)` is chained
# in-place assignment: it acts on an intermediate Series, raises a
# FutureWarning in recent pandas and leaves `data_df` unchanged under
# Copy-on-Write. Assigning the filled block back is explicit and version-safe.
# `fillna` with a Series fills each column with the value stored under that
# column's label.
# NOTE(review): the medians are computed over the entire frame; if a
# train/test split follows, consider computing them on training rows only.
feature_medians = data_df[features].median()
data_df[features] = data_df[features].fillna(feature_medians)

In [65]:
# Verify the imputation: missing entries per column.
data_df.isna().sum(axis=0)

Phone_pressure             0
Phone_ACC_X                0
Phone_ACC_Y                0
Phone_ACC_Z                0
Phone_GRAVITY_X            0
Phone_GRAVITY_Y            0
Phone_GRAVITY_Z            0
Phone_GYRO_X               0
Phone_GYRO_Y               0
Phone_GYRO_Z               0
Phone_LA_X                 0
Phone_LA_Y                 0
Phone_LA_Z                 0
Phone_MAG_X                0
Phone_MAG_Y                0
Phone_MAG_Z                0
Phone_ORI_X                0
Phone_ORI_Y                0
Phone_ORI_Z                0
Phone_AUDIO_W              0
Phone_AUDIO_X              0
Phone_AUDIO_Y              0
Phone_AUDIO_Z              0
Phone_ROTATION_VECTOR_A    0
Phone_ROTATION_VECTOR_B    0
Phone_ROTATION_VECTOR_C    0
Phone_ROTATION_VECTOR_D    0
Phone_ROTATION_VECTOR_E    0
activity                   0
dtype: int64

In [69]:
# Write the cleaned frame (index included) to disk.
data_df.to_csv(path_or_buf="data/P4_engineered_data.csv")