In [48]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [49]:
from General import *
from ReadingTheDataUtils import *

# Make the notebook automatically reload external python modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
from pathlib import Path

ROOT_PATH = Path('..')
SRC_PATH = Path('.')
DATA_PATH = ROOT_PATH / 'data'
CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_features_labels'
CSV_SUFFIX = '.features_labels.csv'
ORIGINAL_LABLES_CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_original_labels'
ORIGINAL_LABLES_CSV_SUFFIX = '.original_labels.csv'
FOLD_PATH = DATA_PATH / 'cv_5_folds'

main_label_names = ['label:LYING_DOWN', 'label:SITTING', 'label:OR_standing', 'label:FIX_walking', 'label:FIX_running', 'label:BICYCLING']

### Read the features data

In [51]:
%%time
# Read all data
features_df = get_all_uuids(CSV_PATH)

Wall time: 12.4 s


In [52]:
label_names = get_label_names(features_df)
label_names

['label:LYING_DOWN',
 'label:SITTING',
 'label:FIX_walking',
 'label:FIX_running',
 'label:BICYCLING',
 'label:SLEEPING',
 'label:LAB_WORK',
 'label:IN_CLASS',
 'label:IN_A_MEETING',
 'label:LOC_main_workplace',
 'label:OR_indoors',
 'label:OR_outside',
 'label:IN_A_CAR',
 'label:ON_A_BUS',
 'label:DRIVE_-_I_M_THE_DRIVER',
 'label:DRIVE_-_I_M_A_PASSENGER',
 'label:LOC_home',
 'label:FIX_restaurant',
 'label:PHONE_IN_POCKET',
 'label:OR_exercise',
 'label:COOKING',
 'label:SHOPPING',
 'label:STROLLING',
 'label:DRINKING__ALCOHOL_',
 'label:BATHING_-_SHOWER',
 'label:CLEANING',
 'label:DOING_LAUNDRY',
 'label:WASHING_DISHES',
 'label:WATCHING_TV',
 'label:SURFING_THE_INTERNET',
 'label:AT_A_PARTY',
 'label:AT_A_BAR',
 'label:LOC_beach',
 'label:SINGING',
 'label:TALKING',
 'label:COMPUTER_WORK',
 'label:EATING',
 'label:TOILET',
 'label:GROOMING',
 'label:DRESSING',
 'label:AT_THE_GYM',
 'label:STAIRS_-_GOING_UP',
 'label:STAIRS_-_GOING_DOWN',
 'label:ELEVATOR',
 'label:OR_standing',
 'l

In [53]:
label_df = features_df[main_label_names + ['timestamp']]
features_df = features_df[get_feature_names(features_df, label_names) + ['timestamp']]

In [54]:
features_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 377346 entries, 00EABED2-271D-49D8-B599-1D4A09240601 to FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF
Data columns (total 226 columns):
raw_acc:magnitude_stats:mean                                               float64
raw_acc:magnitude_stats:std                                                float64
raw_acc:magnitude_stats:moment3                                            float64
raw_acc:magnitude_stats:moment4                                            float64
raw_acc:magnitude_stats:percentile25                                       float64
raw_acc:magnitude_stats:percentile50                                       float64
raw_acc:magnitude_stats:percentile75                                       float64
raw_acc:magnitude_stats:value_entropy                                      float64
raw_acc:magnitude_stats:time_entropy                                       float64
raw_acc:magnitude_spectrum:log_energy_band0                                float6

In [55]:
features_df.head()

Unnamed: 0_level_0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,lf_measurements:temperature_ambient,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,timestamp
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,5.04397,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079161
00EABED2-271D-49D8-B599-1D4A09240601,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,5.043367,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079221
00EABED2-271D-49D8-B599-1D4A09240601,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,5.043599,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079281
00EABED2-271D-49D8-B599-1D4A09240601,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,5.043263,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079341
00EABED2-271D-49D8-B599-1D4A09240601,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,5.042779,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079431


In [56]:
label_df.head()

Unnamed: 0_level_0,label:LYING_DOWN,label:SITTING,label:OR_standing,label:FIX_walking,label:FIX_running,label:BICYCLING,timestamp
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00EABED2-271D-49D8-B599-1D4A09240601,0.0,1.0,0.0,0.0,,,1444079161
00EABED2-271D-49D8-B599-1D4A09240601,0.0,1.0,0.0,0.0,,,1444079221
00EABED2-271D-49D8-B599-1D4A09240601,0.0,1.0,0.0,0.0,,,1444079281
00EABED2-271D-49D8-B599-1D4A09240601,0.0,1.0,0.0,0.0,,,1444079341
00EABED2-271D-49D8-B599-1D4A09240601,0.0,1.0,0.0,0.0,,,1444079431


In [57]:
label_df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 377346 entries, 00EABED2-271D-49D8-B599-1D4A09240601 to FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF
Data columns (total 7 columns):
label:LYING_DOWN     303723 non-null float64
label:SITTING        306594 non-null float64
label:OR_standing    306594 non-null float64
label:FIX_walking    306594 non-null float64
label:FIX_running    140870 non-null float64
label:BICYCLING      135183 non-null float64
timestamp            377346 non-null int64
dtypes: float64(6), int64(1)
memory usage: 23.0+ MB


### Create the dataset

In [58]:
data = pd.merge(features_df, label_df,
                how='left',
                left_on=['uuid','timestamp'], right_on = ['uuid','timestamp'])

In [59]:
data.head()

Unnamed: 0_level_0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,timestamp,label:LYING_DOWN,label:SITTING,label:OR_standing,label:FIX_walking,label:FIX_running,label:BICYCLING
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,5.04397,...,0.0,0.0,0.0,1444079161,0.0,1.0,0.0,0.0,,
00EABED2-271D-49D8-B599-1D4A09240601,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,5.043367,...,0.0,0.0,0.0,1444079221,0.0,1.0,0.0,0.0,,
00EABED2-271D-49D8-B599-1D4A09240601,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,5.043599,...,0.0,0.0,0.0,1444079281,0.0,1.0,0.0,0.0,,
00EABED2-271D-49D8-B599-1D4A09240601,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,5.043263,...,0.0,0.0,0.0,1444079341,0.0,1.0,0.0,0.0,,
00EABED2-271D-49D8-B599-1D4A09240601,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,5.042779,...,0.0,0.0,0.0,1444079431,0.0,1.0,0.0,0.0,,


In [61]:
# # Testing
# from itertools import combinations

# def is_mutual_exclusive(series_1, series_2):
#     return  series_1 & series_2

# for cols in combinations(data[main_label_names].columns, 2):
#     series_1 = data[cols[0]]
#     series_2 = data[cols[1]]
    
#     mask = is_mutual_exclusive(series_1, series_2)
    
#     if data[mask].shape[0] > 0:
#         raise ValueError("the lables contains unmutual exclusive rows")

TypeError: unsupported operand type(s) for &: 'float' and 'bool'

In [63]:
# Mapping between label name and a unique category number
def create_mapping_dict(main_activity_lables):
    main_activity_lables_mapping = dict()

    for category, label in enumerate(main_activity_lables):
        main_activity_lables_mapping[category] = label
        
    return main_activity_lables_mapping
    
def map_label_index_to_name(index, main_activity_lables_mapping):
    label_name = main_activity_lables_mapping[index]
    
    return label_name


main_activity_lables_mapping = create_mapping_dict(main_label_names)
data['label'] = np.argmax(data[main_label_names].values, axis=1)
data['label_name'] = data['label'].apply(map_label_index_to_name, main_activity_lables_mapping=main_activity_lables_mapping)

data.drop(main_label_names, inplace=True, axis=1)

In [64]:
data.head()

Unnamed: 0_level_0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,timestamp,label,label_name
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,5.04397,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079161,4,label:FIX_running
00EABED2-271D-49D8-B599-1D4A09240601,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,5.043367,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079221,4,label:FIX_running
00EABED2-271D-49D8-B599-1D4A09240601,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,5.043599,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079281,4,label:FIX_running
00EABED2-271D-49D8-B599-1D4A09240601,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,5.043263,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079341,4,label:FIX_running
00EABED2-271D-49D8-B599-1D4A09240601,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,5.042779,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1444079431,4,label:FIX_running


In [65]:
data.shape

(377346, 228)

In [66]:
data.describe()

Unnamed: 0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,timestamp,label
count,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,...,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0
mean,1.002223,0.038832,0.037772,0.072804,0.983165,0.998471,1.016691,2.045331,6.680278,5.039597,...,0.218865,0.206169,0.229066,0.267147,0.285608,0.285624,0.266461,0.276425,1445839000.0,3.064241
std,0.079623,0.096109,0.113198,0.170481,0.08296,0.076687,0.104874,0.616899,0.021246,0.025809,...,0.413478,0.404554,0.420232,0.442471,0.451704,0.451712,0.442109,0.44723,5907009.0,1.950021
min,0.018148,3e-05,-0.493806,3.9e-05,0.015845,0.017998,0.020365,0.009605,5.460637,4.338109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1433537000.0,0.0
25%,0.992556,0.001709,-0.000785,0.002328,0.982671,0.991248,0.994859,1.671571,6.684369,5.042891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1441438000.0,1.0
50%,1.001258,0.003265,0.000772,0.005051,0.9953,1.000116,1.003813,2.296398,6.684606,5.04335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1444974000.0,4.0
75%,1.012745,0.021366,0.009864,0.045333,1.003679,1.009232,1.019163,2.523618,6.68461,5.043574,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1448696000.0,4.0
max,3.185837,1.936343,2.47275,3.360718,1.942718,2.636697,3.958338,2.971272,6.684612,6.489025,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1464899000.0,5.0


In [67]:
data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 377346 entries, 00EABED2-271D-49D8-B599-1D4A09240601 to FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF
Data columns (total 228 columns):
raw_acc:magnitude_stats:mean                                               377056 non-null float64
raw_acc:magnitude_stats:std                                                377056 non-null float64
raw_acc:magnitude_stats:moment3                                            377056 non-null float64
raw_acc:magnitude_stats:moment4                                            377056 non-null float64
raw_acc:magnitude_stats:percentile25                                       377056 non-null float64
raw_acc:magnitude_stats:percentile50                                       377056 non-null float64
raw_acc:magnitude_stats:percentile75                                       377056 non-null float64
raw_acc:magnitude_stats:value_entropy                                      377056 non-null float64
raw_acc:magnitude_stats:time_entropy

In [68]:
data.to_csv(DATA_PATH / "dataset.csv")