In [7]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
from General import *
from ReadingTheDataUtils import *

# Make the notebook automatically reload external python modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from pathlib import Path

ROOT_PATH = Path('..')
SRC_PATH = Path('.')
DATA_PATH = ROOT_PATH / 'data'
CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_features_labels'
CSV_SUFFIX = '.features_labels.csv'
ORIGINAL_LABLES_CSV_PATH = DATA_PATH / 'ExtraSensory.per_uuid_original_labels'
ORIGINAL_LABLES_CSV_SUFFIX = '.original_labels.csv'
FOLD_PATH = DATA_PATH / 'cv_5_folds'

### Read the features data

In [10]:
%%time
# Read all data
features_df = get_all_uuids(CSV_PATH)

optimize_features_data(features_df)

Wall time: 27.1 s


In [11]:
features_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 377346 entries, 00EABED2-271D-49D8-B599-1D4A09240601 to FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF
Data columns (total 226 columns):
timestamp                                                                  int64
raw_acc:magnitude_stats:mean                                               float64
raw_acc:magnitude_stats:std                                                float64
raw_acc:magnitude_stats:moment3                                            float64
raw_acc:magnitude_stats:moment4                                            float64
raw_acc:magnitude_stats:percentile25                                       float64
raw_acc:magnitude_stats:percentile50                                       float64
raw_acc:magnitude_stats:percentile75                                       float64
raw_acc:magnitude_stats:value_entropy                                      float64
raw_acc:magnitude_stats:time_entropy                                       float64


In [12]:
features_df.head()

Unnamed: 0_level_0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,lf_measurements:screen_brightness,lf_measurements:temperature_ambient,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,1444079161,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,...,0.381436,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
00EABED2-271D-49D8-B599-1D4A09240601,1444079221,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,...,0.381436,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
00EABED2-271D-49D8-B599-1D4A09240601,1444079281,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,...,0.381436,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
00EABED2-271D-49D8-B599-1D4A09240601,1444079341,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,...,0.381436,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
00EABED2-271D-49D8-B599-1D4A09240601,1444079431,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,...,0.381436,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


### Reading the original lables 

In [13]:
%%time
original_labels_df = get_all_uuids(ORIGINAL_LABLES_CSV_PATH)
MAIN_ACTIVITY_LABLES = ['original_label:LYING_DOWN', 'original_label:SITTING',
                       'original_label:STANDING_IN_PLACE', 'original_label:STANDING_AND_MOVING',
                       'original_label:WALKING', 'original_label:RUNNING',
                       'original_label:BICYCLING']

original_labels_df = original_labels_df[MAIN_ACTIVITY_LABLES + ['timestamp']]
original_labels_df[MAIN_ACTIVITY_LABLES] = original_labels_df[MAIN_ACTIVITY_LABLES].astype('category')

Wall time: 3.46 s


In [14]:
original_labels_df.head()

Unnamed: 0_level_0,original_label:LYING_DOWN,original_label:SITTING,original_label:STANDING_IN_PLACE,original_label:STANDING_AND_MOVING,original_label:WALKING,original_label:RUNNING,original_label:BICYCLING,timestamp
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00EABED2-271D-49D8-B599-1D4A09240601,0,1,0,0,0,0,0,1444079161
00EABED2-271D-49D8-B599-1D4A09240601,0,1,0,0,0,0,0,1444079221
00EABED2-271D-49D8-B599-1D4A09240601,0,1,0,0,0,0,0,1444079281
00EABED2-271D-49D8-B599-1D4A09240601,0,1,0,0,0,0,0,1444079341
00EABED2-271D-49D8-B599-1D4A09240601,0,1,0,0,0,0,0,1444079431


### Create the dataset

In [15]:
data = pd.merge(features_df, original_labels_df,
                how='left',
                left_on=['uuid','timestamp'], right_on = ['uuid','timestamp'])

In [16]:
data.head()

Unnamed: 0_level_0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,original_label:LYING_DOWN,original_label:SITTING,original_label:STANDING_IN_PLACE,original_label:STANDING_AND_MOVING,original_label:WALKING,original_label:RUNNING,original_label:BICYCLING
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,1444079161,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,...,0.0,0.0,0.0,0,1,0,0,0,0,0
00EABED2-271D-49D8-B599-1D4A09240601,1444079221,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,...,0.0,0.0,0.0,0,1,0,0,0,0,0
00EABED2-271D-49D8-B599-1D4A09240601,1444079281,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,...,0.0,0.0,0.0,0,1,0,0,0,0,0
00EABED2-271D-49D8-B599-1D4A09240601,1444079341,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,...,0.0,0.0,0.0,0,1,0,0,0,0,0
00EABED2-271D-49D8-B599-1D4A09240601,1444079431,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,...,0.0,0.0,0.0,0,1,0,0,0,0,0


In [17]:
# Testing
from itertools import combinations

def is_mutual_exclusive(series_1, series_2):
    return  series_1 & series_2

for cols in combinations(data[MAIN_ACTIVITY_LABLES].columns, 2):
    series_1 = data[cols[0]]
    series_2 = data[cols[1]]
    
    mask = is_mutual_exclusive(series_1, series_2)
    
    if data[mask].shape[0] > 0:
        raise ValueError("the lables contains unmutual exclusive rows")

In [18]:
# Mapping between label name and a unique category number
def create_mapping_dict(main_activity_lables):
    main_activity_lables_mapping = dict()

    for category, label in enumerate(main_activity_lables):
        main_activity_lables_mapping[category] = label
        
    return main_activity_lables_mapping
    
def map_label_index_to_name(index, main_activity_lables_mapping):
    label_name = main_activity_lables_mapping[index]
    
    return label_name


main_activity_lables_mapping = create_mapping_dict(MAIN_ACTIVITY_LABLES)
data['label'] = np.argmax(data[MAIN_ACTIVITY_LABLES].values, axis=1)
data['label_name'] = data['label'].apply(map_label_index_to_name, main_activity_lables_mapping=main_activity_lables_mapping)

data.drop(MAIN_ACTIVITY_LABLES, inplace=True, axis=1)

In [19]:
data.head()

Unnamed: 0_level_0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,label,label_name
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EABED2-271D-49D8-B599-1D4A09240601,1444079161,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,original_label:SITTING
00EABED2-271D-49D8-B599-1D4A09240601,1444079221,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,original_label:SITTING
00EABED2-271D-49D8-B599-1D4A09240601,1444079281,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,original_label:SITTING
00EABED2-271D-49D8-B599-1D4A09240601,1444079341,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,original_label:SITTING
00EABED2-271D-49D8-B599-1D4A09240601,1444079431,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,original_label:SITTING


In [20]:
data.shape

(377346, 228)

In [21]:
data.describe()

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,audio_properties:normalization_multiplier,lf_measurements:light,lf_measurements:pressure,lf_measurements:proximity_cm,lf_measurements:proximity,lf_measurements:relative_humidity,lf_measurements:battery_level,lf_measurements:screen_brightness,lf_measurements:temperature_ambient,label
count,377346.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,...,369953.0,147032.0,107195.0,148620.0,220949.0,25939.0,377027.0,220949.0,26450.0,377346.0
mean,1445839000.0,1.002223,0.038832,0.037772,0.072804,0.983165,0.998471,1.016691,2.045331,6.680278,...,-1.957999,-3.250955,981.716903,12.945908,0.0,58.577262,0.663414,0.326232,25.690594,0.976655
std,5907009.0,0.079623,0.096109,0.113198,0.170481,0.08296,0.076687,0.104874,0.616899,0.021246,...,5.693189,7.50459,142.092076,25.187935,0.0,12.39354,0.2857,0.29635,3.384491,1.313158
min,1433537000.0,0.018148,3e-05,-0.493806,3.9e-05,0.015845,0.017998,0.020365,0.009605,5.460637,...,-10.113775,-11.512925,0.0,0.0,0.0,0.0,0.0,0.0,11.455976,0.0
25%,1441438000.0,0.992556,0.001709,-0.000785,0.002328,0.982671,0.991248,0.994859,1.671571,6.684369,...,-8.325606,-11.512925,999.039978,5.0,0.0,49.795475,0.44,0.043978,23.9416,0.0
50%,1444974000.0,1.001258,0.003265,0.000772,0.005051,0.9953,1.000116,1.003813,2.296398,6.684606,...,0.272567,1.098616,1002.379089,8.0,0.0,59.389294,0.7,0.305609,25.545454,1.0
75%,1448696000.0,1.012745,0.021366,0.009864,0.045333,1.003679,1.009232,1.019163,2.523618,6.68461,...,2.843984,3.401198,1005.782715,8.0,0.0,67.649941,0.95,0.461145,27.650335,1.0
max,1464899000.0,3.185837,1.936343,2.47275,3.360718,1.942718,2.636697,3.958338,2.971272,6.684612,...,8.317724,12.263775,1021.61853,100.0,0.0,100.0,1.0,1.0,132.231644,6.0


In [22]:
data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 377346 entries, 00EABED2-271D-49D8-B599-1D4A09240601 to FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF
Data columns (total 228 columns):
timestamp                                                                  377346 non-null int64
raw_acc:magnitude_stats:mean                                               377056 non-null float64
raw_acc:magnitude_stats:std                                                377056 non-null float64
raw_acc:magnitude_stats:moment3                                            377056 non-null float64
raw_acc:magnitude_stats:moment4                                            377056 non-null float64
raw_acc:magnitude_stats:percentile25                                       377056 non-null float64
raw_acc:magnitude_stats:percentile50                                       377056 non-null float64
raw_acc:magnitude_stats:percentile75                                       377056 non-null float64
raw_acc:magnitude_stats:value_entropy 

In [23]:
data.to_csv(DATA_PATH / "dataset.csv")