In [None]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated in newer IPython.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# NOTE(review): the wildcard imports hide which module provides the helpers
# called later in this notebook (`get_all_uuids`, `optimize_features_data`);
# presumably they come from one of these two project modules -- confirm, and
# consider importing the needed names explicitly.
from General import *
from ReadingTheDataUtils import *

# Make the notebook automatically reload external python modules
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

# Base directories, expressed relative to the current working directory.
ROOT_PATH = Path('..')
SRC_PATH = Path('.')

# All data locations hang off <ROOT_PATH>/data.
DATA_PATH = ROOT_PATH.joinpath('data')
FOLD_PATH = DATA_PATH.joinpath('cv_5_folds')

# Features/labels CSV location and its file-name suffix constant.
CSV_PATH = DATA_PATH.joinpath('ExtraSensory.per_uuid_features_labels')
CSV_SUFFIX = '.features_labels.csv'

# Original-labels CSV location and its file-name suffix constant.
ORIGINAL_LABLES_CSV_PATH = DATA_PATH.joinpath('ExtraSensory.per_uuid_original_labels')
ORIGINAL_LABLES_CSV_SUFFIX = '.original_labels.csv'

### Read the features data

In [None]:
%%time
# Read all data
features_df = get_all_uuids(CSV_PATH)

# The return value is not captured, so this call is presumably relied on to
# modify `features_df` in place -- TODO confirm against the helper's definition.
optimize_features_data(features_df)

In [8]:
# Depends on `features_df` from the loading cell above; run that cell first,
# otherwise this line raises NameError.
features_df.info(verbose=True)

NameError: name 'features_df' is not defined

In [None]:
# Preview the first rows of the features frame.
features_df.head()

### Reading the original labels

In [None]:
%%time
original_labels_df = get_all_uuids(ORIGINAL_LABLES_CSV_PATH)
MAIN_ACTIVITY_LABLES = ['original_label:LYING_DOWN', 'original_label:SITTING',
                       'original_label:STANDING_IN_PLACE', 'original_label:STANDING_AND_MOVING',
                       'original_label:WALKING', 'original_label:RUNNING',
                       'original_label:BICYCLING']

original_labels_df = original_labels_df[MAIN_ACTIVITY_LABLES + ['timestamp']]
original_labels_df[MAIN_ACTIVITY_LABLES] = original_labels_df[MAIN_ACTIVITY_LABLES].astype('category')

In [None]:
# Preview the first rows of the filtered original-labels frame.
original_labels_df.head()

### Create the dataset

In [None]:
# Left join on the shared ('uuid', 'timestamp') keys: every row of
# `features_df` is kept, with the matching original-label columns attached.
data = features_df.merge(
    original_labels_df,
    on=['uuid', 'timestamp'],
    how='left',
)

In [None]:
# Preview the first rows of the merged frame.
data.head()

In [None]:
# Testing
from itertools import combinations

def is_mutual_exclusive(series_1, series_2):
    """Combine the two inputs with the element-wise ``&`` operator.

    Despite the name, the result describes the *overlap* of the inputs: for
    boolean inputs an entry is True where both inputs are True, i.e. exactly
    where the two are NOT mutually exclusive.

    Parameters:
        series_1: left operand of ``&``.
        series_2: right operand of ``&``.

    Returns:
        Whatever ``series_1 & series_2`` evaluates to.
    """
    overlap = series_1 & series_2
    return overlap

# Check every unordered pair of main-activity columns: the check fails as soon
# as one pair has a row in which both columns are set.
for first_label, second_label in combinations(MAIN_ACTIVITY_LABLES, 2):
    # Despite its name, the helper returns the element-wise `&` of the two
    # columns, i.e. the mask of rows where both are set.
    overlap_mask = is_mutual_exclusive(data[first_label], data[second_label])

    # `.any()` answers "is there at least one offending row?" directly, without
    # building a filtered copy of the whole frame for each pair.
    if overlap_mask.any():
        raise ValueError("the lables contains unmutual exclusive rows")

In [None]:
# Mapping between label name and a unique category number
def create_mapping_dict(main_activity_lables):
    """Build a ``{category number: label name}`` lookup.

    Parameters:
        main_activity_lables: iterable of label names; the position of each
            name in the iteration order becomes its category number (from 0).

    Returns:
        dict mapping each position to the label found at that position.
    """
    return dict(enumerate(main_activity_lables))
    
def map_label_index_to_name(index, main_activity_lables_mapping):
    """Look up the label name stored under ``index``.

    Parameters:
        index: key to look up (a category number).
        main_activity_lables_mapping: subscriptable mapping from category
            number to label name.

    Returns:
        ``main_activity_lables_mapping[index]``.

    Raises:
        Whatever the subscript raises for a missing key (``KeyError`` for a dict).
    """
    return main_activity_lables_mapping[index]


main_activity_lables_mapping = create_mapping_dict(MAIN_ACTIVITY_LABLES)

# Category number = position (within MAIN_ACTIVITY_LABLES) of the largest value
# in each row of the label columns.
# NOTE(review): np.argmax returns the first position on ties, so a row whose
# label columns are all equal silently gets category 0 -- confirm such rows
# cannot occur (e.g. feature rows without a matching original label).
data['label'] = np.argmax(data[MAIN_ACTIVITY_LABLES].values, axis=1)

# Dictionary lookup via `Series.map` instead of calling a Python function for
# every element with `.apply`; every argmax result is a key of the mapping.
data['label_name'] = data['label'].map(main_activity_lables_mapping)

# The individual label columns are now represented by `label` / `label_name`.
# Rebinding `data` instead of using `inplace=True` keeps the drop explicit.
data = data.drop(MAIN_ACTIVITY_LABLES, axis=1)

In [None]:
# Preview the frame now that it carries the 'label' / 'label_name' columns.
data.head()

In [None]:
# (rows, columns) of the final frame.
data.shape

In [None]:
# Summary statistics of the final frame, using pandas' `describe` defaults.
data.describe()

In [None]:
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in
# 1.2 and removed in 2.0. The fallback keeps the cell working on older pandas
# releases that only know `null_counts`.
try:
    data.info(verbose=True, show_counts=True)
except TypeError:
    data.info(verbose=True, null_counts=True)

In [None]:
# Persist the assembled frame as <DATA_PATH>/dataset.csv.
data.to_csv(DATA_PATH.joinpath("dataset.csv"))