# 1. Introduction
This part of the project consists of building an activity recognition system. The activities are classified as low, medium, and high. The data is collected from the participants of the AffecTech project. The participants sit during lectures and meeting, lie down during mindfullness, and stand during presentation.

* low: sitting, standing, lie down    
* medium: walking   
* high: exercising

# 2. Data
The given data includes Empatica data for each device and information tables. The information tables from the project consists of 4 files:
* Mapping from participant number to device numbers
* Presentation start and end times excluding Q&A session for each participant in October 7, 2018.
* Presentation start and end times including Q&A session for each participant in October 6, 2018.
* Start and end times for walking, sitting, exercising, and standing for all participants between September 30, 2018 and October 7, 2018.

# 3. Data preprocessing
## 3.1. Get accelerometer data from Empatica

In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [6]:
import os
import glob

os.chdir(os.getcwd())
acc_files = glob.glob('ACC.csv' )
print(acc_files)

In [7]:
import re

def append_acc_file_to_dataframe(file_name):
    file_location = cwd + "/" + file_name
    with open(file_location, "r") as f:
        reader = csv.reader(f, delimiter=",")
        start_time = 0
        values = []
        times = []
        prev_time = 0
        for i, line in enumerate(reader):
            if i is 0:
                start_time = float(line[0])
                prev_time = start_time
            elif i is 1:
                frequency_acc = float(line[1])
            else:
                if start_time + 10 <= prev_time: # Since all sensors start collecting data at the same instant except HR which starts after 10 s
                    times.append(prev_time)
                    values.append(np.array(line).astype(np.float))
                prev_time += 1/frequency_acc
    return pd.DataFrame(
        {'time': times,
         'value_acc': values,
         'device': re.findall("_(.*?)\/", file_name)[0]
        })

In [8]:
cwd = os.getcwd()  # Get the current working directory (cwd)
acc_list = []
for file in acc_files:
    acc_list.append(append_acc_file_to_dataframe(file))

## 3.2. Convert data from time domain to frequency domain

In [9]:
import statistics

def acc_time_domain_to_frequency_domain(df, frequency):
    brand_new_df = pd.DataFrame()
    num_rows = df.shape[0]
    i = 0
    while i < (num_rows - 8 * 60 * frequency):
        x = df.iloc[i:(i + (8 * 60 * int(frequency)))]['value_acc'].str[0].astype(float).as_matrix()
        y = df.iloc[i:(i + (8 * 60 * int(frequency)))]['value_acc'].str[1].astype(float).as_matrix()
        z = df.iloc[i:(i + (8 * 60 * int(frequency)))]['value_acc'].str[2].astype(float).as_matrix()
        mean_x = sum(x)/len(x)
        mean_y = sum(y)/len(y)
        mean_z = sum(z)/len(z)
        mag_mean = (mean_x**2 + mean_y**2 + mean_z**2)**(1/2)
        std_x = statistics.stdev(x)
        std_y = statistics.stdev(y)
        std_z = statistics.stdev(z)
        mag_std = (std_x**2 + std_y**2 + std_z**2)**(1/2)
        mag = [(a**2 + b**2 + c**2)**(1/2) for a,b,c in zip(x, y, z)]
        fft_mag = np.fft.fft(mag)
        f = np.linspace(0, frequency, len(mag))
        df_new = pd.DataFrame(
                    {
                     'start_time': df['time'][i],
                     'end_time': df['time'][i + (8 * 60 * int(frequency)) - 1],
                     'mean_x': [mean_x],
                     'mean_y': [mean_y],
                     'mean_z': [mean_z],
                     'mag_mean': [mag_mean],
                     'std_x': [std_x],
                     'std_y': [std_y],
                     'std_z': [std_z],
                     'mag_std': [mag_std],
                     'mag': [mag],
                     'fft': max(fft_mag),
                     'device': df['device'][i]
                    })
        i = i + (8 * 60 * int(frequency))
        brand_new_df = pd.concat([brand_new_df, df_new], ignore_index=True)
    return brand_new_df

In [10]:
acc_df = []
for frame in acc_list:
    acc_df.append(acc_time_domain_to_frequency_domain(frame, 32))
acc_df = pd.concat(acc_df, ignore_index=True)

## 3.3. Add label data from information tables
activity_labels.csv has the following columns:
* **UTC_start:** Start time of the activity
* **UTC_end:** End time of the activity
* **date:** Date of the activity (month and day)
* **label:** Level of the activity (low, medium, or high)
* **stress:** Predefined stress level of the activity (btw. 1-5)

In [11]:
label_df = pd.read_csv("activity_labels.csv", sep='\t', encoding='utf-8')

## 3.4. Merge Empatica data and label data

In [12]:
acc_df['fft_real'] = acc_df['fft'].real
acc_df.drop(['fft'], axis=1, inplace=True)
acc_df.drop(['mag'], axis=1, inplace=True)

In [13]:
# Merge label data and accelerometer data
import sqlite3

conn = sqlite3.connect(':memory:')
acc_df.to_sql('acc_df', conn, index=False)
label_df.to_sql('label_df', conn, index=False)

qry = '''
    select
        acc_df.mag_mean mag_mean,
        acc_df.mag_std mag_std,
        acc_df.mean_x mean_x,
        acc_df.mean_y mean_y,
        acc_df.mean_z mean_z,
        acc_df.std_x std_x,
        acc_df.std_y std_y,
        acc_df.std_z std_z,
        acc_df.fft_real fft_real,
        acc_df.start_time UTC_start,
        acc_df.end_time UTC_end,
        label_df.label label,
        label_df.stress stress
    from
        label_df
    join
        acc_df
    on
        (acc_df.start_time >= label_df.UTC_start) and (label_df.UTC_end >= acc_df.end_time)
    '''
acc_label_df = pd.read_sql_query(qry, conn)

  chunksize=chunksize, dtype=dtype)


## 3.5. Handling imbalanced classes

In [14]:
df_low = acc_label_df[acc_label_df.label=="low"]
df_medium = acc_label_df[acc_label_df.label=="medium"]
df_high = acc_label_df[acc_label_df.label=="high"]

# Number of rows for all labels
print("Number of rows with low activity: ", df_low.shape[0])
print("Number of rows with medium activity: ", df_medium.shape[0])
print("Number of rows with high activity: ", df_high.shape[0])

Number of rows with low activity:  1389
Number of rows with medium activity:  217
Number of rows with high activity:  22


Since Empatica data for low, medium and high activities are imbalanced, it is crucial to balance the classes. The class with least number of instances (high) has 22 instances and the class with most number of instances (low) has 1389 instances. In order to provide the machine learning models with sufficient number of instances, we will upsample the data.

In [15]:
from sklearn.utils import resample

n_rows = df_low.shape[0]

df_high_upsampled = resample(df_high, 
                                 replace=True,     # sample with replacement
                                 n_samples=n_rows,
                                 random_state=123) # in order to attain reproducible results

df_medium_upsampled = resample(df_medium, 
                                 replace=True,     # sample with replacement
                                 n_samples=n_rows,
                                 random_state=123) # in order to attain reproducible results

df_upsampled = pd.concat([df_low, df_high_upsampled, df_medium_upsampled])

# 4. Machine Learning

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectFromModel



In [17]:
X = df_upsampled[['mag_mean', 'mag_std', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 'fft_real']]
y = df_upsampled[['label']]

## 4.1. Logistic Regression
### 4.1.1. Feature Selection

In [18]:
from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(cv=5, multi_class='ovr', class_weight='balanced')
sfm = SelectFromModel(clf, threshold="0.75*mean")
sfm.fit(X, y)
selected_features = np.array(list(X.columns))
result = selected_features[sfm.get_support()]
result.tolist()

  y = column_or_1d(y, warn=True)


['mag_mean', 'mag_std', 'mean_y', 'mean_z', 'std_x', 'std_y']

According to **SelectFromModel**, mag_std, mean_y, mean_z, std_x, and std_y are selected as features for logistic regression classifier
### 4.1.2. Model application

In [19]:
X_lr = df_upsampled[['mag_mean', 'mag_std', 'mean_y', 'mean_z', 'std_x', 'std_y']]
y_lr = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_lr, y_lr, test_size=0.2)

clf = LogisticRegressionCV(cv=5, multi_class='ovr', class_weight='balanced').fit(X_train, y_train)
clf.predict(X_test)
clf.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.8069544364508393

## 4.2. Naive Bayes
### 4.2.1. Model application

In [21]:
from sklearn.naive_bayes import GaussianNB

X_nb = df_upsampled[['mag_mean', 'mag_std', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 'fft_real']]
y_nb = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_nb, y_nb, test_size=0.2)

gnb = GaussianNB().fit(X_train, y_train)
gnb.predict(X_test)
gnb.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.8141486810551559

## 4.3. Decision Tree
### 4.3.1. Feature Selection

In [22]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(class_weight='balanced')
sfm = SelectFromModel(clf, threshold="0.75*mean")
sfm.fit(X, y)
selected_features = np.array(list(X.columns))
result = selected_features[sfm.get_support()]
result.tolist()

['mag_std', 'mean_z', 'fft_real']

According to **SelectFromModel**, mag_std and mean_z are selected as features for Decision Tree classifier.
### 4.3.2. Model application

In [39]:
X_dt = df_upsampled[['mag_std', 'mean_z','fft_real']]
y_dt = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_dt, y_dt, test_size=0.05)

clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.score(X_test, y_test)

0.9617224880382775

In [40]:
X_dt = df_upsampled[['mag_std', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 'fft_real']]
y_dt = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_dt, y_dt, test_size=0.05)

clf = tree.DecisionTreeClassifier(class_weight='balanced')
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.score(X_test, y_test)

0.9808612440191388

## 4.4. Multi-layer Perceptron
### 4.4.1. Model Application

In [41]:
from sklearn.neural_network import MLPClassifier

X_mlp = df_upsampled[['mag_mean', 'mag_std', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 'fft_real']]
y_mlp = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_mlp, y_mlp, test_size=0.05)

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.31100478468899523

# 5. Conclusion

Highest score is 0.98 and is attained by decision tree. We can use this classifier during stress detection in order to see whether stress indicating signals are caused by high activity or stressful conditions.

In [42]:
X_dt = df_upsampled[['mag_mean', 'mag_std', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 'fft_real']]
y_dt = df_upsampled[['label']]

X_train, X_test, y_train, y_test = train_test_split(X_dt, y_dt, test_size=0.05)

clf = tree.DecisionTreeClassifier(class_weight='balanced')
clf.fit(X_train, y_train)
result = clf.predict(X)

In [43]:
df_upsampled['predicted_y'] = result

In [44]:
df_upsampled.to_csv("detected_activity.csv", sep='\t', encoding='utf-8')