# Import Required Packages

In [None]:
from sklearn import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
import random
from sklearn.model_selection import train_test_split
import collections
import os
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Data Reading

In [None]:
# Directory holding the data files, resolved against the current working directory
data_path = f"{os.getcwd()}/IoT_ASN4_Group11/data/"

# Full paths of the training file and the testing file inside that directory
train_data_file = f"{data_path}train_data.txt"
test_data_file = f"{data_path}test_data.txt"

# Processing Data

In [None]:
def process_data(data_file):
    """
    Function to parse the raw sensor log into rows of numeric values
    Every non-blank line is split on commas and read as follows:
        field 0: space separated prefix whose third token, without its first
                 character, is accelerometer x
        fields 1-4: accelerometer y, accelerometer z, gyroscope x, gyroscope y
        field 5: gyroscope z followed by one trailing character that is dropped
        last field: the label name (CLOSED, OPEN or STATIONARY)
    Arguments:
        data_file: path to the text file holding the data in the given format
    Returns:
        list of rows [ax, ay, az, gx, gy, gz, label] made of floats, where the
        label is encoded as CLOSED -> 0.0, OPEN -> 1.0, STATIONARY -> 2.0
    Raises:
        KeyError: when a line carries a label name that is not in the label map
        ValueError: when a sensor field cannot be converted to a float
    """
    label_map = {'CLOSED': 0, 'OPEN': 1, 'STATIONARY': 2}

    processed_data = []
    with open(data_file, 'r') as f:
        # Iterate the file lazily instead of loading every line up front
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no observation
            if not line.strip():
                continue

            content = line.split(',')
            ax = content[0].split(" ")[2][1:]
            ay, az, gx, gy = content[1:5]
            gz = content[5][:-1]

            # strip() removes the line ending when there is one, so a final
            # line without a newline keeps the last letter of its label
            label = label_map[content[-1].strip()]

            # The sixth column must be gz; repeating gx here would silently
            # drop the gyroscope z reading from every row
            sensor_values = [ax, ay, az, gx, gy, gz]
            row = [float(value.strip()) for value in sensor_values]
            row.append(float(label))
            processed_data.append(row)

    return processed_data

In [None]:
def remove_stationary_values(processed_data):
    """
    Function to remove the Stationary values from the data i.e. not required for the training of the model
    The remaining observations are grouped into runs: a run is a stretch of
    consecutive observations that share the same non-stationary label
    Arguments:
        processed_data: Processed data in the form of 2D array, the label being
                        the last value of every row (2.0 marks a stationary row)
    Returns:
        defaultdict mapping each non-stationary label to its list of runs, every
        run being a list of observations with the label column removed
    """
    stationary_label = 2.0
    data = collections.defaultdict(list)

    # Starting from "stationary" makes the first moving observation open a run
    previous_label = stationary_label
    for observation in processed_data:
        label = observation[-1]
        if label != stationary_label:
            # Any label change opens a new run, including a direct switch
            # between two moving labels with no stationary row in between
            if label != previous_label:
                data[label].append([])
            data[label][-1].append(observation[:-1])
        previous_label = label
    return data

In [None]:
def remove_unncessary_values(data, min_length=5):
    """
    Function to remove the Unnecessary values and the noise from the data
    Runs holding fewer than min_length observations are dropped; the lists
    stored in data are filtered in place and nothing is returned
    Arguments:
        data: mapping from a label to its list of runs, every run being a
              list of observations
        min_length: smallest number of observations a run needs to be kept
                    (defaults to 5)
    """
    for runs in data.values():
        # Slice assignment keeps the same list object, so other references to
        # it see the filtered content, and the filter runs in a single pass
        runs[:] = [run for run in runs if len(run) >= min_length]

In [None]:
def delete_multiple_element(list_object, indices):
    """
    Function to drop the elements found at several positions of a list
    The list is modified in place and the same list is handed back
    Arguments:
        list_object: list to remove the elements from
        indices: positions of the elements to remove
    """
    # Highest position first, so that a removal never shifts a pending one
    for position in reversed(sorted(indices)):
        # Positions past the end of the list are ignored
        if position >= len(list_object):
            continue
        list_object.pop(position)
    return list_object

In [None]:
def get_samples_and_labels(data_file):
    """
    Function to build the samples and their labels out of a data file
    The file is parsed, the stationary rows are removed, the short runs are
    dropped and the runs labelled 1.0 are listed ahead of the runs labelled 0.0
    Arguments:
        data_file: path to the file
    Returns:
        (samples, labels): the list of runs and a list of the same length
        holding 1 for every run labelled 1.0 and 0 for every run labelled 0.0
    """
    grouped = remove_stationary_values(process_data(data_file))
    remove_unncessary_values(grouped)

    open_runs = grouped[1.0]
    closed_runs = grouped[0.0]

    samples = open_runs + closed_runs
    labels = [1 for _ in open_runs] + [0 for _ in closed_runs]
    return samples, labels

In [None]:
# Fetching the samples and the labels for the training data
samples, labels = get_samples_and_labels(train_data_file)

In [None]:
# samples[0][0]

In [None]:
# collections.Counter(labels)

# Feature Extraction

In [None]:
def feature1(window):
    """
    Function to compute 3 features from the given window
        Feature 1: Mean of column 0 (Accelerometer x)
        Feature 2: Mean of column 1 (Accelerometer y)
        Feature 3: Mean of column 5 (documented in this file as Gyroscope z)
    Arguments:
        window: window for the feature extraction, one observation per row
    Returns:
        list holding the 3 means, or None when the window has no rows
    """
    if not len(window):
        return None

    # One mean per column, computed over all the rows of the window
    column_means = np.mean(window, axis=0)
    return [column_means[column] for column in (0, 1, 5)]

def feature2(window):
    """
    Function to fetch 4 features from the given window
        Feature 1: Mean of Accelerometer Values (columns 0 to 2)
        Feature 2: Mean of Gyroscope Values (columns 3 to 5)
        Feature 3: Standard Deviation of Accelerometer Values
        Feature 4: Standard Deviation of Gyroscope Values
    Arguments:
        window: window for the feature extraction, one observation per row
    Returns:
        list holding the 4 features, or None when the window has no rows
    """
    # Accept plain nested lists as well as arrays
    window = np.asarray(window)

    # Same contract as feature1: an empty window yields no features
    if len(window) == 0:
        return None

    accelerometer = window[:, 0:3]
    # The gyroscope block is a column slice; window[:3:6] would instead be a
    # strided slice over the rows
    gyroscope = window[:, 3:6]

    return [
        np.mean(accelerometer),
        np.mean(gyroscope),
        np.std(accelerometer),
        np.std(gyroscope),
    ]

In [None]:
def select_features_from_window(window):
    """
    Function to compute the features used by the model for one window
    Arguments:
        window: window for the feature extraction
    Returns:
        whatever feature1 returns for the window
    """
    # feature1 is the extractor in use; feature2 is the alternative kept in this file
    return feature1(window)

In [None]:
def create_feature_vector(sample, splits):
    """
    Function to create the feature vector from the given data and creating the features
    The sample is cut into `splits` windows of equal length; the observations
    left over after the even cut are appended to the last window
    Arguments:
        sample: sample data for the feature extraction, one observation per row
        splits: number of splits in which the data is to be splitted
    Returns:
        list with the features of every window, in window order; the list is
        empty when the sample has fewer observations than splits
    """
    if len(sample) < splits:
        return []

    # Longest prefix of the sample that divides evenly into `splits` windows
    cutoff = (len(sample) // splits) * splits
    windows = np.split(np.array(sample[:cutoff]), splits)

    # Observations past the even cut are stacked onto the last window
    leftover = sample[cutoff:]
    if len(leftover) > 0:
        windows[-1] = np.vstack((windows[-1], leftover))

    feature_vector = []
    for window in windows:
        features = select_features_from_window(window)
        # Windows for which no features could be computed are skipped
        if features:
            feature_vector.append(features)
    return feature_vector

In [None]:
def create_data(samples, splits):
    """
    Function to create the data from the given samples into the required splits
    Arguments:
        samples: sample data for the feature extraction
        splits: number of splits in which the data is to be splitted
    Returns:
        2D array with one row per kept sample, each row being the features of
        all the windows of that sample laid end to end; an empty (0, 0) array
        when no sample yields a feature vector
    Note:
        samples for which create_feature_vector returns an empty list are
        skipped, so the result can have fewer rows than there are samples;
        callers pairing the rows with labels have to account for that
    """
    feature_vectors = []
    for sample in samples:
        f_vector = create_feature_vector(sample, splits)
        if f_vector != []:
            feature_vectors.append(f_vector)

    # With nothing to stack there is no second or third axis to flatten
    if not feature_vectors:
        return np.empty((0, 0))

    data = np.array(feature_vectors)
    # Flatten (samples, windows, features) into (samples, windows * features)
    return data.reshape(data.shape[0], data.shape[1] * data.shape[2])

# Model Identification, Training and Testing

## Hyperparameter Tuning

In [None]:
# Function to perform grid search with pca, c, gamma values
def perform_gird_search(x_train, y_train):
    """
    Function to tune a scaler -> PCA -> RBF SVM pipeline with a grid search
    The grid covers the number of PCA components and the C and gamma values of
    the SVM, every candidate being evaluated with 3 cross-validation folds
    Arguments:
        x_train: x values for the data values
        y_train: labels values for the data values
    Returns:
        (best parameters found, pipeline refitted with those parameters)
    """
    search_space = {
        'pca__n_components': [2, 4],
        'SVM__C': [0.05, 0.1, 0.5, 1],
        'SVM__gamma': [0.1, 0.5],
    }
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('SVM', SVC(kernel='rbf', random_state=1234)),
    ]

    # refit=True trains the best candidate again on all of x_train
    search = GridSearchCV(Pipeline(steps), search_space, refit=True, verbose=1, cv=3)
    search.fit(x_train, y_train)

    return search.best_params_, search.best_estimator_

In [None]:
# Seed passed to the shuffle calls and to the final SVC so that runs are repeatable
random_state_seed = 1234

In [None]:
# Best grid search parameters found for every number of windows tried
num_window_to_best_params = {}

# Tune the pipeline once per candidate number of windows (3 and 4)
for numw in range(3,5):
    print("Tuning data for window size: {}".format(numw))
    
    # Build the features with `numw` windows per sample, then shuffle the rows and labels together
    data = create_data(samples, numw)
    data_shuffled, labels_shuffled = shuffle(data, labels, random_state=random_state_seed)
    bparams, best_model = perform_gird_search(data_shuffled, labels_shuffled)
    num_window_to_best_params[numw] = bparams
    
    # Accuracy of the refitted best model on the data it was tuned on (not a held-out score)
    print(best_model.score(data, labels))

Tuning data for window size: 3
Fitting 3 folds for each of 16 candidates, totalling 48 fits
1.0
Tuning data for window size: 4
Fitting 3 folds for each of 16 candidates, totalling 48 fits
1.0


In [None]:
# Print the best parameters recorded for every number of windows that was tuned
for window_size, parameters in num_window_to_best_params.items():
    print("Best parameters for Window Size: {}".format(window_size))
    print(parameters)

Best parameters for Window Size: 3
{'SVM__C': 0.05, 'SVM__gamma': 0.1, 'pca__n_components': 2}
Best parameters for Window Size: 4
{'SVM__C': 0.05, 'SVM__gamma': 0.1, 'pca__n_components': 2}


## Model Training

In [None]:
# Number of windows per sample used for the final model
best_window_size = 3

# Read the tuned parameters through best_window_size, so that changing the
# value above also switches the hyperparameters used below
best_pca__n_components = num_window_to_best_params[best_window_size]["pca__n_components"]
best_SVM__C = num_window_to_best_params[best_window_size]["SVM__C"]
best_SVM__gamma = num_window_to_best_params[best_window_size]["SVM__gamma"]

In [None]:
# Final model: same scaler -> PCA -> RBF SVM steps as the grid search, with the selected parameters
pipeline = Pipeline(
    [('scaler', StandardScaler()),
     ('pca', PCA(n_components = best_pca__n_components)),
     ('SVM', SVC(kernel='rbf', random_state=random_state_seed,C = best_SVM__C, gamma=best_SVM__gamma,))]
)

# Build the training features with the selected number of windows and shuffle rows and labels together
data = create_data(samples, best_window_size)
data_shuffled, labels_shuffled = shuffle(data, labels, random_state=random_state_seed)

# Train on the whole shuffled training set
pipeline.fit(data_shuffled, labels_shuffled)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('SVM', SVC(C=0.05, gamma=0.1, random_state=1234))])

In [None]:
# Accuracy on the training data itself (the same samples the pipeline was fitted on)
pipeline.score(data, labels)

1.0

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted ones; the reversed order would transpose the matrix
confusion_matrix(labels, pipeline.predict(data))

array([[55,  0],
       [ 0, 54]])

## Model Testing

In [None]:
# Fetching the samples and the labels for the testing data
test_samples, test_labels = get_samples_and_labels(test_data_file)

In [None]:
# Build the test features with the same number of windows as the trained pipeline
test_data = create_data(test_samples, best_window_size)

In [None]:
# Shuffle the test rows and labels together, then report the accuracy of the trained pipeline on them
shuffled_test_data, shuffled_test_labels = shuffle(test_data, test_labels, random_state = random_state_seed)
pipeline.score(shuffled_test_data, shuffled_test_labels)

1.0

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted ones; the reversed order would transpose the matrix
confusion_matrix(shuffled_test_labels, pipeline.predict(shuffled_test_data))

array([[7, 0],
       [0, 6]])