In [None]:
!pip install --upgrade pip 
!pip install watermark
!(cd .. && pip install -e . )# install the package in editable mode

In [None]:
# Load the watermark extension and print version information for the listed packages.
%reload_ext watermark
%watermark -p torch,pandas,matplotlib,scikit-learn,seaborn,scipy -v -m

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, DataLoader
import warnings

warnings.filterwarnings("ignore")


### Load data from the data directory in train & test sets

Labels are:

    - 0 Walking
    - 1 Walking upstairs
    - 2 Walking downstairs
    - 3 Sitting
    - 4 Standing
    - 5 Laying

In [None]:
def load_file(filepath):
    """Read a header-less, whitespace-delimited text file into a NumPy array.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per line of the file and one
        column per whitespace-separated field.
    """
    # ``sep=r"\s+"`` is the documented equivalent of ``delim_whitespace=True``,
    # which is deprecated since pandas 2.2 and emits a FutureWarning.
    dataframe = pd.read_csv(filepath, header=None, sep=r"\s+")
    return dataframe.values

def load_group(filenames, prefix=''):
    """Load several files and stack them along a new third axis.

    Args:
        filenames: Iterable of file names to read with ``load_file``.
        prefix: String prepended to every file name (typically a directory path).

    Returns:
        The arrays returned by ``load_file`` combined with ``np.dstack``, so
        each file becomes one slice along the last dimension.
    """
    per_file = [load_file(prefix + name) for name in filenames]
    # dstack puts the i-th file at index i of the 3rd dimension
    return np.dstack(per_file)

# load a dataset group, such as train or test
def load_dataset_group(group, prefix=''):
    """Load the nine inertial-signal files and the label file of one group.

    Args:
        group: Name of the group directory and file suffix, e.g. ``'train'`` or ``'test'``.
        prefix: Path prefix of the dataset root (must end with a path separator).

    Returns:
        ``(X, y)`` where ``X`` is the stacked array produced by ``load_group``
        and ``y`` is the array read from ``y_<group>.txt``.
    """
    signals_dir = prefix + group + '/Inertial Signals/'
    # Same order as before: total acceleration, body acceleration, body gyroscope,
    # each with its x, y and z axis.
    filenames = [
        f'{signal}_{axis}_{group}.txt'
        for signal in ('total_acc', 'body_acc', 'body_gyro')
        for axis in ('x', 'y', 'z')
    ]
    X = load_group(filenames, signals_dir)
    y = load_file(prefix + group + '/y_' + group + '.txt')
    return X, y

# load the dataset, returns train and test X and y elements
def load_dataset(prefix=''):
    """Load the train and test groups of the UCI HAR dataset.

    Args:
        prefix: Path prefix of the directory that contains ``UCI HAR Dataset/``.

    Returns:
        ``(trainX, trainy, testX, testy)``. The label arrays are shifted by -1
        so that class ids start at 0.
    """
    dataset_root = prefix + 'UCI HAR Dataset/'

    trainX, trainy = load_dataset_group('train', dataset_root)
    print(trainX.shape, trainy.shape)

    testX, testy = load_dataset_group('test', dataset_root)
    print(testX.shape, testy.shape)

    # zero-offset the class values
    return trainX, trainy - 1, testX, testy - 1


# Load the raw train / test arrays from the repository's raw_data directory.
train_X, train_y, test_X, test_y = load_dataset(prefix="../raw_data/")

# At this point train_X is indexed as (samples, timesteps, features).
n_timesteps, n_features = train_X.shape[1], train_X.shape[2]
# The labels are integer class ids (the one-hot encoding in load_dataset is
# disabled), so ``train_y.shape[1]`` is only the width of the label column.
# Count the distinct class ids to get the real number of outputs.
n_outputs = len(np.unique(train_y))

print("n_timesteps: ", n_timesteps)
print("n_features: ", n_features)
print("n_outputs: ", n_outputs)

### (Not used) Let's create overlapping 5-second windows (100 samples each, advancing by 50 samples)

In [None]:
window_size = 100
step_size = 50


def sliding_windows(df, window_size, step_size):
    """Cut a recording into overlapping fixed-size windows.

    Args:
        df: DataFrame with the columns ``'x-axis'``, ``'y-axis'``, ``'z-axis'``
            and ``'activity'``.
        window_size: Number of consecutive rows per window.
        step_size: Number of rows between the starts of two consecutive windows.

    Returns:
        ``(windows, labels)`` where ``windows`` is a list of ``[xs, ys, zs]``
        array triples and ``labels`` holds the most frequent activity of each
        window.
    """
    windows, labels = [], []
    for start in range(0, df.shape[0] - window_size, step_size):
        stop = start + window_size
        xs = df['x-axis'].values[start:stop]
        ys = df['y-axis'].values[start:stop]
        zs = df['z-axis'].values[start:stop]
        # Series.mode() returns the modes sorted, so ties resolve to the smallest
        # value; unlike scipy.stats.mode(...)[0][0] it does not depend on the
        # SciPy version and also works for string labels.
        label = df['activity'].iloc[start:stop].mode().iloc[0]

        windows.append([xs, ys, zs])
        labels.append(label)
    return windows, labels


# This section is not used by the rest of the notebook and relies on df_train /
# df_test, which are not created anywhere above. Only run it when they exist so
# that "Restart & Run All" does not stop here with a NameError.
data, target = [], []
train_size = 0
if 'df_train' in globals() and 'df_test' in globals():
    data, target = sliding_windows(df_train, window_size, step_size)
    train_size = len(data)

    # The same window size is used for the labels of both splits.
    _test_windows, _test_labels = sliding_windows(df_test, window_size, step_size)
    data += _test_windows
    target += _test_labels
else:
    print('Skipping window creation: df_train / df_test are not defined.')

#### Check distribution of test labels

In [None]:
# test_y is still 2-D here (it is flattened further down); pass a 1-D view so
# seaborn draws a single distribution, and use one bar per integer class id.
sns.histplot(test_y.ravel(), discrete=True).set(
    title="Test label distribution", xlabel="Class id", ylabel="Count");

In [None]:
# Shape of the raw training array; as unpacked after loading, axis 1 is timesteps and axis 2 is features.
train_X.shape

In [None]:
# Reorder (samples, timesteps, features) -> (samples, features, timesteps),
# presumably because the Conv1d-based classifier expects channels first.
# The shape check keeps the cell idempotent: without it, running the cell a
# second time would silently transpose the array back.
if train_X.shape[1:] == (n_timesteps, n_features):
    train_X = np.transpose(train_X, (0, 2, 1))
train_X.shape

In [None]:
# Same reordering as for the training array: (samples, timesteps, features)
# -> (samples, features, timesteps). Guarded so that re-running the cell does
# not flip the axes back.
if test_X.shape[1:] == (n_timesteps, n_features):
    test_X = np.transpose(test_X, (0, 2, 1))
test_X.shape

In [None]:
# Collapse the label arrays to 1-D vectors of class ids.
train_y = train_y.flatten()
test_y = test_y.flatten()

From here onwards, we could transform each 5sec window into a feature vector to eliminate time dependency. We could use the following features:

    - mean of x, y, z,
    - standard deviation of x, y, z,
    - ...

Our preference is to feed the time series directly into a neural network. We will use a 1D convolutional classifier to predict the activity class.

### My PYTORCH model

In [None]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# from sklearn.preprocessing import LabelEncoder  

# le = LabelEncoder()
# target = le.fit_transform(target)

In [None]:
def create_datasets(X, y, valid_pct=0.1, seed=None):
    """Convert NumPy arrays into PyTorch datasets.

    ``X`` and ``y`` are each a ``(train_and_validation, test)`` pair. The first
    element is split at random into a training and a validation part; the
    second element becomes the (labelled) test dataset unchanged.

    Args:
        X: Pair ``(tv_X, test_X)`` of input arrays.
        y: Pair ``(tv_y, test_y)`` of label arrays.
        valid_pct: Share of the first pair held out for validation.
        seed: Random state forwarded to ``train_test_split``.

    Returns:
        ``(trn_ds, val_ds, tst_ds)`` as ``TensorDataset`` objects with float
        inputs and long labels.
    """
    def _to_dataset(features, labels):
        # inputs are cast to float32 and labels to int64
        return TensorDataset(
            torch.tensor(features).float(),
            torch.tensor(labels).long())

    tv_X, test_X = X
    tv_y, test_y = y

    # split row indices rather than the arrays so X and y stay aligned
    row_ids = np.arange(tv_X.shape[0])
    trn_idx, val_idx = train_test_split(row_ids, test_size=valid_pct, random_state=seed)

    print("trn_idx.shape: ", trn_idx.shape)
    print("val_idx.shape: ", val_idx.shape)

    train_X, train_y = tv_X[trn_idx], tv_y[trn_idx]
    valid_X, valid_y = tv_X[val_idx], tv_y[val_idx]

    for caption, array in (
        ("train_X.shape: ", train_X),
        ("train_y.shape: ", train_y),
        ("valid_X.shape: ", valid_X),
        ("valid_y.shape: ", valid_y),
    ):
        print(caption, array.shape)

    trn_ds = _to_dataset(train_X, train_y)
    val_ds = _to_dataset(valid_X, valid_y)
    tst_ds = _to_dataset(test_X, test_y)
    return trn_ds, val_ds, tst_ds

def create_loaders(data, bs=128, jobs=0):
    """Wrap the three datasets returned by ``create_datasets`` in data loaders.

    Args:
        data: Triple ``(trn_ds, val_ds, tst_ds)``.
        bs: Batch size used by all three loaders.
        jobs: Number of worker processes passed as ``num_workers``.

    Returns:
        ``(trn_dl, val_dl, tst_dl)``; only the training loader shuffles.
    """
    trn_ds, val_ds, tst_ds = data

    def _loader(dataset, shuffle):
        return DataLoader(dataset, batch_size=bs, shuffle=shuffle, num_workers=jobs)

    return _loader(trn_ds, True), _loader(val_ds, False), _loader(tst_ds, False)

In [None]:
from ml.models.classifiers.conv1d import Classifier

In [None]:
# Build the train / validation / test datasets; the fixed seed makes the
# train/validation split reproducible.
datasets = create_datasets(
    X=(train_X, test_X),
    y=(train_y, test_y),
    seed=42,
)

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
import os

# train_X still holds the training AND validation rows here, so train_size is
# not the number of samples the training loader yields. It is kept only for
# backward compatibility and is no longer used to normalise the loss.
train_size = train_X.shape[0]
nb_feat = train_X.shape[1]

trn_dl, val_dl, tst_dl = create_loaders(datasets, bs=128)

print(len(trn_dl), len(val_dl), len(tst_dl))

lr = 0.001
n_epochs = 1500
iterations_per_epoch = len(trn_dl)
num_classes = 6
best_acc = 0
# early stopping: give up after `patience` epochs without a better validation accuracy
patience, trials = 200, 0
# progress is printed at epochs 1, 2, 4, 8, ... (base is multiplied by step after each print)
base = 1
step = 2
loss_history = []
acc_history = []

# Make sure the checkpoint directory exists before the first torch.save call.
best_model_path = '../models/best.pth'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

model = Classifier(nb_feat, num_classes).to(device)

# reduction='sum' returns the summed loss of a batch; it is divided by the
# number of samples at the end of each epoch to get a per-sample mean.
criterion = nn.CrossEntropyLoss(reduction='sum')

opt = optim.Adam(model.parameters(), lr=lr)

print('Start model training')

for epoch in range(1, n_epochs + 1):

    # ---- training pass ----
    model.train()
    epoch_loss = 0
    total = 0
    for batch in trn_dl:
        x_batch, y_batch = [t.to(device) for t in batch]
        opt.zero_grad()
        out = model(x_batch)
        loss = criterion(out, y_batch)
        epoch_loss += loss.item()
        loss.backward()
        opt.step()
        total += y_batch.size(0)

    # Average over the samples actually seen in this epoch. Dividing by
    # train_size (train + validation rows) understated the loss.
    epoch_loss /= max(total, 1)
    loss_history.append(epoch_loss)

    # ---- validation pass ----
    model.eval()
    correct, total = 0, 0
    # no_grad avoids building the autograd graph during evaluation
    with torch.no_grad():
        for batch in val_dl:
            x_batch, y_batch = [t.to(device) for t in batch]
            out = model(x_batch)
            # log_softmax is monotonic, so the argmax of the raw scores is the same class
            preds = out.argmax(dim=1)
            total += y_batch.size(0)
            correct += (preds == y_batch).sum().item()

    acc = correct / total
    acc_history.append(acc)

    if epoch % base == 0:
        print(f'Epoch: {epoch:3d} Loss: {epoch_loss:.4f}. Acc.: {acc:2.2%}')
        base *= step

    if acc > best_acc:
        trials = 0
        best_acc = acc
        torch.save(model.state_dict(), best_model_path)
        print(f'Epoch: {epoch:3d} best model saved with accuracy: {best_acc:2.2%}')
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break

print('Done!')

In [None]:
def smooth(y, box_pts):
    """Smooth a 1-D sequence with a moving average of ``box_pts`` points.

    Args:
        y: 1-D sequence of numbers.
        box_pts: Width of the averaging window.

    Returns:
        Array of the same length as ``y`` (``mode='same'``); values near both
        ends are averaged against implicit zeros.
    """
    kernel = np.ones(box_pts) / box_pts
    return np.convolve(y, kernel, mode='same')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(12, 4))

# loss_history is filled from the training batches, so this panel shows the
# training loss (it was previously mislabelled as validation loss).
ax[0].plot(loss_history, label='loss')
ax[0].set_title('Training Loss History')
ax[0].set_xlabel('Epoch no.')
ax[0].set_ylabel('Loss')

# acc_history is measured on the validation loader. The last two smoothed
# points are dropped because the 5-point moving average runs past the end of
# the series there.
ax[1].plot(smooth(acc_history, 5)[:-2], label='acc')
ax[1].set_title('Validation Accuracy History')
ax[1].set_xlabel('Epoch no.')
ax[1].set_ylabel('Accuracy');

### Using test set to evaluate model

In [None]:
# Rebuild the architecture and restore the best checkpoint saved during training.
model = Classifier(nb_feat, num_classes).to(device)

# map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('../models/best.pth', map_location=device))
model.eval()

correct, total = 0, 0
# Evaluate on the held-out test loader. The validation loader was used before,
# which reported the validation accuracy under the "Test accuracy" label.
with torch.no_grad():
    for batch in tst_dl:
        x_batch, y_batch = [t.to(device) for t in batch]
        out = model(x_batch)
        # log_softmax is monotonic, so the argmax of the raw scores is the same class
        y_hat = out.argmax(dim=1)
        total += y_batch.size(0)
        correct += (y_hat == y_batch).sum().item()

print(f"Test accuracy: {correct / total:2.2%}")

In [None]:
# Take the first sample of the first test batch and run a single prediction.
first_batch = next(iter(tst_dl))
# A batch is an (inputs, labels) pair; the labels were previously read from
# first_batch[0][1], which is the second *input* sample.
x_batch, y_batch = first_batch[0], first_batch[1]
x = x_batch[0].to(device)
y = y_batch[0].to(device)

# Add the batch dimension without hard-coding the (9, 128) sample shape.
x_test = x.unsqueeze(0)

with torch.no_grad():
    y_test = model(x_test).argmax(dim=1)
print(y_test)

### Create an MLFlow model

In [None]:
!pip install mlflow"[skinny]"

In [None]:
import cloudpickle
import platform
import os
import shutil

# The package itself must be imported: the code below refers to
# ``mlflow.pyfunc.PythonModel`` and ``mlflow.pyfunc.save_model``.
import mlflow
import mlflow.pyfunc
from mlflow import __version__ as mlflow_version
from mlflow.pyfunc import PythonModel, save_model, load_model
# from mlflow.pytorch import save_model, load_model
from mlflow.models import Model, ModelSignature
from mlflow.models.signature import infer_signature

from pandas import __version__ as pd_version

# NOTE: this rebinds save_model / load_model to the mlflow.pytorch versions;
# the packaging call below therefore uses the fully qualified mlflow.pyfunc name.
from mlflow.pytorch import log_model, save_model, load_model
from torch import __version__ as pytorch_version

cp_version = cloudpickle.__version__
py_version = platform.python_version()
# Drop a local build suffix such as "+cu121", which cannot be pinned in a pip requirement.
torch_pip_version = pytorch_version.split('+')[0]

# The pip distribution of PyTorch is called "torch" (not "pytorch"); pandas is
# listed because ModelWrapper.predict returns a DataFrame.
conda_env_template = """
channels:
- defaults
dependencies:
- python={python_version}
- pip
- pip:
  - cloudpickle=={cp_version}
  - mlflow=={mlflow_version}
  - pandas=={pd_version}
  - torch=={pytorch_version}

name: mlflow-env
""".format(python_version=py_version, mlflow_version=mlflow_version, pd_version=pd_version,
           pytorch_version=torch_pip_version, cp_version=cp_version)

conda_env_path = 'pyfunc_conda_env.yml'
with open(conda_env_path, 'w') as f:
    f.write(conda_env_template)

state_dict_path = '../models/best.pth'
artifacts = {
    "state_dict_model": state_dict_path,
}

class ModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper that serves the trained classifier from its state dict."""

    def load_context(self, context):
        """Rebuild the classifier and load the weights stored as an artifact.

        Args:
            context: MLflow model context; ``context.artifacts["state_dict_model"]``
                is the path of the saved state dict.
        """
        import torch
        from ml.models.classifiers.conv1d import Classifier

        # 9 input channels and 6 classes, matching the model trained above.
        self._model = Classifier(9, 6)
        self._model.load_state_dict(torch.load(context.artifacts["state_dict_model"], map_location="cpu"))
        self._model.eval()

    def predict(self, context, input_model):
        """Predict the class id of every sample in ``input_model``.

        Args:
            context: MLflow model context (unused).
            input_model: Array-like batch of inputs; assumed to have the
                (batch, 9, 128) layout used for training -- TODO confirm
                against the serving payload.

        Returns:
            ``pandas.DataFrame`` with one ``"class"`` column holding one
            predicted class id per input sample.
        """
        import numpy as np
        import pandas as pd
        import torch

        inputs = torch.as_tensor(np.asarray(input_model), dtype=torch.float32)
        with torch.no_grad():
            out = self._model(inputs)
        # the highest raw score is also the highest (log-)softmax probability
        predictions = out.argmax(dim=1)

        results = {
            "class": predictions.tolist(),
        }
        return pd.DataFrame.from_dict(results)

# model = ModelWrapper()
# context = mlflow.pyfunc.PythonModelContext(artifacts=artifacts)
# print(context)
# model.load_context(context=context)

# .cpu() is required before .numpy() when the tensors live on a CUDA device.
signature = infer_signature(x_test.detach().cpu().numpy(), y_test.detach().cpu().numpy())

## Package the model
# Local directory where the packaged model is written
model_path = "/tmp/model"
# If the directory already exists, delete it
if os.path.exists(model_path):
    shutil.rmtree(model_path)

# Package the model!
# NOTE(review): only conv1d.py is bundled, while load_context imports
# ``ml.models.classifiers.conv1d`` -- confirm the package path resolves when the
# model is loaded elsewhere. Newer MLflow releases also name this argument
# ``code_paths``; verify against the installed version.
mlflow.pyfunc.save_model(path=model_path,
           python_model=ModelWrapper(),
           artifacts=artifacts,
           conda_env=conda_env_path,
           code_path=['../ml/models/classifiers/conv1d.py',],
           signature=signature,)

# zip the model and save it to the local file named model.zip
shutil.make_archive(model_path, 'zip', model_path)
