In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import pickle

import torch
from torch import nn
import torchvision
import torchvision.transforms as transforms
from tqdm import notebook
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_score, recall_score


if torch.cuda.is_available():
  device = torch.device("cuda:0")
  print("Running on CUDA: ", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
  device = torch.device("cpu")
  print("Running on CPU")

Running on CUDA:  NVIDIA GeForce RTX 4070


In [2]:
#Load in training data
df = pd.read_csv('./data/movementSensorData.csv')

### Exploring the data

In [3]:
df.head()

Unnamed: 0,id,activity,time_s,lw_x,lw_y,lw_z
0,63804,2,638.05,-0.188,-0.941,-0.316
1,63805,2,638.06,-0.121,-0.879,-0.32
2,63806,2,638.07,-0.07,-0.852,-0.305
3,63807,2,638.08,-0.023,-0.879,-0.277
4,63808,2,638.09,0.008,-0.941,-0.242


In [4]:
df.shape

(507827, 6)

In [5]:
df.describe()

Unnamed: 0,id,activity,time_s,lw_x,lw_y,lw_z
count,507827.0,507827.0,507827.0,507827.0,507827.0,507827.0
mean,167785.10183,4.191809,1677.861018,-0.211302,-0.021941,-0.477602
std,63888.316941,8.785676,638.883169,0.52407,0.727952,0.443465
min,63804.0,1.0,638.05,-5.289,-5.305,-6.875
25%,89195.0,2.0,891.96,-0.734,-0.219,-0.828
50%,188844.0,4.0,1888.45,-0.098,0.184,-0.57
75%,220583.0,4.0,2205.84,0.176,0.426,-0.148
max,252322.0,77.0,2523.23,5.516,4.418,4.551


In [6]:
df.isnull().sum()

id          0
activity    0
time_s      0
lw_x        0
lw_y        0
lw_z        0
dtype: int64

In [7]:
df['activity'].unique()

array([ 2, 77,  1,  3,  4], dtype=int64)

So we have activites 1, 2, 3, 4, and 77. From the source (https://physionet.org/content/accelerometry-walk-climb-drive/1.0.0/#files) we know that these are:
- 1 Walking
- 2 Descending Stairs
- 3 Ascending Stairs
- 4 Driving
- 77 Clapping


### Preprocessing

In [8]:
scaler=StandardScaler()
scaler.fit(df)
scaler_train = scaler.transform(df)

In [9]:
X = df.iloc[:, 2:6] #time and data minus activity
y = df.iloc[:, 1] #just activity
print(X)
print(y)

        time_s   lw_x   lw_y   lw_z
0       638.05 -0.188 -0.941 -0.316
1       638.06 -0.121 -0.879 -0.320
2       638.07 -0.070 -0.852 -0.305
3       638.08 -0.023 -0.879 -0.277
4       638.09  0.008 -0.941 -0.242
...        ...    ...    ...    ...
507822  963.87 -0.012  0.984 -0.363
507823  963.88  0.016  0.938 -0.379
507824  963.89  0.039  0.910 -0.391
507825  963.90  0.066  0.898 -0.395
507826  963.91  0.105  0.895 -0.398

[507827 rows x 4 columns]
0         2
1         2
2         2
3         2
4         2
         ..
507822    1
507823    1
507824    1
507825    1
507826    1
Name: activity, Length: 507827, dtype: int64


In [10]:
#ts_cv = TimeSeriesSplit(
#    n_splits=5,
#    gap=48,
#    max_train_size=10000,
#    test_size=1000,
#)
#
#all_splits = list(ts_cv.split(X, y))
#train_0, test_0 = all_splits[0]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5, random_state=101)

### Scikit-Learn RandomForestClassifier

In [12]:
#random_forest = RandomForestClassifier(max_depth=16, random_state=1452, n_estimators=1000)
#random_forest.fit(X_train, y_train) 

In [13]:
#y_pred = random_forest.predict(X_validation)
#print('accuracy', metrics.accuracy_score(y_validation, y_pred))
#print('f1', metrics.f1_score(y_validation, y_pred, average='weighted'))
##accuracy 0.9750113226867259
##f1 0.9736860087875407

In [14]:
#with open('models/random_forest.pickle', 'wb') as rf_file:
#    pickle.dump(random_forest, rf_file)

### PyTorch NN


In [15]:
#Split train into training/validation
X_tensor = torch.from_numpy(X.values.astype(np.float32))
y_tensor = torch.from_numpy(y.values.astype(np.float32))
training, validation = torch.utils.data.random_split(torch.cat(tensors=(X_tensor, y_tensor)), [0.7, 0.3])

RuntimeError: Tensors must have same number of dimensions: got 2 and 1

In [None]:
class BaseMLP(nn.Module):
  def __init__(self, input_size, hidden_width, hidden_depth, activation_func,
               dropout=0, num_classes=5):
    super().__init__()

    operations = [
        nn.Linear(input_size, hidden_width), #input layer -> hidden layer
        #nn.BatchNorm1d(hidden_width),
        #nn.Dropout(dropout),
        activation_func
    ]

    if (hidden_depth > 2): #shrink_width used to scale down the width (number of nodes) per layer
      shrink_width = int(hidden_width / (hidden_depth - 1))

    for i in range(2, hidden_depth):
      next_width = int(hidden_width - shrink_width)
      operations.extend([
          nn.Linear(hidden_width, next_width), #hidden layer i -> hidden layer i + 1
          #nn.BatchNorm1d(next_width),
          #nn.Dropout(dropout),
          activation_func
      ])
      hidden_width = next_width

    operations.append(nn.Linear(hidden_width, num_classes)) #hidden layer -> output layer

    self.sequence = nn.Sequential(*operations)
  
  def forward(self, x):
    out = self.sequence(x)
    return out

In [None]:
baseline_learning_rate = 0.001
baseline_dropout = 0.0
baseline_hidden_width = 40
baseline_hidden_depth = 2
baseline_activation_func = nn.ReLU()

baseline_model = BaseMLP(4, baseline_hidden_width, 
                         baseline_hidden_depth, baseline_activation_func, 
                         dropout=baseline_dropout).to(device)

baseline_optimiser = torch.optim.Adam(baseline_model.parameters(), lr=baseline_learning_rate)

baseline_lr_scheduler = torch.optim.lr_scheduler.StepLR(baseline_optimiser, step_size=10)

In [None]:
num_epochs = 1

In [None]:
#def trainModel(num_epochs, model, data, criterion, optimizer, scheduler):
baseline_model.train() #model in train mode
epochs = notebook.tqdm(range(num_epochs))

loss_list = []
accuracy_list = []

for epoch in epochs:
  total_loss = 0
  correct = 0
  running_total = 0
  for i in range(len(training)):
    # move tensors to device (CPU or GPU)
    X_train_i = training[:-1][i].to(device)
    y_train_i = training[-1][i].to(device)

    #forward pass
    predictions = baseline_model(X_train_i)
    loss = nn.CrossEntropyLoss()
    print('predictions', predictions)
    print('y_train_i', y_train_i)
    loss(predictions, y_train_i)
    
    #backward pass, optimise
    loss.backward() #backprop the loss
    total_loss += loss.item()
    
    baseline_optimiser.step()
    baseline_optimiser.zero_grad()

    _, predicted = torch.max(predictions.data, 1)
    correct += (predicted == y_train_i).sum().item()
    running_total += y_train_i.size(0)

    if (i + 1) % 10 == 0: #update progress bar every 10 batches
      epochs.set_description(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i+1}/{len(training)}], Loss: {loss.item():.4f}")
    
    loss_list.append(total_loss)
    accuracy_list.append(100 * correct / running_total)
    baseline_lr_scheduler.step() #np.mean(loss_list)



  0%|          | 0/1 [00:00<?, ?it/s]

tensor([ 9.1655e+02, -3.0500e-01, -1.1800e+00, -1.4100e-01], device='cuda:0')
predictions tensor([ -27.1389,   33.7549, -172.7104,  -85.9163, -163.1038],
       device='cuda:0', grad_fn=<AddBackward0>)
y_train_i tensor(927.2100, device='cuda:0')


RuntimeError: "nll_loss_forward_reduce_cuda_kernel_1d_index" not implemented for 'Float'

In [None]:
#baseline_predictions, baseline_valid_accuracy = testModel(baseline_model, X_validation, y_validation)