In [75]:
import pandas as pd
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [76]:
### Change the CSV path below to point at your local copy of the features/labels file
data = pd.read_csv('data/0A986513-7828-4D53-AA1F-E02D6DF9561B.features_labels.csv')
# Show the first rows as a quick sanity check of the loaded columns
data.head()

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:STAIRS_-_GOING_DOWN,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source
0,1449601597,1.000371,0.007671,-0.016173,0.02786,0.998221,1.000739,1.003265,0.891038,6.684582,...,,,,,,,,,,-1
1,1449601657,1.000243,0.003782,-0.002713,0.007046,0.998463,1.000373,1.002088,1.647929,6.684605,...,,,,,,,,,,-1
2,1449601717,1.000811,0.002082,-0.001922,0.003575,0.999653,1.000928,1.002032,1.960286,6.68461,...,,,,,,,,,,-1
3,1449601777,1.001245,0.004715,-0.002895,0.008881,0.999188,1.001425,1.0035,1.614524,6.684601,...,,,,,,,,,,-1
4,1449601855,1.001354,0.065186,-0.09652,0.165298,1.000807,1.002259,1.003631,0.83779,6.682252,...,0.0,,0.0,1.0,,,,,0.0,2


# Filling missing values: column means for continuous features, zeros for discrete/label columns

In [77]:
def interpolation(df):
    """
    Fill missing values in a feature/label dataframe.

    Columns whose name starts with 'discrete' or 'label' have their NaNs replaced with 0.
    Every other column is treated as continuous and has its NaNs replaced with that column's mean.

    Parameters:
        df: pandas DataFrame to fill. It is not modified.

    Returns:
        A new DataFrame holding the continuous columns first, followed by the
        discrete/label columns; within each group the original column order is kept.
    """
    categorical_prefixes = ('discrete', 'label')
    # Classify every column once; str() keeps non-string column labels from raising
    is_categorical = [str(col).startswith(categorical_prefixes) for col in df.columns]
    is_continuous = [not flag for flag in is_categorical]

    continuous = df.loc[:, is_continuous]
    categorical = df.loc[:, is_categorical]

    df_with_avg = continuous.fillna(continuous.mean()) # Continuous columns: NaN -> column mean
    df_with_zero = categorical.fillna(0) # Label and discrete columns: NaN -> 0

    return pd.concat([df_with_avg, df_with_zero], axis = 1)

In [78]:
# Raw inputs: columns 1..26 of the loaded frame as features, the SITTING flag as the label
X = data.iloc[:,1:27]
y = data[['label:SITTING']]

# Fill NaNs (column means for continuous columns, zeros for label/discrete columns)
X = interpolation(X)
y = interpolation(y)

### Select samples of acceleration where the person is sitting
# reset_index(drop = True) renumbers the rows without creating a throwaway 'index' column
X = X[y['label:SITTING'] == 1].reset_index(drop = True)
y = y[y['label:SITTING'] == 1].reset_index(drop = True)

# Continue with plain NumPy arrays
X = X.values
y = y.values

In [79]:
# NOTE(review): this import sits mid-notebook; consider moving it into the import cell at the top
from sklearn.preprocessing import MinMaxScaler
# The scaler is fitted on X as it stands here, i.e. only the rows where label:SITTING == 1
mm = MinMaxScaler()
X = mm.fit_transform(X)
# NOTE(review): this displays the array repr; a slice such as X[:5] would be a lighter preview
X

array([[0.03055181, 0.11061779, 0.12862242, ..., 0.44251652, 0.73594585,
        0.48495746],
       [0.03274136, 0.11517124, 0.39588783, ..., 0.61975644, 0.43366874,
        0.22504885],
       [0.03146445, 0.0019387 , 0.22650965, ..., 0.61342338, 0.47548982,
        0.7131712 ],
       ...,
       [0.03284124, 0.04673182, 0.2552768 , ..., 0.76676471, 0.64817891,
        0.5024305 ],
       [0.03297246, 0.00750709, 0.22582391, ..., 0.45546619, 0.73917111,
        0.77494757],
       [0.03262581, 0.00437283, 0.23225078, ..., 0.58757284, 0.49265173,
        0.4306445 ]])

In [80]:
# Wrap the scaled sitting features and their labels as tensors
# NOTE(review): torch.tensor presumably keeps the NumPy dtype here; consumers cast with .float() — confirm
train_features = torch.tensor(X)
train_labels = torch.tensor(y)
batch_size = 60

# Pair features with labels; the GAN training loop unpacks the labels as `_` and ignores them
train_data = torch.utils.data.TensorDataset(train_features, train_labels)
train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size, shuffle = True) ### 66 batches

In [85]:
def generator_block(input_dim, output_dim):
    """
    Build one generator stage: Linear -> BatchNorm1d -> in-place ReLU.

    Parameters:
        input_dim: width of the incoming feature vector.
        output_dim: width of the produced feature vector.

    Returns:
        An nn.Sequential holding the three layers in that order.
    """
    stage = (
        nn.Linear(input_dim, output_dim),
        nn.BatchNorm1d(output_dim),
        nn.ReLU(inplace = True),
    )
    return nn.Sequential(*stage)
def get_noise(n_samples, z_dim, device = None):
    """
    Draw latent vectors from a standard normal distribution.

    Parameters:
        n_samples: number of latent vectors (rows).
        z_dim: dimensionality of each latent vector (columns).
        device: optional torch device for the result; None keeps torch's default device.

    Returns:
        A tensor of shape (n_samples, z_dim).
    """
    return torch.randn(n_samples, z_dim, device = device)

class Generator(nn.Module):
    """
    Fully connected generator mapping latent noise to a feature vector.

    Three generator blocks (widths hidden_dim * 2, hidden_dim, hidden_dim / 2) are followed
    by a Linear projection to feature_dim and a Sigmoid, so every output lies in (0, 1).
    """
    def __init__(self, z_dim = 10, feature_dim = 26, hidden_dim = 128):
        super().__init__()
        wide_dim = hidden_dim * 2
        half_dim = int(hidden_dim * 0.5)
        self.gen = nn.Sequential(
            generator_block(z_dim, wide_dim),
            generator_block(wide_dim, hidden_dim),
            generator_block(hidden_dim, half_dim),
            nn.Linear(half_dim, feature_dim),
            nn.Sigmoid(),
        )

    def forward(self, noise):
        """Run a batch of latent vectors through the generator stack."""
        fake_features = self.gen(noise)
        return fake_features

def discriminator_block(input_dim, output_dim):
    """
    Build one discriminator stage: Linear -> LeakyReLU(0.2) -> Dropout.

    Parameters:
        input_dim: width of the incoming feature vector.
        output_dim: width of the produced feature vector.

    Returns:
        An nn.Sequential holding the three layers in that order.
    """
    stage = (
        nn.Linear(input_dim, output_dim),
        nn.LeakyReLU(0.2),
        nn.Dropout(),
    )
    return nn.Sequential(*stage)

class Discriminator(nn.Module):
    """
    Fully connected discriminator producing one raw score (logit) per feature vector.

    Three discriminator blocks (widths hidden_dim, hidden_dim / 2, hidden_dim) are followed
    by a Linear layer with a single output; no sigmoid is applied here.
    """
    def __init__(self, feature_dim = 26, hidden_dim = 128):
        super().__init__()
        half_dim = int(hidden_dim * 0.5)
        self.disc = nn.Sequential(
            discriminator_block(feature_dim, hidden_dim),
            discriminator_block(hidden_dim, half_dim),
            discriminator_block(half_dim, hidden_dim),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, feature_vector):
        """Score a batch of feature vectors."""
        scores = self.disc(feature_vector)
        return scores

def get_disc_loss(gen, disc, criterion, real_features, batch_size, z_dim):
    """
    Compute the discriminator loss for one batch.

    Fake samples are generated from fresh noise and detached so no gradient reaches the
    generator. The discriminator is scored against all-zero targets on the fakes and
    all-one targets on the real batch; the two losses are averaged.

    Parameters:
        gen: generator module.
        disc: discriminator module.
        criterion: loss callable taking (predictions, targets).
        real_features: batch of real feature vectors.
        batch_size: number of fake samples to generate.
        z_dim: latent dimension expected by the generator.

    Returns:
        The mean of the fake-batch and real-batch losses.
    """
    noise = get_noise(batch_size, z_dim)
    fakes = gen(noise).detach()

    # Fakes are scored first, then reals, matching the order of discriminator forward passes
    fake_scores = disc(fakes)
    fake_term = criterion(fake_scores, torch.zeros_like(fake_scores))

    real_scores = disc(real_features)
    real_term = criterion(real_scores, torch.ones_like(real_scores))

    return (fake_term + real_term) / 2

def get_gen_loss(gen, disc, criterion, batch_size, z_dim):
    """
    Compute the generator loss for one batch.

    Fresh fake samples are scored by the discriminator and compared against all-one
    targets, i.e. the generator is rewarded when its fakes are scored as real.

    Parameters:
        gen: generator module.
        disc: discriminator module.
        criterion: loss callable taking (predictions, targets).
        batch_size: number of fake samples to generate.
        z_dim: latent dimension expected by the generator.

    Returns:
        The loss of the discriminator's scores on the fakes against targets of 1.
    """
    noise = get_noise(batch_size, z_dim)
    scores = disc(gen(noise))
    targets = torch.ones_like(scores)
    return criterion(scores, targets)

In [86]:
# Loss applied to raw discriminator scores (the Discriminator ends in a plain Linear layer)
criterion = nn.BCEWithLogitsLoss()
n_epochs = 50
# Latent size passed to Generator below; the class default would otherwise be 10
z_dim = 100
# NOTE(review): 1e-6 is a very small Adam step size — confirm this is intentional
lr = 0.000001

# Discriminator keeps its defaults (feature_dim = 26, hidden_dim = 128)
disc = Discriminator()
gen = Generator(z_dim)

# One Adam optimizer per network so each can be stepped on its own
opt_disc = optim.Adam(disc.parameters(), lr = lr)
opt_gen = optim.Adam(gen.parameters(), lr = lr)

# Training Loop

In [83]:
class Classifier(nn.Module):
    """
    Small fully connected classifier: two 128-unit ReLU hidden layers and a linear
    output layer, with a sigmoid applied to the result in forward().
    """
    def __init__(self, input_size, output_size):
        super().__init__()
        width = 128
        layers = (
            nn.Linear(input_size, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, output_size),
        )
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return sigmoid-squashed outputs (values in (0, 1)) for a batch of inputs."""
        raw_scores = self.classifier(x)
        return torch.sigmoid(raw_scores)

def get_fake_samples(gen, batch_size, z_dim):
    """
    Generates fake acceleration features given a batch size, latent vector dimension, and trained generator.

    Parameters:
        gen: generator module to sample from.
        batch_size: number of fake feature vectors to produce.
        z_dim: latent dimension expected by the generator.

    Returns:
        A 2D tensor with batch_size rows; its width is whatever the generator outputs
        (the generator's feature dimension, not z_dim). The tensor carries no autograd graph.
    """
    latent_vectors = get_noise(batch_size, z_dim) ### Retrieves a 2D tensor of noise of size batch_size x z_dim
    # Sampling only: no gradients are needed, so skip building the autograd graph
    with torch.no_grad():
        fake_features = gen(latent_vectors)

    return fake_features
    

def create_fake_dataset(gen, z_dim, data, scaler = None):
    '''
    Creates a training/test set with 50% fake sitting features and 50% real non-sitting features.

    Parameters:
        gen: generator used to produce the fake "sitting" samples.
        z_dim: latent dimension expected by the generator.
        data: dataframe holding the feature columns (positions 1..26) and 'label:SITTING'.
        scaler: optional fitted transformer exposing .transform(). When given, it is applied
            to the real non-sitting features. NOTE(review): the generator ends in a Sigmoid,
            so its outputs lie in (0, 1); without the same scaling that was applied to its
            training data, real and fake rows differ in scale and a classifier can separate
            them trivially. Pass the fitted scaler to avoid this. The default (None) keeps
            the previous behaviour of using unscaled real features.

    Returns:
        X_train, y_train, X_test, y_test as tensors; label 1 marks fake sitting rows,
        label 0 marks real non-sitting rows.

    Raises:
        ValueError: if `data` contains no rows with label:SITTING == 0.
    '''
    ### Retrieve real samples where the user wasn't sitting
    features = interpolation(data.iloc[:,1:27])
    sitting = interpolation(data[['label:SITTING']])['label:SITTING']
    X = features[sitting == 0].reset_index(drop = True).values ### Non-sitting features as an array

    if len(X) == 0:
        raise ValueError('No non-sitting samples (label:SITTING == 0) found in data')

    if scaler is not None:
        X = scaler.transform(X) ### Put real features on the scale the generator was trained on

    X_real, _ = train_test_split(X, test_size = 0.2) ### Use a random 80% of real non-sitting samples
    X_real = torch.tensor(X_real)

    ### Generate as many fake sitting samples as there are real non-sitting samples
    fake_sitting = get_fake_samples(gen, len(X_real), z_dim).detach()

    ### Cast explicitly so concatenation does not rely on implicit dtype promotion
    dataset = torch.cat((fake_sitting.to(X_real.dtype), X_real), dim = 0)

    one_labels = torch.ones(len(fake_sitting), 1) ### 1s correspond to sitting
    zero_labels = torch.zeros(len(X_real), 1) ### 0s correspond to not sitting
    labels = torch.cat((one_labels, zero_labels), dim = 0)

    ### Splitting into training and testing sets (train_test_split works on arrays)
    X_train, X_test, y_train, y_test = train_test_split(dataset.numpy(), labels.numpy(), test_size = 0.2)

    ### Converting back to tensors
    X_train = torch.tensor(X_train)
    X_test = torch.tensor(X_test)
    y_train = torch.tensor(y_train)
    y_test = torch.tensor(y_test)

    return X_train, y_train, X_test, y_test

def create_real_dataset(data):
    """
    Build a random 80/20 train/test split from the real data only.

    Features are columns 1..26 of `data` and the target is 'label:SITTING'; both have
    their missing values filled by interpolation() before splitting. The proportion of
    sitting vs. non-sitting rows is whatever the data contains (not a guaranteed 50-50 split).

    Returns:
        X_train, y_train, X_test, y_test as tensors.
    """
    feature_array = interpolation(data.iloc[:,1:27]).values
    target_array = interpolation(data[['label:SITTING']]).values

    pieces = train_test_split(feature_array, target_array, test_size = 0.2)

    ### train_test_split yields (X_train, X_test, y_train, y_test); convert each to a tensor
    X_train, X_test, y_train, y_test = (torch.tensor(piece) for piece in pieces)

    return X_train, y_train, X_test, y_test
    
def create_dataloader(X_train, y_train, X_test, y_test, batch_size):
    """
    Wrap the train and test tensors in shuffling DataLoaders.

    Parameters:
        X_train, y_train: training features and labels.
        X_test, y_test: testing features and labels.
        batch_size: batch size used by both loaders.

    Returns:
        (train_loader, test_loader); both are created with shuffle = True.
    """
    loaders = []
    for features, targets in ((X_train, y_train), (X_test, y_test)):
        paired = torch.utils.data.TensorDataset(features, targets)
        loaders.append(torch.utils.data.DataLoader(paired, batch_size = batch_size, shuffle = True))

    train_loader, test_loader = loaders
    return train_loader, test_loader

def _fit_classifier(model, train_loader, criterion, n_epochs, lr = 0.001):
    """Train `model` with Adam for n_epochs, printing the loss summed over batches after each epoch."""
    optimizer = optim.Adam(model.parameters(), lr = lr)

    for epoch in range(n_epochs):
        total_loss = 0
        for features, labels in train_loader:
            y_preds = model(features.float())
            loss = criterion(y_preds, labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch: {epoch + 1} | Total Batch Loss: {total_loss}')


def train_model_fake(gen, criterion, n_epochs, z_dim, data, batch_size, input_size, output_size):
    """
    Trains and evaluates a classifier on the dataset built by create_fake_dataset
    (generated "sitting" rows labelled 1 and real non-sitting rows labelled 0).

    Both the training and the test split come from that mixed dataset, so the test
    split also contains generated rows.

    Parameters:
        gen: generator used to produce the fake samples.
        criterion: loss callable taking (predictions, targets).
        n_epochs: number of training epochs.
        z_dim: latent dimension expected by the generator.
        data: dataframe passed through to create_fake_dataset.
        batch_size: batch size for the train and test loaders.
        input_size: number of input features of the classifier.
        output_size: number of classifier outputs.
    """
    ### Create a training/test set with fake features
    X_train, y_train, X_test, y_test = create_fake_dataset(gen, z_dim, data)
    train_loader, test_loader = create_dataloader(X_train, y_train, X_test, y_test, batch_size)

    ### Instantiate and train the model
    model = Classifier(input_size, output_size)
    _fit_classifier(model, train_loader, criterion, n_epochs)

    ### Evaluate Model's Performance
    evaluate_model(X_test, y_test, test_loader, model)

def train_model_real(criterion, n_epochs, data, batch_size, input_size = None, output_size = None):
    """
    Trains and evaluates a classifier on only real training and testing examples.

    Parameters:
        criterion: loss callable taking (predictions, targets).
        n_epochs: number of training epochs.
        data: dataframe passed through to create_real_dataset.
        batch_size: batch size for the train and test loaders.
        input_size: number of input features; when None it is taken from the width of the
            training features instead of relying on a notebook-level global.
        output_size: number of classifier outputs; when None it is taken from the width
            of the training labels.
    """
    X_train, y_train, X_test, y_test = create_real_dataset(data)
    train_loader, test_loader = create_dataloader(X_train, y_train, X_test, y_test, batch_size)

    ### Derive the layer sizes from the data unless the caller fixed them explicitly
    if input_size is None:
        input_size = X_train.shape[1]
    if output_size is None:
        output_size = y_train.shape[1]

    model = Classifier(input_size, output_size)
    _fit_classifier(model, train_loader, criterion, n_epochs)

    ### Evaluate Model's Performance
    evaluate_model(X_test, y_test, test_loader, model)

def evaluate_model(X_test, y_test, test_loader, model):
    """
    Prints and returns the classification accuracy, precision, recall, and F-1 score of a model.

    Parameters:
        X_test: test features; only its length is used, as the total number of samples.
        y_test: test labels; kept for interface compatibility, the labels actually scored
            are the ones yielded by test_loader.
        test_loader: iterable of (features, labels) batches.
        model: callable mapping a float feature batch to scores in [0, 1]; scores are
            rounded to obtain the predicted class.

    Returns:
        dict with keys 'accuracy' (in percent), 'precision', 'recall' and 'f1'.
        A metric whose denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    total_wrong = 0
    positive_preds = 0
    true_positives = 0
    false_negatives = 0

    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_preds = torch.round(model(batch_features.float()))
            # Align label shape with the predictions so comparisons never broadcast
            batch_labels = batch_labels.float().reshape(batch_preds.shape)

            predicted_positive = batch_preds == 1
            actually_positive = batch_labels == 1

            positive_preds += int(predicted_positive.sum().item())
            true_positives += int((predicted_positive & actually_positive).sum().item())
            false_negatives += int(((batch_preds == 0) & actually_positive).sum().item())
            total_wrong += (batch_preds - batch_labels).abs().sum().item()

    n_samples = len(X_test)
    class_acc = (n_samples - total_wrong) / n_samples * 100 if n_samples else 0.0
    precision = true_positives / positive_preds if positive_preds else 0.0
    actual_positives = true_positives + false_negatives
    recall = true_positives / actual_positives if actual_positives else 0.0
    f1 = 2*(precision * recall / (precision + recall)) if (precision + recall) else 0.0

    print(f'Classification Accuracy: {class_acc:.2f}')
    print(f'Precision: {precision:.2f}') #What percentage of a model's positive predictions were actually positive
    print(f'Recall: {recall:.2f}') #What percent of the true positives were identified
    print(f'F-1 Score: {f1:.2f}')

    return {'accuracy': class_acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [66]:
# BCELoss works on probabilities; Classifier.forward already applies a sigmoid
loss_function = nn.BCELoss()
# NOTE(review): n_epochs, z_dim and batch_size are notebook-level names shared with the GAN cells;
# assigning them here changes what those cells see if they are run afterwards
n_epochs = 20
z_dim = 100
batch_size = 50
input_size = 26
output_size = 1

# NOTE(review): `gen` must already exist in the kernel; the execution counts suggest the cells
# were run out of order — confirm whether the generator had been trained before this call
train_model_fake(gen, loss_function, n_epochs, z_dim, data, batch_size, input_size, output_size)
train_model_real(loss_function, n_epochs, data, batch_size)

Epoch: 1 | Total Batch Loss: 10.670419368892908
Epoch: 2 | Total Batch Loss: 0.2582301759393886
Epoch: 3 | Total Batch Loss: 0.048022057802882046
Epoch: 4 | Total Batch Loss: 0.02718917521997355
Epoch: 5 | Total Batch Loss: 0.017476253357017413
Epoch: 6 | Total Batch Loss: 0.01221564230218064
Epoch: 7 | Total Batch Loss: 0.008981192964711227
Epoch: 8 | Total Batch Loss: 0.006847762182587758
Epoch: 9 | Total Batch Loss: 0.0053979304793756455
Epoch: 10 | Total Batch Loss: 0.004357019912276883
Epoch: 11 | Total Batch Loss: 0.0035702220375242177
Epoch: 12 | Total Batch Loss: 0.0029781910379824694
Epoch: 13 | Total Batch Loss: 0.002517574088415131
Epoch: 14 | Total Batch Loss: 0.00215756164288905
Epoch: 15 | Total Batch Loss: 0.0018641315637069056
Epoch: 16 | Total Batch Loss: 0.0016313301921400125
Epoch: 17 | Total Batch Loss: 0.001433808663932723
Epoch: 18 | Total Batch Loss: 0.001271617616112053
Epoch: 19 | Total Batch Loss: 0.0011284047868684866
Epoch: 20 | Total Batch Loss: 0.001019256

In [71]:
# NOTE(review): this cell repeats the earlier GAN setup with n_epochs = 20 instead of 50;
# running it re-creates disc / gen / optimizers from scratch, discarding any earlier training
criterion = nn.BCEWithLogitsLoss()
n_epochs = 20
z_dim = 100
# NOTE(review): 1e-6 is a very small Adam step size — confirm this is intentional
lr = 0.000001

disc = Discriminator()
gen = Generator(z_dim)

opt_disc = optim.Adam(disc.parameters(), lr = lr)
opt_gen = optim.Adam(gen.parameters(), lr = lr)

In [72]:
# GAN training: for every real batch the discriminator takes 5 optimizer steps, then the
# generator takes 1. Every 10th epoch two classifiers are trained and evaluated as a probe.
for epoch in range(n_epochs):
    for batch_idx, (real_features, _) in enumerate(train_loader):
        # Keep the per-batch size in its own name: overwriting the notebook-level `batch_size`
        # would silently change the batch size handed to the classifier runs below
        cur_batch_size = len(real_features)
        if cur_batch_size < 2:
            # BatchNorm1d in the generator cannot normalise a single sample in training mode
            continue
        real_features = real_features.float()

        ### Training Discriminator
        for k in range(5):
            opt_disc.zero_grad()
            disc_loss = get_disc_loss(gen, disc, criterion, real_features, cur_batch_size, z_dim)
            # Each step builds a fresh graph, so there is nothing to retain between backward calls
            disc_loss.backward()
            opt_disc.step()

        ### Training Generator
        opt_gen.zero_grad()
        gen_loss = get_gen_loss(gen, disc, criterion, cur_batch_size, z_dim)
        gen_loss.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f'Epoch [{epoch + 1} / {n_epochs}] Loss D: {disc_loss.item():.4f}, Loss G: {gen_loss.item():.4f} '
            )

    if (epoch + 1) % 10 == 0:
        # NOTE(review): the classifiers reuse the GAN's n_epochs as their own epoch count — confirm intended
        train_model_fake(gen, loss_function, n_epochs, z_dim, data, batch_size, input_size, output_size)
        train_model_real(loss_function, n_epochs, data, batch_size)
            
    

Epoch [1 / 20] Loss D: 0.6909, Loss G: 0.6937 
Epoch [2 / 20] Loss D: 0.7011, Loss G: 0.6908 
Epoch [3 / 20] Loss D: 0.6981, Loss G: 0.6927 
Epoch [4 / 20] Loss D: 0.6946, Loss G: 0.6946 
Epoch [5 / 20] Loss D: 0.6908, Loss G: 0.6877 
Epoch [6 / 20] Loss D: 0.6897, Loss G: 0.6928 
Epoch [7 / 20] Loss D: 0.6855, Loss G: 0.6944 
Epoch [8 / 20] Loss D: 0.6926, Loss G: 0.6924 
Epoch [9 / 20] Loss D: 0.6892, Loss G: 0.6962 
Epoch [10 / 20] Loss D: 0.6954, Loss G: 0.6894 
Epoch: 1 | Total Batch Loss: 12.186321257497184
Epoch: 2 | Total Batch Loss: 0.11870711937081069
Epoch: 3 | Total Batch Loss: 0.03152973535907222
Epoch: 4 | Total Batch Loss: 0.015313363255700096
Epoch: 5 | Total Batch Loss: 0.008897444244212238
Epoch: 6 | Total Batch Loss: 0.0059209794490016066
Epoch: 7 | Total Batch Loss: 0.004194332048427896
Epoch: 8 | Total Batch Loss: 0.0031090115753613645
Epoch: 9 | Total Batch Loss: 0.002444064371047716
Epoch: 10 | Total Batch Loss: 0.0019151010828863946
Epoch: 11 | Total Batch Loss: