In [117]:
import pandas as pd
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [118]:
### Load the features/labels CSV into a DataFrame.
### Change the file path below to match where the data file lives on your machine.
data = pd.read_csv('data/0A986513-7828-4D53-AA1F-E02D6DF9561B.features_labels.csv')

# Interpolating acceleration columns with average values

In [119]:
def interpolation(df):
    """
    Fills missing values in a DataFrame.

    Columns whose name starts with 'discrete' or 'label' get NaNs replaced by 0;
    every other column gets NaNs replaced by that column's mean.

    Returns a new DataFrame with the mean-filled columns first, followed by the
    zero-filled ones (original order is kept inside each group).
    """
    zero_fill_prefixes = ('discrete', 'label')

    ### Partition the column names by prefix
    mean_fill_cols = [name for name in df.columns if not name.startswith(zero_fill_prefixes)]
    zero_fill_cols = [name for name in df.columns if name.startswith(zero_fill_prefixes)]

    ### Continuous-valued columns: NaN -> column average
    continuous_part = df[mean_fill_cols]
    continuous_part = continuous_part.fillna(continuous_part.mean())

    ### Label and discrete columns: NaN -> 0
    flagged_part = df[zero_fill_cols].fillna(0)

    return pd.concat([continuous_part, flagged_part], axis = 1)

In [120]:
### Features are columns 1..26 of the raw frame; the target is the SITTING label
X = interpolation(data.iloc[:,1:27])
y = interpolation(data[['label:SITTING']])

### Select samples of acceleration where the person is sitting, renumbering the rows from 0
sitting_mask = y['label:SITTING'] == 1
X = X[sitting_mask].reset_index(drop = True)
y = y[sitting_mask].reset_index(drop = True)

### The rest of the notebook works on plain arrays
X = X.values
y = y.values

# Scaling the data into the range [0, 1]

In [122]:
from sklearn.preprocessing import MinMaxScaler
### Fit the scaler on X (the sitting-only samples selected above) and rescale X in the same step
mm = MinMaxScaler()
X = mm.fit_transform(X)
### torch.tensor keeps the NumPy dtype, hence the float64 tensor shown in the cell output
train_features = torch.tensor(X)
train_labels = torch.tensor(y)
train_features ### Bare expression so the notebook displays the scaled features

tensor([[0.0306, 0.1106, 0.1286,  ..., 0.4425, 0.7359, 0.4850],
        [0.0327, 0.1152, 0.3959,  ..., 0.6198, 0.4337, 0.2250],
        [0.0315, 0.0019, 0.2265,  ..., 0.6134, 0.4755, 0.7132],
        ...,
        [0.0328, 0.0467, 0.2553,  ..., 0.7668, 0.6482, 0.5024],
        [0.0330, 0.0075, 0.2258,  ..., 0.4555, 0.7392, 0.7749],
        [0.0326, 0.0044, 0.2323,  ..., 0.5876, 0.4927, 0.4306]],
       dtype=torch.float64)

In [None]:
#train_model_fake(gen, loss_function, n_epochs, z_dim, data, batch_size, input_size, output_size)
#train_model_real(loss_function, n_epochs, data, batch_size)

# Establishing discriminator and generator

In [115]:
def generator_block(input_dim, output_dim):
    """
    Builds one generator stage: Linear -> BatchNorm1d -> ReLU (in-place).

    input_dim: width of the incoming vectors.
    output_dim: width of the vectors produced by the stage.
    """
    stage_layers = [
        nn.Linear(input_dim, output_dim),
        nn.BatchNorm1d(output_dim),
        nn.ReLU(inplace = True),
    ]
    return nn.Sequential(*stage_layers)
def get_noise(n_samples, z_dim):
    """
    Draws n_samples latent vectors of length z_dim from a standard normal distribution.

    Returns a tensor of shape (n_samples, z_dim).
    """
    noise_shape = (n_samples, z_dim)
    return torch.randn(noise_shape)

class Generator(nn.Module):
    """
    Maps latent noise vectors to feature vectors whose entries are squashed into (0, 1) by a sigmoid.

    z_dim: length of each latent vector.
    feature_dim: length of each generated feature vector.
    hidden_dim: base width of the hidden stages.
    """
    def __init__(self, z_dim = 10, feature_dim = 26, hidden_dim = 128):
        super().__init__()
        wide_dim = hidden_dim * 2
        narrow_dim = int(hidden_dim * 0.5)

        ### Widths: z_dim -> 2 * hidden -> hidden -> hidden / 2 -> feature_dim
        stages = [
            generator_block(z_dim, wide_dim),
            generator_block(wide_dim, hidden_dim),
            generator_block(hidden_dim, narrow_dim),
            nn.Linear(narrow_dim, feature_dim),
            nn.Sigmoid(),
        ]
        self.gen = nn.Sequential(*stages)

    def forward(self, noise):
        """Returns the generated features for a batch of latent vectors."""
        generated = self.gen(noise)
        return generated

def discriminator_block(input_dim, output_dim):
    """
    Builds one discriminator stage: Linear -> LeakyReLU(0.2) -> Dropout (default rate).

    input_dim: width of the incoming vectors.
    output_dim: width of the vectors produced by the stage.
    """
    stage_layers = [
        nn.Linear(input_dim, output_dim),
        nn.LeakyReLU(0.2),
        nn.Dropout(),
    ]
    return nn.Sequential(*stage_layers)

class Discriminator(nn.Module):
    """
    Scores feature vectors with a single raw (un-squashed) output per sample.

    feature_dim: length of each input feature vector.
    hidden_dim: base width of the hidden stages.
    """
    def __init__(self, feature_dim = 26, hidden_dim = 128):
        super().__init__()
        narrow_dim = int(hidden_dim * 0.5)

        ### Widths: feature_dim -> hidden -> hidden / 2 -> hidden -> 1
        stages = [
            discriminator_block(feature_dim, hidden_dim),
            discriminator_block(hidden_dim, narrow_dim),
            discriminator_block(narrow_dim, hidden_dim),
            nn.Linear(hidden_dim, 1),
        ]
        self.disc = nn.Sequential(*stages)

    def forward(self, feature_vector):
        """Returns one score per row of feature_vector."""
        scores = self.disc(feature_vector)
        return scores

def get_disc_loss(gen, disc, criterion, real_features, batch_size, z_dim):
    """
    Computes the discriminator loss for one batch.

    gen: generator used to produce the fake batch (it receives no gradient from this loss).
    disc: discriminator being trained.
    criterion: loss applied to the discriminator outputs and the 0/1 targets.
    real_features: batch of real feature vectors.
    batch_size: number of fake samples to generate.
    z_dim: length of each latent vector.

    Returns the average of the loss on the fake batch (target 0) and on the real batch (target 1).
    """
    ### The generator is not updated by this loss, so generate the fakes without
    ### building an autograd graph instead of building one and detaching it afterwards
    with torch.no_grad():
        fake_features = gen(get_noise(batch_size, z_dim))

    ### Fake samples should be scored as 0
    pred_fake = disc(fake_features)
    loss_fake = criterion(pred_fake, torch.zeros_like(pred_fake))

    ### Real samples should be scored as 1
    pred_real = disc(real_features)
    loss_real = criterion(pred_real, torch.ones_like(pred_real))

    return (loss_fake + loss_real) / 2

def get_gen_loss(gen, disc, criterion, batch_size, z_dim):
    """
    Computes the generator loss for one batch.

    A fresh batch of fakes is generated and scored by the discriminator; the loss is
    the criterion between those scores and an all-ones target, i.e. the generator is
    rewarded when its samples are scored as real.
    """
    fake_scores = disc(gen(get_noise(batch_size, z_dim)))
    real_targets = torch.ones_like(fake_scores)
    return criterion(fake_scores, real_targets)

# Fake data quality test methods

In [116]:
class Classifier(nn.Module):
    """
    Fully connected classifier with two hidden layers of 128 units and a sigmoid output.

    input_size: number of input features.
    output_size: number of outputs (each one squashed into (0, 1)).
    """
    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        self.classifier = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_size),
        )

    def forward(self, x):
        """Returns the sigmoid of the network output for a batch x."""
        logits = self.classifier(x)
        return torch.sigmoid(logits)

def get_fake_samples(gen, batch_size, z_dim):
    """
    Generates fake features with a generator.

    Draws batch_size latent vectors of length z_dim and passes them through gen.

    Returns the generator output for that batch: one row of generated features per latent vector.
    """
    return gen(get_noise(batch_size, z_dim))
    

def create_fake_dataset(gen, z_dim, data):
    '''
    Creates a training/test set with 50% fake sitting features and 50% real non-sitting features.

    gen: generator used to produce the fake sitting rows (label 1).
    z_dim: length of the latent vectors fed to gen.
    data: raw DataFrame; columns 1..26 are used as features and 'label:SITTING' as the label.

    Returns X_train, y_train, X_test, y_test as tensors (80/20 split of the mixed dataset).
    '''
    ### Retrieve the real samples where the user wasn't sitting
    features = interpolation(data.iloc[:,1:27])
    sitting = interpolation(data[['label:SITTING']])
    X = features[sitting['label:SITTING'] == 0].values

    ### NOTE(review): these real rows are used on their raw scale. In this notebook the generator
    ### ends in a Sigmoid and is trained on MinMax-scaled features, so if that is the `gen` passed
    ### in, the two classes may be separable by scale alone - confirm whether the real rows should
    ### go through the same scaler.

    ### Use a random 80% of the real non-sitting samples
    X_real, _ = train_test_split(X, test_size = 0.2)
    X_real = torch.tensor(X_real)

    ### Create an equal number of fake samples. This is pure sampling, so no autograd graph
    ### is needed for this (potentially large) batch.
    with torch.no_grad():
        fake_sitting = get_fake_samples(gen, len(X_real), z_dim)

    dataset = torch.cat((fake_sitting, X_real), dim = 0)

    one_labels = torch.ones(len(fake_sitting), 1) ### 1s correspond to sitting
    zero_labels = torch.zeros(len(X_real), 1) ### 0s correspond to not sitting
    labels = torch.cat((one_labels, zero_labels), dim = 0)

    ### Splitting into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(dataset.numpy(), labels.numpy(), test_size = 0.2)

    ### Converting to tensors
    X_train = torch.tensor(X_train)
    X_test = torch.tensor(X_test)
    y_train = torch.tensor(y_train)
    y_test = torch.tensor(y_test)

    return X_train, y_train, X_test, y_test

def create_real_dataset(data):
    """
    Builds an 80/20 train/test split of the real features and SITTING labels.

    data: raw DataFrame; columns 1..26 are used as features and 'label:SITTING' as the label.

    Returns X_train, y_train, X_test, y_test as tensors.
    """
    ### Class balance is whatever the data contains (no guaranteed 50-50 split)
    features = interpolation(data.iloc[:,1:27]).values
    targets = interpolation(data[['label:SITTING']]).values

    split_arrays = train_test_split(features, targets, test_size = 0.2)

    ### train_test_split yields (X_train, X_test, y_train, y_test); convert each to a tensor
    X_train, X_test, y_train, y_test = (torch.tensor(part) for part in split_arrays)

    return X_train, y_train, X_test, y_test
    
def create_dataloader(X_train, y_train, X_test, y_test, batch_size):
    """
    Wraps the train and test tensors in shuffled DataLoaders.

    Returns (train_loader, test_loader), both batching by batch_size.
    """
    loaders = []
    for features, targets in ((X_train, y_train), (X_test, y_test)):
        paired = torch.utils.data.TensorDataset(features, targets)
        loaders.append(torch.utils.data.DataLoader(paired, batch_size = batch_size, shuffle = True))

    train_loader, test_loader = loaders
    return train_loader, test_loader

def train_model_fake(gen, criterion, n_epochs, z_dim, data, batch_size, input_size, output_size):
    """
    Trains a fresh Classifier on the dataset built by create_fake_dataset (generated sitting rows
    plus real non-sitting rows) and prints its metrics on the held-out part of that same dataset.

    gen, z_dim, data: forwarded to create_fake_dataset.
    criterion: loss between the classifier outputs and the labels.
    n_epochs: number of passes over the training loader.
    batch_size: batch size of both loaders.
    input_size, output_size: classifier input and output widths.
    """
    ### Mixed fake/real dataset and its loaders
    X_train, y_train, X_test, y_test = create_fake_dataset(gen, z_dim, data)
    loaders = create_dataloader(X_train, y_train, X_test, y_test, batch_size)
    train_loader, test_loader = loaders

    ### Model and optimizer
    model = Classifier(input_size, output_size)
    optimizer = optim.Adam(model.parameters(), lr = 0.0001)

    for epoch in range(n_epochs):
        total_loss = 0
        for features, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(features.float()), labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch: {epoch + 1} | Total Batch Loss: {total_loss}')

    ### Evaluate the model's performance
    evaluate_model(X_test, y_test, test_loader, model)

def train_model_real(criterion, n_epochs, data, batch_size, input_size = None, output_size = None):
    """
    Trains and evaluates a classifier on only real training and testing examples.

    criterion: loss between the classifier outputs and the labels.
    n_epochs: number of passes over the training loader.
    data: raw DataFrame forwarded to create_real_dataset.
    batch_size: batch size of both loaders.
    input_size: classifier input width; defaults to the number of feature columns in the data.
    output_size: classifier output width; defaults to the number of label columns in the data.
    """
    X_train, y_train, X_test, y_test = create_real_dataset(data)
    train_loader, test_loader = create_dataloader(X_train, y_train, X_test, y_test, batch_size)

    ### Take the layer sizes from the data instead of relying on notebook globals that
    ### happen to be called input_size / output_size
    if input_size is None:
        input_size = X_train.shape[1]
    if output_size is None:
        output_size = y_train.shape[1]

    model = Classifier(input_size, output_size)
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    for epoch in range(n_epochs):
        total_loss = 0
        for batch in train_loader:
            features, labels = batch
            y_preds = model(features.float())
            loss = criterion(y_preds, labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch: {epoch + 1} | Total Batch Loss: {total_loss}')

    ### Evaluate the model's performance
    evaluate_model(X_test, y_test, test_loader, model)

def evaluate_model(X_test, y_test, test_loader, model):
    """
    Prints the classification accuracy, precision, recall, and F-1 score of a model.

    X_test: test features; only its length is used (denominator of the accuracy).
    y_test: unused, kept for call compatibility (the labels are read from test_loader).
    test_loader: iterable of (features, labels) batches.
    model: callable returning one probability per sample; predictions are its rounded outputs.

    A ratio whose denominator is zero (e.g. precision when nothing was predicted positive)
    is reported as 0 instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        ### Undefined ratios are reported as 0
        return numerator / denominator if denominator else 0.0

    total_wrong = 0
    positive_preds = 0
    true_positives = 0
    false_negatives = 0

    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            ### Flatten both sides so predictions and labels line up element by element
            preds = torch.round(model(batch_features.float())).reshape(-1)
            targets = batch_labels.reshape(-1)

            predicted_positive = preds == 1
            actual_positive = targets == 1

            positive_preds += predicted_positive.sum().item()
            true_positives += (predicted_positive & actual_positive).sum().item()
            false_negatives += ((preds == 0) & actual_positive).sum().item()
            total_wrong += (preds - targets).abs().sum().item()

    n_samples = len(X_test)
    class_acc = ratio(n_samples - total_wrong, n_samples) * 100
    precision = ratio(true_positives, positive_preds)
    recall = ratio(true_positives, true_positives + false_negatives)
    f1_score = 2 * ratio(precision * recall, precision + recall)

    print(f'Classification Accuracy: {class_acc:.2f}')
    print(f'Precision: {precision:.2f}') #What percentage of a model's positive predictions were actually positive
    print(f'Recall: {recall:.2f}') #What percent of the true positives were identified
    print(f'F-1 Score: {f1_score:.2f}')

# Hyperparameters

In [106]:
### Hyperparameters come first so that everything below can use them on a fresh kernel
### (the DataLoader needs batch_size)
batch_size = 60
n_epochs = 20
z_dim = 100
lr = 0.000001

### For Classifier
input_size = 26
output_size = 1

criterion = nn.BCEWithLogitsLoss() ### For GAN training (the discriminator outputs raw scores)
loss_function = nn.BCELoss() ### For classifier training (the Classifier already applies a sigmoid)

train_data = torch.utils.data.TensorDataset(train_features, train_labels)
train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size, shuffle = True) ### 66 batches

### Instantiating discriminator/generator objects
disc = Discriminator()
gen = Generator(z_dim)

### Optimizers for the GAN
opt_disc = optim.Adam(disc.parameters(), lr = lr)
opt_gen = optim.Adam(gen.parameters(), lr = lr)

# GAN training loop

In [107]:
for epoch in range(n_epochs):
    for batch_idx, (real_features, _) in enumerate(train_loader):
        ### Size of this particular batch (the last one can be smaller). It gets its own name
        ### so the batch_size hyperparameter is not overwritten and the classifiers trained
        ### below still receive the configured value.
        current_batch_size = len(real_features)
        real_features = real_features.float()

        ### Training Discriminator (5 updates per generator update)
        for k in range(5):
            opt_disc.zero_grad()
            disc_loss = get_disc_loss(gen, disc, criterion, real_features, current_batch_size, z_dim)
            ### Each call builds a new graph, so there is nothing to retain between steps
            disc_loss.backward()
            opt_disc.step()

        ### Training Generator
        opt_gen.zero_grad()
        gen_loss = get_gen_loss(gen, disc, criterion, current_batch_size, z_dim)
        gen_loss.backward()
        opt_gen.step()

        ### Report the losses once per epoch, on the first batch
        if batch_idx == 0:
            print(
                f'Epoch [{epoch + 1} / {n_epochs}] Loss D: {disc_loss.item():.4f}, Loss G: {gen_loss.item():.4f} '
            )

    ### Every 10 epochs, compare a classifier trained with generated data against one trained on real data
    if (epoch + 1) % 10 == 0:
        train_model_fake(gen, loss_function, n_epochs, z_dim, data, batch_size, input_size, output_size)
        train_model_real(loss_function, n_epochs, data, batch_size)

Epoch [1 / 20] Loss D: 0.6921, Loss G: 0.6810 
Epoch [2 / 20] Loss D: 0.6928, Loss G: 0.6811 
Epoch [3 / 20] Loss D: 0.6900, Loss G: 0.6976 
Epoch [4 / 20] Loss D: 0.6948, Loss G: 0.6827 
Epoch [5 / 20] Loss D: 0.6899, Loss G: 0.6896 
Epoch [6 / 20] Loss D: 0.6921, Loss G: 0.6880 
Epoch [7 / 20] Loss D: 0.6872, Loss G: 0.6933 
Epoch [8 / 20] Loss D: 0.6894, Loss G: 0.6890 
Epoch [9 / 20] Loss D: 0.6850, Loss G: 0.6981 
Epoch [10 / 20] Loss D: 0.6835, Loss G: 0.6899 
Epoch: 1 | Total Batch Loss: 36.82591310143471
Epoch: 2 | Total Batch Loss: 22.252367809414864
Epoch: 3 | Total Batch Loss: 11.742404691874981
Epoch: 4 | Total Batch Loss: 5.683568447828293
Epoch: 5 | Total Batch Loss: 2.8281019795686007
Epoch: 6 | Total Batch Loss: 1.5518566872924566
Epoch: 7 | Total Batch Loss: 0.9347840175032616
Epoch: 8 | Total Batch Loss: 0.5986886224709451
Epoch: 9 | Total Batch Loss: 0.40491782885510474
Epoch: 10 | Total Batch Loss: 0.29497301089577377
Epoch: 11 | Total Batch Loss: 0.2215801632264629