In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.over_sampling import SMOTE

In [2]:
def start_data(label, users):
    #get the dataframe column names
    name_dataframe = pd.read_csv('../data/features.txt', delimiter = '\n', header = None)
    names = name_dataframe.values.tolist()
    names = [k for row in names for k in row] #List of column names

    data = pd.read_csv('../data/X_train.txt', delim_whitespace = True, header = None) #Read in dataframe
    data.columns = names #Setting column names

    #X_train = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z'] #Selecting only acceleration columns
    
    X_train_1 = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z']
    X_train_2 = data.loc[:,'81 tBodyAccJerk-mean()-X':'120 tBodyAccJerk-correlation()-Y,Z']
    X_train = pd.concat([X_train_1, X_train_2], axis = 1)

    y_train_activity = pd.read_csv('../data/y_train.txt', header = None)
    y_train_activity.columns = ['Activity']

    y_train_subject = pd.read_csv('../data/subject_train.txt', header = None)
    y_train_subject.columns = ['Subject']

    GAN_data = pd.concat([X_train, y_train_activity, y_train_subject], axis = 1)
    GAN_data = GAN_data[GAN_data['Activity'].isin(label)]
    GAN_data = GAN_data[GAN_data['Subject'].isin(users)]
    
    X = GAN_data.loc[:,'1 tBodyAcc-mean()-X':'120 tBodyAccJerk-correlation()-Y,Z'].values
    y = GAN_data[['Activity']].values
    
    return X, y

In [3]:
#defines each generator layer
#input and output dimensions needed
def generator_block(input_dim, output_dim):
    return nn.Sequential(
        nn.Linear(input_dim, output_dim),
        nn.Dropout(0.1),
        nn.BatchNorm1d(output_dim),
        nn.ReLU(inplace = True)
    )

#returns n_samples of z_dim (number of dimensions of latent space) noise
def get_noise(n_samples, z_dim):
    return torch.randn(n_samples, z_dim)

#defines generator class
class Generator(nn.Module):
    def __init__(self, z_dim = 10, feature_dim = 80, hidden_dim = 128):
        super(Generator, self).__init__()
        self.gen = nn.Sequential(
            generator_block(z_dim, 95),
            generator_block(95, 90),
            generator_block(90, 85),
            nn.Linear(85, feature_dim)
        )
    def forward(self, noise):
        return self.gen(noise)
    
def get_act_matrix(batch_size, a_dim):
    indexes = np.random.randint(a_dim, size = batch_size)
    
    one_hot = np.zeros((len(indexes), indexes.max()+1))
    one_hot[np.arange(len(indexes)),indexes] = 1
    return torch.Tensor(indexes).long(), torch.Tensor(one_hot)
    
def get_usr_matrix(batch_size, u_dim):
    indexes = np.random.randint(u_dim, size = batch_size)
    
    one_hot = np.zeros((indexes.size, indexes.max()+1))
    one_hot[np.arange(indexes.size),indexes] = 1
    return torch.Tensor(indexes).long(), torch.Tensor(one_hot)

def load_model(model, model_name):
    model.load_state_dict(torch.load(f'../saved_models/{model_name}'))

In [4]:
activities = [1, 3, 4]
users = [1, 3, 5]
X, y = start_data(activities, users)

gen = Generator(z_dim = 106)
load_model(gen, "cGAN_UCI_gen.param")

# Train on real, test on real

In [5]:
#Transforming activity labels (1, 3, 4 --> 0, 1, 2)
for k in range(len(y)): 
    if y[k] == 1:
        y[k] = 0
    elif y[k] == 3:
        y[k] = 1
    else:
        y[k] = 2

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size = 0.2, shuffle = True)

classifier_real = LogisticRegression(penalty = 'l2', C = 0.7)
classifier_real.fit(X_train, y_train)

y_pred = classifier_real.predict(X_test)
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        43
           1      1.000     1.000     1.000        30
           2      1.000     1.000     1.000        27

    accuracy                          1.000       100
   macro avg      1.000     1.000     1.000       100
weighted avg      1.000     1.000     1.000       100



# Train on fake, test on real

In [7]:
latent_vectors = get_noise(len(X_train), 100)
act_vectors = get_act_matrix(len(X_train), 3)
usr_vectors = get_usr_matrix(len(X_train), 3)

to_gen = torch.cat((latent_vectors, act_vectors[1], usr_vectors[1]), 1)
fake_features = gen(to_gen).detach().numpy()

classifier_fake = LogisticRegression(penalty = 'l2', C = 0.7)
classifier_fake.fit(fake_features, act_vectors[0])

y_pred = classifier_fake.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

[[38  5  0]
 [ 1 29  0]
 [ 0  0 27]]
              precision    recall  f1-score   support

           0      0.974     0.884     0.927        43
           1      0.853     0.967     0.906        30
           2      1.000     1.000     1.000        27

    accuracy                          0.940       100
   macro avg      0.942     0.950     0.944       100
weighted avg      0.945     0.940     0.940       100



# Train on real + fake, test on real

In [106]:
X, y = start_data(activities, users)

In [107]:
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X, y, test_size = 0.2, shuffle = True)

y_real = np.ones(len(X_train_real))
y_fake = np.zeros(len(fake_features))

In [108]:
X_train_combo = np.concatenate((fake_features, X_train_real), axis = 0)
y_train_combo = np.concatenate((y_fake, y_real), axis = 0)

In [109]:
def shuffle(X, y):
    data = np.concatenate((X, y.reshape(-1,1)), axis = 1)
    np.random.shuffle(data)
    return data[:,:-1], data[:,-1]

In [110]:
X_train_combo, y_train_combo = shuffle(X_train_combo, y_train_combo)

In [113]:
classifier = LogisticRegression(penalty = 'l2', C = 0.7)
classifier.fit(X_train_combo, y_train_combo)
y_pred = classifier.predict(X_test_real)

y_test_real = np.ones(len(y_test_real))

print(metrics.confusion_matrix(y_test_real, y_pred))
print(metrics.classification_report(y_test_real, y_pred, digits = 3))     

[[100]]
              precision    recall  f1-score   support

         1.0      1.000     1.000     1.000       100

    accuracy                          1.000       100
   macro avg      1.000     1.000     1.000       100
weighted avg      1.000     1.000     1.000       100

