In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
sub_features = ['58 tGravityAcc-energy()-Y', '59 tGravityAcc-energy()-Z', '104 tBodyAccJerk-entropy()-Y', '125 tBodyGyro-std()-Y',
 '128 tBodyGyro-mad()-Y', '132 tBodyGyro-max()-Z', '134 tBodyGyro-min()-Y','138 tBodyGyro-energy()-Y', '141 tBodyGyro-iqr()-Y',
 '167 tBodyGyroJerk-mad()-X','168 tBodyGyroJerk-mad()-Y','177 tBodyGyroJerk-energy()-X', '181 tBodyGyroJerk-iqr()-Y',
 '475 fBodyGyro-bandsEnergy()-1,8', '484 fBodyGyro-bandsEnergy()-17,32','487 fBodyGyro-bandsEnergy()-1,24']

act_features = ['4 tBodyAcc-std()-X', '7 tBodyAcc-mad()-X', '10 tBodyAcc-max()-X', '17 tBodyAcc-energy()-X', '202 tBodyAccMag-std()',
 '204 tBodyAccMag-max()', '215 tGravityAccMag-std()', '217 tGravityAccMag-max()', '269 fBodyAcc-std()-X', '275 fBodyAcc-max()-X',
 '282 fBodyAcc-energy()-X', '286 fBodyAcc-iqr()-Y', '303 fBodyAcc-bandsEnergy()-1,8', '315 fBodyAcc-bandsEnergy()-1,24',
 '368 fBodyAccJerk-entropy()-Y', '390 fBodyAccJerk-bandsEnergy()-1,16']

In [3]:
#label is a list of integers specifying which labels to filter by
#users is a list of integers specifying which users to filter by
#y_label is a string, either "Activity" or "Subject" depending on what y output needs to be returned
def start_data(label, users, y_label, sub_features, act_features):
    #get the dataframe column names
    name_dataframe = pd.read_csv('../data/features.txt', delimiter = '\n', header = None)
    names = name_dataframe.values.tolist()
    names = [k for row in names for k in row] #List of column names

    data = pd.read_csv('../data/X_train.txt', delim_whitespace = True, header = None) #Read in dataframe
    data.columns = names #Setting column names

    #X_train = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z'] #Selecting only acceleration columns
    
    #X_train_1 = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z']
    #X_train_2 = data.loc[:,'81 tBodyAccJerk-mean()-X':'160 tBodyGyro-correlation()-Y,Z']
    X_train_1 = data[sub_features]
    X_train_2 = data[act_features]
    X_train = pd.concat([X_train_1, X_train_2], axis = 1)
    
    y_train_activity = pd.read_csv('../data/y_train.txt', header = None)
    y_train_activity.columns = ['Activity']
    
    y_train_subject = pd.read_csv('../data/subject_train.txt', header = None)
    y_train_subject.columns = ['Subject']

    GAN_data = pd.concat([X_train, y_train_activity, y_train_subject], axis = 1)
    GAN_data = GAN_data[GAN_data['Activity'].isin(label)]
    GAN_data = GAN_data[GAN_data['Subject'].isin(users)]
    
    X = GAN_data.iloc[:,:-2].values
    #X = GAN_data.loc[:,'1 tBodyAcc-mean()-X':'160 tBodyGyro-correlation()-Y,Z'].values
    y = GAN_data[[y_label]].values
    
    return X, y

In [4]:
#defines each generator layer
#input and output dimensions needed
def generator_block(input_dim, output_dim):
    return nn.Sequential(
        nn.Linear(input_dim, output_dim),
        nn.Dropout(0.1),
        nn.BatchNorm1d(output_dim),
        nn.ReLU(inplace = True)
    )

#returns n_samples of z_dim (number of dimensions of latent space) noise
def get_noise(n_samples, z_dim):
    return torch.randn(n_samples, z_dim)

#defines generator class
class Generator(nn.Module):
    def __init__(self, z_dim = 10, feature_dim = 32, hidden_dim = 128):
        super(Generator, self).__init__()
        self.gen = nn.Sequential(
            generator_block(z_dim, 80),
            generator_block(80, 60),
            generator_block(60, 40),
            nn.Linear(40, feature_dim)
        )
    def forward(self, noise):
        return self.gen(noise)

def get_act_matrix(batch_size, a_dim):
    indexes = np.random.randint(a_dim, size = batch_size)
    
    one_hot = np.zeros((len(indexes), indexes.max()+1))
    one_hot[np.arange(len(indexes)),indexes] = 1
    return torch.Tensor(indexes).long(), torch.Tensor(one_hot)
    
def get_usr_matrix(batch_size, u_dim):
    indexes = np.random.randint(u_dim, size = batch_size)
    
    one_hot = np.zeros((indexes.size, indexes.max()+1))
    one_hot[np.arange(indexes.size),indexes] = 1
    return torch.Tensor(indexes).long(), torch.Tensor(one_hot)

def load_model(model, model_name):
    model.load_state_dict(torch.load(f'../saved_models/{model_name}'))

In [5]:
gen = Generator(z_dim = 106)
load_model(gen, "cGAN_UCI_30k_TEST_gen.param")

# Train On Real, Test On Real

In [6]:
activities = [1, 3, 4]
users = [1, 3, 5]
X, y = start_data(activities, users, "Activity", sub_features, act_features)

In [7]:
#Transforming activity labels (1, 3, 4 --> 0, 1, 2)
for k in range(len(y)): 
    if y[k] == 1:
        y[k] = 0
    elif y[k] == 3:
        y[k] = 1
    else:
        y[k] = 2

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size = 0.2, shuffle = True)

classifier_real = LogisticRegression(penalty = 'l2', C = 0.7)
classifier_real.fit(X_train, y_train)

y_pred = classifier_real.predict(X_test)
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

              precision    recall  f1-score   support

           0      1.000     0.977     0.989        44
           1      0.969     1.000     0.984        31
           2      1.000     1.000     1.000        25

    accuracy                          0.990       100
   macro avg      0.990     0.992     0.991       100
weighted avg      0.990     0.990     0.990       100



# Train On Fake, Test On Real

In [9]:
latent_vectors = get_noise(len(X_train), 100)
act_vectors = get_act_matrix(len(X_train), 3)
usr_vectors = get_usr_matrix(len(X_train), 3)

to_gen = torch.cat((latent_vectors, act_vectors[1], usr_vectors[1]), 1)
fake_features = gen(to_gen).detach().numpy()

classifier_fake = LogisticRegression(penalty = 'l2', C = 0.7)
classifier_fake.fit(fake_features, act_vectors[0])

y_pred = classifier_fake.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

[[43  1  0]
 [ 1 30  0]
 [ 0  0 25]]
              precision    recall  f1-score   support

           0      0.977     0.977     0.977        44
           1      0.968     0.968     0.968        31
           2      1.000     1.000     1.000        25

    accuracy                          0.980       100
   macro avg      0.982     0.982     0.982       100
weighted avg      0.980     0.980     0.980       100



# Subject Test

In [10]:
X, y = start_data(activities, users, "Subject", sub_features, act_features)

In [11]:
for k in range(len(y)): 
    if y[k] == 1:
        y[k] = 0
    elif y[k] == 3:
        y[k] = 1
    else:
        y[k] = 2

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size = 0.2, shuffle = True)

classifier_real = LogisticRegression(penalty = 'l2', C = 0.7, max_iter = 300)
classifier_real.fit(X_train, y_train)

y_pred = classifier_real.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

[[41  1  1]
 [ 3 19  9]
 [ 0  3 23]]
              precision    recall  f1-score   support

           0      0.932     0.953     0.943        43
           1      0.826     0.613     0.704        31
           2      0.697     0.885     0.780        26

    accuracy                          0.830       100
   macro avg      0.818     0.817     0.809       100
weighted avg      0.838     0.830     0.826       100



In [56]:
latent_vectors = get_noise(len(X_train), 100)
act_vectors = get_act_matrix(len(X_train), 3)
usr_vectors = get_usr_matrix(len(X_train), 3)

to_gen = torch.cat((latent_vectors, act_vectors[1], usr_vectors[1]), 1)
fake_features = gen(to_gen).detach().numpy()

classifier_fake = LogisticRegression(penalty = 'l2', C = 0.7, max_iter = 300)
classifier_fake.fit(fake_features, usr_vectors[0])

y_pred = classifier_fake.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits = 3)) 

[[32  0  9]
 [ 2 16 10]
 [ 1  1 29]]
              precision    recall  f1-score   support

           0      0.914     0.780     0.842        41
           1      0.941     0.571     0.711        28
           2      0.604     0.935     0.734        31

    accuracy                          0.770       100
   macro avg      0.820     0.762     0.762       100
weighted avg      0.826     0.770     0.772       100



In [55]:
# from sklearn.datasets import make_classification
# from sklearn.ensemble import RandomForestClassifier

In [15]:
# name_dataframe = pd.read_csv('../data/features.txt', delimiter = '\n', header = None)
# names = name_dataframe.values.tolist()
# names = [k for row in names for k in row] #List of column names

# data = pd.read_csv('../data/X_train.txt', delim_whitespace = True, header = None) #Read in dataframe
# data.columns = names #Setting column names

# #X_train = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z'] #Selecting only acceleration columns

# X_train = data

# y_train_activity = pd.read_csv('../data/y_train.txt', header = None)
# y_train_activity.columns = ['Activity']

# y_train_subject = pd.read_csv('../data/subject_train.txt', header = None)
# y_train_subject.columns = ['Subject']

# GAN_data = pd.concat([X_train, y_train_activity, y_train_subject], axis = 1)
# GAN_data = GAN_data[GAN_data['Activity'].isin(activities)]
# GAN_data = GAN_data[GAN_data['Subject'].isin(users)]

In [16]:
# X_train = GAN_data.iloc[:,:-2].values
# y_train = GAN_data.iloc[:,-2].values

In [17]:
# for k in range(len(y_train)):
#     if y_train[k] == 1:
#         y_train[k] = 0
#     elif y_train[k] == 3:
#         y_train[k] = 1
#     else:
#         y_train[k] = 2

In [18]:
# from matplotlib import pyplot

# model = RandomForestClassifier()
# # fit the model
# model.fit(X_train, y_train)
# importance = model.feature_importances_
# # summarize feature importance
# for i,v in enumerate(importance):
#     print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

In [19]:
# sub_features = ['58 tGravityAcc-energy()-Y', '59 tGravityAcc-energy()-Z', '104 tBodyAccJerk-entropy()-Y', '125 tBodyGyro-std()-Y',
#  '128 tBodyGyro-mad()-Y', '132 tBodyGyro-max()-Z', '134 tBodyGyro-min()-Y','138 tBodyGyro-energy()-Y', '141 tBodyGyro-iqr()-Y',
#  '167 tBodyGyroJerk-mad()-X','168 tBodyGyroJerk-mad()-Y','177 tBodyGyroJerk-energy()-X', '181 tBodyGyroJerk-iqr()-Y',
#  '475 fBodyGyro-bandsEnergy()-1,8', '484 fBodyGyro-bandsEnergy()-17,32','487 fBodyGyro-bandsEnergy()-1,24']

# act_features = ['4 tBodyAcc-std()-X', '7 tBodyAcc-mad()-X', '10 tBodyAcc-max()-X', '17 tBodyAcc-energy()-X', '202 tBodyAccMag-std()',
#  '204 tBodyAccMag-max()', '215 tGravityAccMag-std()', '217 tGravityAccMag-max()', '269 fBodyAcc-std()-X', '275 fBodyAcc-max()-X',
#  '282 fBodyAcc-energy()-X', '286 fBodyAcc-iqr()-Y', '303 fBodyAcc-bandsEnergy()-1,8', '315 fBodyAcc-bandsEnergy()-1,24',
#  '368 fBodyAccJerk-entropy()-Y', '390 fBodyAccJerk-bandsEnergy()-1,16']