In [2]:
%matplotlib inline

import sys

sys.path.append("..")

# Discriminator

I will first try the discriminator architecture on a generated, easy to classify data set. Two classes are generated, with 100-dimensional feature vectors (x). Feature vectors for the first class are randomnly generated using a uniform distribuition, while feature vectors for the second class are generated by adding $1.0$ to a randomly generated number from a uniform distribuition.

I will first see the impact of using one-hot encodings as labels.

In [25]:
# Decide which device we want to run on

ngpu = 1

device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

### Binary classes

For the binary scenario, we will use the Binary Cross-Entropy loss function. Output is a single float. If closer to 0, the predicted class will be class 0.

In [4]:
# Load sample vectors directly from pickle

import os, pickle

from settings import ROOT_DIR

data = pickle.load(open(os.path.join(ROOT_DIR, 'static', 'pickles', 'discriminators', 'sample-vectors.pickle'), 'rb'))
data.head()

Unnamed: 0,class,vector
0,1,"[1.982117870852664, 1.5483193055178424, 1.8743..."
1,1,"[1.418715409949313, 1.7365551148618708, 1.8584..."
2,0,"[0.3888934328828856, 0.4883907022483611, 0.519..."
3,0,"[0.5702234190321228, 0.411709502422555, 0.1705..."
4,1,"[1.1901701568991827, 1.3838762314392503, 1.304..."


In [5]:
# train/test split

test_index = int(len(data) * 0.8)

train = data[:test_index]
test = data[test_index:]

print('length: %s class 0: %s %% class 1: %s %%' % (len(train), sum(train['class']) / len(train) * 100, (len(train) - sum(train['class'])) / len(train) * 100))
print('length: %s class 0: %s %% class 1: %s %%' % (len(test), sum(test['class']) / len(test) * 100, (len(test) - sum(test['class'])) / len(test) * 100))

length: 80000 class 0: 50.286249999999995 % class 1: 49.713750000000005 %
length: 20000 class 0: 50.78 % class 1: 49.220000000000006 %


In [27]:
# Create our data tensors

import torch

import numpy as np

x_train = torch.tensor(np.vstack(train['vector']), dtype=torch.float, device=device)
y_train = torch.tensor(np.vstack(train['class']), dtype=torch.float, device=device)
x_test = torch.tensor(np.vstack(test['vector']), dtype=torch.float, device=device)
y_test = torch.tensor(np.vstack(test['class']), dtype=torch.float, device=device)

In [21]:
# Model definition

input_dimension = len(data['vector'][0])

class Discriminator(torch.nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = torch.nn.Sequential(
            # state size. input_dimension
            torch.nn.Linear(input_dimension, 75),
            # state size. 75
            torch.nn.ReLU(),
            torch.nn.Linear(75, 25),
            # state size. 25
            torch.nn.ReLU(),
            torch.nn.Linear(25, 1),
            torch.nn.Sigmoid() # 0 < output_value < 1
        )

    def forward(self, input):
        return self.main(input)

In [22]:
loss_fn = torch.nn.BCELoss()

learning_rate = 1e-1

net = Discriminator(ngpu=ngpu)
net.to(device)

print(net)

Discriminator(
  (main): Sequential(
    (0): Linear(in_features=100, out_features=75, bias=True)
    (1): ReLU()
    (2): Linear(in_features=75, out_features=25, bias=True)
    (3): ReLU()
    (4): Linear(in_features=25, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


In [23]:
# simple accuracy metric: (TP + TN) / (TP + TN + FP + FN)

def accuracy():
    y_pred = net(x_test)
    predicted_classes = y_pred > 0.5
    return (predicted_classes.int() == y_test.int()).sum().float() / float(len(predicted_classes))

In [28]:
# training

epoch_size = 10
epochs = 10

max_accuracy = 0

for epoch in range(epochs):
    for t in range(epoch_size):
        y_pred = net(x_train)
        loss = loss_fn(y_pred, y_train)
        net.zero_grad()
        loss.backward()
        with torch.no_grad():
            for param in net.parameters():
                param.data -= learning_rate * param.grad
    current_accuracy = accuracy()
    if current_accuracy < max_accuracy:
        break
    else:
        print('epoch: %s t: %s loss: %s accuracy: %s' % (epoch, epoch * epoch_size, loss.item(), current_accuracy))
        max_accuracy = current_accuracy

epoch: 0 t: 0 loss: 0.6231658458709717 accuracy: tensor(0.5078, device='cuda:0')
epoch: 1 t: 10 loss: 0.5571004748344421 accuracy: tensor(0.5078, device='cuda:0')
epoch: 2 t: 20 loss: 0.49954453110694885 accuracy: tensor(0.5119, device='cuda:0')
epoch: 3 t: 30 loss: 0.4389975070953369 accuracy: tensor(0.8087, device='cuda:0')
epoch: 4 t: 40 loss: 0.3772467374801636 accuracy: tensor(0.9963, device='cuda:0')
epoch: 5 t: 50 loss: 0.31594687700271606 accuracy: tensor(1., device='cuda:0')
epoch: 6 t: 60 loss: 0.2574044466018677 accuracy: tensor(1., device='cuda:0')
epoch: 7 t: 70 loss: 0.20526215434074402 accuracy: tensor(1., device='cuda:0')
epoch: 8 t: 80 loss: 0.1615338921546936 accuracy: tensor(1., device='cuda:0')
epoch: 9 t: 90 loss: 0.1266518235206604 accuracy: tensor(1., device='cuda:0')


### One-hot encoded

For this scenario, we will use the Cross-Entropy loss function. Output is a float tuple with the probability for each class.

In [29]:
# Load sample vectors directly from pickle

import os, pickle

from settings import ROOT_DIR

data_onehot = pickle.load(open(os.path.join(ROOT_DIR, 'static', 'pickles', 'discriminators', 'sample-vectors-onehot.pickle'), 'rb'))
data_onehot.head()

Unnamed: 0,class,vector
0,"[0, 1]","[1.982117870852664, 1.5483193055178424, 1.8743..."
1,"[0, 1]","[1.418715409949313, 1.7365551148618708, 1.8584..."
2,"[1, 0]","[0.3888934328828856, 0.4883907022483611, 0.519..."
3,"[1, 0]","[0.5702234190321228, 0.411709502422555, 0.1705..."
4,"[0, 1]","[1.1901701568991827, 1.3838762314392503, 1.304..."


In [30]:
# train/test split

test_index_onehot = int(len(data_onehot) * 0.8)

train_onehot = data_onehot[:test_index_onehot]
test_onehot = data_onehot[test_index_onehot:]

print('length: %s class 0: %s %% class 1: %s %%' % (len(train_onehot), sum(train_onehot['class'].apply(lambda x: x[0])) / len(train_onehot) * 100, (len(train_onehot) - sum(train_onehot['class'].apply(lambda x: x[0]))) / len(train_onehot) * 100))
print('length: %s class 0: %s %% class 1: %s %%' % (len(test_onehot), sum(test_onehot['class'].apply(lambda x: x[0])) / len(test_onehot) * 100, (len(test_onehot) - sum(test_onehot['class'].apply(lambda x: x[0]))) / len(test_onehot) * 100))

length: 80000 class 0: 49.713750000000005 % class 1: 50.286249999999995 %
length: 20000 class 0: 49.220000000000006 % class 1: 50.78 %


In [32]:
# Create our data tensors

import numpy as np

x_train_onehot = torch.tensor(np.vstack(train_onehot['vector']), dtype=torch.float, device=device)
y_train_onehot = torch.tensor(np.vstack(train_onehot['class']), dtype=torch.float, device=device)
x_test_onehot = torch.tensor(np.vstack(test_onehot['vector']), dtype=torch.float, device=device)
y_test_onehot = torch.tensor(np.vstack(test_onehot['class']), dtype=torch.float, device=device)

In [36]:
class DiscriminatorOnehot(torch.nn.Module):
    def __init__(self, ngpu):
        super(DiscriminatorOnehot, self).__init__()
        self.ngpu = ngpu
        self.main = torch.nn.Sequential(
            # state size. input_dimension
            torch.nn.Linear(input_dimension, 75),
            # state size. 75
            torch.nn.ReLU(),
            torch.nn.Linear(75, 25),
            # state size. 25
            torch.nn.ReLU(),
            torch.nn.Linear(25, 2), # binary one-hot encoding vector
            torch.nn.Softmax(dim=1) # values must sum 1 on dimension 1, that is for the two dimensional outputs for each example
        )

    def forward(self, input):
        return self.main(input)

In [40]:
loss_fn = torch.nn.CrossEntropyLoss()

learning_rate = 1e-1

net = DiscriminatorOnehot(ngpu=0)
net.to(device)

print(net)

DiscriminatorOnehot(
  (main): Sequential(
    (0): Linear(in_features=100, out_features=75, bias=True)
    (1): ReLU()
    (2): Linear(in_features=75, out_features=25, bias=True)
    (3): ReLU()
    (4): Linear(in_features=25, out_features=2, bias=True)
    (5): Softmax()
  )
)


In [41]:
# torch.nn.CrossEntropyLoss takes the output vectors (a float for each class indicating the probability of belonging to that class - only one class per example) and a single target int 
# that represents the class label. Thus, we take the argmax of the one-hot encoded vectors to get a class id.

def accuracy_onehot():
    y_pred = net(x_test_onehot)
    predicted_classes = y_pred.argmax(1)
    return (predicted_classes == y_test_onehot.argmax(1)).sum().float() / float(len(predicted_classes))

In [42]:
epoch_size = 10
epochs = 10

max_accuracy = 0

for epoch in range(epochs):
    for t in range(epoch_size):
        y_pred = net(x_train_onehot)
        loss = loss_fn(y_pred, y_train_onehot.argmax(1))
        net.zero_grad()
        loss.backward()
        with torch.no_grad():
            for param in net.parameters():
                param.data -= learning_rate * param.grad
    current_accuracy = accuracy_onehot()
    if current_accuracy < max_accuracy:
        break
    else:
        print('epoch: %s t: %s loss: %s accuracy: %s' % (epoch, epoch * epoch_size, loss.item(), current_accuracy))
        max_accuracy = current_accuracy

epoch: 0 t: 0 loss: 0.614982545375824 accuracy: tensor(0.5078, device='cuda:0')
epoch: 1 t: 10 loss: 0.5875807404518127 accuracy: tensor(0.5078, device='cuda:0')
epoch: 2 t: 20 loss: 0.5639144778251648 accuracy: tensor(0.5616, device='cuda:0')
epoch: 3 t: 30 loss: 0.539263904094696 accuracy: tensor(0.9245, device='cuda:0')
epoch: 4 t: 40 loss: 0.5145022869110107 accuracy: tensor(0.9982, device='cuda:0')
epoch: 5 t: 50 loss: 0.4899929165840149 accuracy: tensor(1., device='cuda:0')
epoch: 6 t: 60 loss: 0.4663316309452057 accuracy: tensor(1., device='cuda:0')
epoch: 7 t: 70 loss: 0.4442014992237091 accuracy: tensor(1., device='cuda:0')
epoch: 8 t: 80 loss: 0.4242170751094818 accuracy: tensor(1., device='cuda:0')
epoch: 9 t: 90 loss: 0.4069361686706543 accuracy: tensor(1., device='cuda:0')


The one-hot encoding approach seems working better for this test task, but we should check with the complete architecture.

# Dataset class and package model definition

Let's create a tidy workflow for this classifier.

In [56]:
# Load the Dataset class

import os

from src.datasets.sample_vectors import SampleVectorDataset

pickle_dir = os.path.join(ROOT_DIR, 'static', 'pickles', 'discriminators', 'sample-vectors.pickle')

dataset = SampleVectorDataset(pickle_dir)

In [58]:
dataset[2][0][:10]

[0.3888934328828856,
 0.4883907022483611,
 0.5196878749250333,
 0.383532709936366,
 0.19934792760457365,
 0.4443464931636203,
 0.56018541815734,
 0.9246406702715995,
 0.08085428787325377,
 0.921788140686779]