In [1]:
import sys
sys.path.append('..')

In [2]:
import time
import os
import re

import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from defences.util import get_range, normalize
from pt_dataset import PTDataset

In [3]:
%load_ext autoreload
%autoreload 2

## Load data from CSV file

In [4]:
DATA_INDEX = 3
files = [
    os.path.join('..', 'data', 'banknote_preprocessed.csv'),
    os.path.join('..', 'data', 'htru2_preprocessed.csv'),
    os.path.join('..', 'data', 'segment_preprocessed.csv'),
    os.path.join('..', 'data', 'texture_preprocessed.csv'),
]
file_path = files[DATA_INDEX]
df = pd.read_csv(file_path, sep=',')

print('Data:', df.shape)
df.head()

Data: (5500, 41)


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A32,A33,A34,A35,A36,A37,A38,A39,A40,Class
0,-1.223,-0.798,-0.867,-0.639,-0.545,-0.412,-0.795,-0.629,-0.547,-0.868,...,-0.766,-0.555,-0.714,-0.545,-0.587,-0.871,-0.62,-0.568,-0.607,0
1,-1.41,-1.029,-1.013,-0.895,-0.762,-0.676,-1.043,-0.851,-0.775,-1.037,...,-0.919,-0.77,-0.847,-0.663,-0.723,-1.013,-0.748,-0.698,-0.817,0
2,-1.107,-0.649,-0.629,-0.492,-0.367,-0.298,-0.682,-0.478,-0.395,-0.681,...,-0.692,-0.445,-0.588,-0.371,-0.368,-0.746,-0.457,-0.379,-0.469,0
3,-1.27,-0.855,-0.958,-0.707,-0.619,-0.469,-0.872,-0.705,-0.62,-0.988,...,-0.829,-0.719,-0.774,-0.617,-0.688,-0.937,-0.693,-0.657,-0.779,0
4,-1.331,-0.862,-0.761,-0.689,-0.498,-0.361,-0.857,-0.6,-0.496,-0.779,...,-0.861,-0.571,-0.784,-0.545,-0.562,-0.952,-0.642,-0.578,-0.648,0


In [5]:
y = df['Class'].to_numpy().astype(np.long)
X = df.drop(['Class'], axis=1).to_numpy().astype(np.float32)
n_features = X.shape[1]
n_classes = len(np.unique(y))
print('X:', X.shape)
print('y:', y.shape)
print('features: {}, classes: {}'.format(n_features, n_classes))

X: (5500, 40)
y: (5500,)
features: 40, classes: 11


In [6]:
# Split data
# For Banknote, Yeast, Segment, uses 400 test examples
# For Abalone, Texture, uses 600 test examples
# For htru2, uses 4000 test examples
if re.search(r'(banknote)|(yeast)|(segment)', file_path):
    N_TEST = 400
elif re.search(r'(abalone)|(texture)', file_path):
    N_TEST = 600
else:
    N_TEST = 4000
print('n_test:', N_TEST)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=N_TEST)
print('X_train', X_train.shape)

# Apply scaling
xmin, xmax = get_range(X_train)
X_train = normalize(X_train, xmin, xmax)
X_test = normalize(X_test, xmin, xmax)

n_test: 600
X_train (4900, 40)


## Train a PyTorch Nueral Network classifier

In [7]:
BATCH_SIZE = 128
EPOCHS = 500
if re.search('htru2', file_path):
    EPOCHS = 200

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [9]:
dataset_train = PTDataset(X_train, y_train)
dataset_test = PTDataset(X_test, y_test)

dataloader_train = DataLoader(dataset_train, BATCH_SIZE, shuffle=True, num_workers=0)
dataloader_test = DataLoader(dataset_test, BATCH_SIZE, shuffle=True, num_workers=0)

In [10]:
def train(model, loader, loss, optimizer, ):
    model.train()
    total_loss = 0.
    corrects = 0.
    
    for x, y in loader:
        x = x.to(device)
        y = y.to(device)
        
        batch_size = x.size(0)
        optimizer.zero_grad()
        output = model(x)
        l = loss(output, y)
        l.backward()
        optimizer.step()
        
        # for display
        total_loss += l.item() * batch_size
        preds = output.max(1, keepdim=True)[1]
        corrects += preds.eq(y.view_as(preds)).sum().item()
    
    n = len(loader.dataset)
    total_loss = total_loss / n
    accuracy = corrects / n
    return total_loss, accuracy
    

In [11]:
def validate(model, loader, loss):
    model.eval()
    total_loss = 0.
    corrects = 0.
    
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            batch_size = x.size(0)
            output = model(x)
            l = loss(output, y)
            total_loss += l.item() * batch_size
            preds = output.max(1, keepdim=True)[1]
            corrects += preds.eq(y.view_as(preds)).sum().item()
    
    n = len(loader.dataset)
    total_loss = total_loss / n
    accuracy = corrects / n
    return total_loss, accuracy

In [12]:
model = nn.Sequential(
            nn.Linear(n_features, n_features*4),
            nn.ReLU(),
            nn.Linear(n_features*4, n_features*4),
            nn.ReLU(),
            nn.Linear(n_features*4, n_classes),
            nn.Softmax(dim=1)
        )
model.to(device)

Sequential(
  (0): Linear(in_features=40, out_features=160, bias=True)
  (1): ReLU()
  (2): Linear(in_features=160, out_features=160, bias=True)
  (3): ReLU()
  (4): Linear(in_features=160, out_features=11, bias=True)
  (5): Softmax(dim=1)
)

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Banknote dataset works better on SGD
if re.search('banknote', file_path):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss = torch.nn.CrossEntropyLoss()

In [14]:
since = time.time()
for epoch in range(EPOCHS):
    start = time.time()
    tr_loss, tr_acc = train(model, dataloader_train, loss, optimizer)
    va_loss, va_acc = validate(model, dataloader_test, loss)
    
    time_elapsed = time.time() - start
    if epoch % 10 == 0:
        print(('[{:3d}] {:.0f}m {:.1f}s Train Loss: {:.4f} Accuracy: {:.4f}%, ' +
            'Test Loss: {:.4f} Accuracy: {:.4f}%').format(
                epoch+1, time_elapsed // 60, time_elapsed % 60,
                tr_loss, tr_acc*100.,
                va_loss, va_acc*100.))
    
time_elapsed = time.time() - since
print('Total run time: {:.0f}m {:.1f}s'.format(
    time_elapsed // 60,
    time_elapsed % 60))
    

[  1] 0m 0.1s Train Loss: 2.3966 Accuracy: 14.4082%, Test Loss: 2.3951 Accuracy: 17.8333%
[ 11] 0m 0.1s Train Loss: 2.2243 Accuracy: 36.6327%, Test Loss: 2.2207 Accuracy: 34.1667%
[ 21] 0m 0.1s Train Loss: 1.9856 Accuracy: 64.8367%, Test Loss: 1.9928 Accuracy: 64.0000%
[ 31] 0m 0.1s Train Loss: 1.8870 Accuracy: 74.3265%, Test Loss: 1.9004 Accuracy: 72.6667%
[ 41] 0m 0.1s Train Loss: 1.8263 Accuracy: 79.2857%, Test Loss: 1.8398 Accuracy: 78.1667%
[ 51] 0m 0.1s Train Loss: 1.7802 Accuracy: 81.5306%, Test Loss: 1.7932 Accuracy: 80.6667%
[ 61] 0m 0.1s Train Loss: 1.7521 Accuracy: 83.0612%, Test Loss: 1.7659 Accuracy: 82.3333%
[ 71] 0m 0.1s Train Loss: 1.7336 Accuracy: 84.1429%, Test Loss: 1.7452 Accuracy: 83.5000%
[ 81] 0m 0.1s Train Loss: 1.7189 Accuracy: 85.2449%, Test Loss: 1.7307 Accuracy: 84.5000%
[ 91] 0m 0.1s Train Loss: 1.7074 Accuracy: 86.0612%, Test Loss: 1.7198 Accuracy: 84.8333%
[101] 0m 0.1s Train Loss: 1.6983 Accuracy: 86.5306%, Test Loss: 1.7118 Accuracy: 84.8333%
[111] 0m 0

In [15]:
from sklearn.svm import SVC

In [16]:
# Test on SVM
model2 = SVC()
model2.fit(X_train, y_train)
model2.score(X_test, y_test)

0.9933333333333333

In [17]:
nn.Sequential(*list(model.children()))

Sequential(
  (0): Linear(in_features=40, out_features=160, bias=True)
  (1): ReLU()
  (2): Linear(in_features=160, out_features=160, bias=True)
  (3): ReLU()
  (4): Linear(in_features=160, out_features=11, bias=True)
  (5): Softmax(dim=1)
)

In [18]:
nn.Sequential(*list(model.children()))

Sequential(
  (0): Linear(in_features=40, out_features=160, bias=True)
  (1): ReLU()
  (2): Linear(in_features=160, out_features=160, bias=True)
  (3): ReLU()
  (4): Linear(in_features=160, out_features=11, bias=True)
  (5): Softmax(dim=1)
)

In [19]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(n_features, n_features*4)
        self.fc2 = nn.Linear(n_features*4, n_features*4)
        self.fc3 = nn.Linear(n_features*4, n_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)

In [20]:
model3 = Classifier()
model3

Classifier(
  (fc1): Linear(in_features=40, out_features=160, bias=True)
  (fc2): Linear(in_features=160, out_features=160, bias=True)
  (fc3): Linear(in_features=160, out_features=11, bias=True)
)

In [21]:
list(model3.children())

[Linear(in_features=40, out_features=160, bias=True),
 Linear(in_features=160, out_features=160, bias=True),
 Linear(in_features=160, out_features=11, bias=True)]