In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from tqdm.notebook import tqdm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score
import scipy
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

torch.manual_seed(42)

<torch._C.Generator at 0x7fda8c90cdd0>

In [2]:
train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')

## Preprocessing data

In [3]:
train_size = train_df.shape[0]
dev_size = dev_df.shape[0]

In [4]:
train_size, dev_size

(483, 207)

In [5]:
df = pd.concat([train_df, dev_df])

In [6]:
embedding_idx_data = {'A1': {10: 0, 11: 1},
                'A4': {40: 2, 41: 3, 42: 4, 43: 5},
                'A5': {50: 8, 51: 7, 52: 6},
                'A6': {600: 9, 601: 10, 602: 11, 603: 12, 604: 13, 605: 14, 606: 15, 607: 16, 608: 17, 609: 18, 610: 19, 611: 20, 612: 21, 613: 22},
                'A7': {70: 23, 71: 24, 72: 25, 73: 26, 74: 27, 75: 28, 76: 29, 77: 30, 78: 31},
                'A9': {90: 32, 91: 33},
                'A10': {100: 34, 101: 35},
                'A12': {120: 36, 121: 37},
                'A13': {130: 38, 131: 39, 132: 40}
               }

In [7]:
df = df.replace(embedding_idx_data)

In [8]:
df['A16'] = df['A16'].replace({-1: 0})

In [9]:
# One hot encoding 
df = df[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', # Categorical
   'A2', 'A3', 'A8', 'A11', 'A14', 'A15', # Numerical
    'A16'
   ]]

In [10]:
df.head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A2,A3,A8,A11,A14,A15,A16
0,0.0,3.0,7.0,14.0,23.0,32.0,34.0,37.0,38.0,21.67,1.165,2.5,1.0,180.0,20.0,0.0
1,0.0,3.0,7.0,18.0,23.0,32.0,34.0,36.0,38.0,23.58,0.46,2.625,6.0,208.0,347.0,0.0
2,1.0,2.0,8.0,9.0,23.0,32.0,34.0,36.0,38.0,47.75,8.0,7.875,6.0,0.0,1260.0,1.0
3,0.0,2.0,8.0,9.0,23.0,32.0,35.0,37.0,38.0,31.42,15.5,0.5,0.0,120.0,0.0,0.0
4,0.0,2.0,8.0,11.0,23.0,32.0,34.0,36.0,38.0,25.67,12.5,1.21,67.0,140.0,258.0,1.0


In [11]:
train_df = df.iloc[:train_size]
dev_df = df.iloc[train_size:]

assert train_df.shape[0] == train_size
assert dev_df.shape[0] == dev_size

In [12]:
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(train_df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [13]:
train_df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']] = min_max_scaler.transform(train_df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']])
dev_df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']] = min_max_scaler.transform(dev_df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

In [14]:
X_train = train_df.drop('A16', axis=1).values
y_train = train_df['A16'].values


X_dev = dev_df.drop('A16', axis=1).values
y_dev = dev_df['A16'].values

In [15]:
X_train = np.expand_dims(X_train, axis=-1)
X_dev = np.expand_dims(X_dev, axis=-1)

## Model

In [16]:
print('Examples:{}    Features:{}'.format(X_train.shape[0], X_train.shape[1]))

Examples:483    Features:15


In [17]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
                
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, ninp)
        
        def sample(batch):
            batch_sampling = []
            for row in batch:                
                row_sampling = []
                for feature in row:
                    row_sampling.append(torch.normal(feature, 0.01, size=(ninp, )))
                    
                batch_sampling.append(torch.stack(row_sampling))
            return torch.stack(batch_sampling)
        
        self.sampling = sample
        
        self.decoder = nn.Linear(600, 1)
        
        self.activation = nn.Sigmoid()
        
    
    def forward(self, src):
        src_cat = self.embedding(src[:, :9].squeeze().long())
        src_num = self.sampling(src[:, 9:].squeeze())
        src = torch.cat((src_cat, src_num), 1)
        src = src.transpose(0, 1)
        
        output = self.transformer_encoder(src).transpose(0, 1)
        output = torch.flatten(output, start_dim=1)
        output = self.decoder(output)
        output = self.activation(output)
        
        return output

In [18]:
model = TransformerModel(
    42, # The size of vocabulary
    40, # Embedding size 
    4, # The number of heads in the multiheadattention models 
    64, # The dimension of the feedforward network model in nn.TransformerEncoder
    1, # The number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    dropout=0.2)

In [19]:
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train).float()

X_dev = torch.Tensor(X_dev)
y_dev = torch.Tensor(y_dev).float()

In [20]:
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=4)

dev_dataset = TensorDataset(X_dev, y_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=4) # create your dataloader

In [21]:
EPOCHS = 50
LR = 1e-4
CHECKPOINT = 'transformer.pt'

In [22]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [23]:
best_dev_loss = 10e18
best_dev_epoch = 1

for epoch in range(EPOCHS):  # loop over the dataset multiple times

    ############## Train
    model.train()
    tr_loss = 0.0
    t = tqdm(enumerate(train_dataloader, 0), desc='Progress')
    for i, data in t:
        t.set_postfix({
            'Epoch': epoch + 1,
            'Batch': i + 1, 
            'Train loss': tr_loss / (i + 1)
        })
    
        inputs, labels = data
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs.flatten(), labels)
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        
    ############## Validation
    model.eval()
    dev_loss = 0.0
    t = tqdm(enumerate(dev_dataloader, 0), desc='Progress')
    for i, data in t:
        t.set_postfix({
            'Epoch': epoch + 1,
            'Batch': i + 1, 
            'Dev loss': dev_loss / (i + 1)
        })
    
        inputs, labels = data
        
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs.flatten(), labels)
            dev_loss += loss.item()
            
    if dev_loss < best_dev_loss:
        best_dev_epoch = epoch
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), CHECKPOINT)
        
print('Finished Training. Best dev loss: {}. Epoch: {}.'.format(best_dev_loss / (len(dev_dataloader)), best_dev_epoch))

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…


Finished Training. Best dev loss: 0.34737541089550805. Epoch: 4.


In [24]:
model.load_state_dict(torch.load(CHECKPOINT))

<All keys matched successfully>

In [25]:
model.eval()

with torch.no_grad():
    # Train
    y_train_pred = np.round(model(X_train))
    train_cm = confusion_matrix(y_train, y_train_pred)
    train_acc = accuracy_score(y_train, y_train_pred)
    
    # Validation
    y_dev_pred = np.round(model(X_dev))
    dev_cm = confusion_matrix(y_dev, y_dev_pred)
    dev_acc = accuracy_score(y_dev, y_dev_pred)

In [26]:
train_cm, train_acc

(array([[219,  49],
        [ 16, 199]]),
 0.865424430641822)

In [27]:
dev_cm, dev_acc

(array([[93, 22],
        [ 8, 84]]),
 0.855072463768116)