In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from tqdm.notebook import tqdm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score
import scipy
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f8f21cc9dd0>

In [2]:
train_df = pd.read_csv('n_train.csv')
dev_df = pd.read_csv('n_dev.csv')

## Preprocessing data

In [3]:
train_size = train_df.shape[0]
dev_size = dev_df.shape[0]

In [4]:
train_size, dev_size

(429, 185)

In [5]:
df = pd.concat([train_df, dev_df])

In [6]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
label_col = 'Loan_Status'

In [7]:
embedding_idx_data = {'Gender': {'Male': 0, 'Female': 1},
                'Married': {'No': 2, 'Yes': 3},
                'Dependents': {'0': 4, '1': 5, '2': 6, '3+': 7 },
                'Education': {'Graduate': 8, 'Not Graduate': 9},
                'Self_Employed': {'No': 10, 'Yes': 11},
                'Credit_History': {0: 12, 1: 13},
                'Property_Area': {'Urban': 14, 'Rural': 15, 'Semiurban': 16}
               }

In [8]:
df = df.replace(embedding_idx_data)

In [9]:
df[label_col] = df[label_col].replace({-1: 0})

In [10]:
# One hot encoding 
df = df[categorical_cols + numerical_cols + [label_col]]

In [11]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Loan_Status
0,0,3,6,8,10,13.0,14,2500.0,1840.0,109.0,360.0,1.0
1,0,2,4,8,10,13.0,16,5941.0,4232.0,296.0,360.0,1.0
2,0,3,7,9,10,12.0,16,4931.0,0.0,128.0,360.0,0.0
3,0,3,4,9,10,13.0,15,2894.0,2792.0,155.0,360.0,1.0
4,0,3,4,8,10,13.0,14,2500.0,3796.0,120.0,360.0,1.0


In [12]:
train_df = df.iloc[:train_size]
dev_df = df.iloc[train_size:]

assert train_df.shape[0] == train_size
assert dev_df.shape[0] == dev_size

In [13]:
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(train_df[numerical_cols])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [14]:
train_df[numerical_cols] = min_max_scaler.transform(train_df[numerical_cols])
dev_df[numerical_cols] = min_max_scaler.transform(dev_df[numerical_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [15]:
X_train = train_df.drop(label_col, axis=1).values
y_train = train_df[label_col].values


X_dev = dev_df.drop(label_col, axis=1).values
y_dev = dev_df[label_col].values

In [16]:
X_train = np.expand_dims(X_train, axis=-1)
X_dev = np.expand_dims(X_dev, axis=-1)

## Model

In [17]:
print('Examples:{}    Features:{}'.format(X_train.shape[0], X_train.shape[1]))

Examples:429    Features:11


In [18]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
                
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, ninp)
        
        self.num_embedding = nn.Linear(1, ninp)
        
        self.decoder = nn.Linear(ninp * 11, 1)
        
        self.activation = nn.Sigmoid()
        
    
    def forward(self, src):
        src_cat = self.embedding(src[:, :len(categorical_cols)].squeeze().long())
        src_num = self.num_embedding(src[:, len(categorical_cols):])
        src = torch.cat((src_cat, src_num), 1)
        src = src.transpose(0, 1)
        
        output = self.transformer_encoder(src).transpose(0, 1)
        output = torch.flatten(output, start_dim=1)
        output = self.decoder(output)
        output = self.activation(output)
        
        return output

In [19]:
model = TransformerModel(
    42, # The size of vocabulary
    3, # Embedding size 
    1, # The number of heads in the multiheadattention models 
    4, # The dimension of the feedforward network model in nn.TransformerEncoder
    1, # The number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    dropout=0.2)

In [20]:
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train).float()

X_dev = torch.Tensor(X_dev)
y_dev = torch.Tensor(y_dev).float()

In [21]:
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=3)

dev_dataset = TensorDataset(X_dev, y_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=3) # create your dataloader

In [22]:
EPOCHS = 50
LR = 1e-4
CHECKPOINT = 'simple_transformer.pt'

In [23]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [24]:
best_dev_loss = 10e18
best_dev_epoch = 1

for epoch in range(EPOCHS):  # loop over the dataset multiple times

    ############## Train
    model.train()
    tr_loss = 0.0
    t = tqdm(enumerate(train_dataloader, 0), desc='Progress')
    for i, data in t:
        t.set_postfix({
            'Epoch': epoch + 1,
            'Batch': i + 1, 
            'Train loss': tr_loss / (i + 1)
        })
    
        inputs, labels = data
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.flatten(), labels)
        
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
    ############## Validation
    model.eval()
    dev_loss = 0.0
    t = tqdm(enumerate(dev_dataloader, 0), desc='Progress')
    for i, data in t:
        t.set_postfix({
            'Epoch': epoch + 1,
            'Batch': i + 1, 
            'Dev loss': dev_loss / (i + 1)
        })
    
        inputs, labels = data
        
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs.flatten(), labels)
            dev_loss += loss.item()
            
    if dev_loss < best_dev_loss:
        best_dev_epoch = epoch
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), CHECKPOINT)
        
print('Finished Training. Best dev loss: {}. Epoch: {}.'.format(best_dev_loss / (len(dev_dataloader)), best_dev_epoch))

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Progress', max=1.0, style=ProgressStyle…


Finished Training. Best dev loss: 0.4994705182410056. Epoch: 49.


In [25]:
model.load_state_dict(torch.load(CHECKPOINT))

<All keys matched successfully>

In [26]:
model.eval()

with torch.no_grad():
    # Train
    y_train_pred = np.round(model(X_train))
    train_cm = confusion_matrix(y_train, y_train_pred)
    train_acc = accuracy_score(y_train, y_train_pred)
    
    # Validation
    y_dev_pred = np.round(model(X_dev))
    dev_cm = confusion_matrix(y_dev, y_dev_pred)
    dev_acc = accuracy_score(y_dev, y_dev_pred)

In [27]:
train_cm, train_acc

(array([[ 56,  78],
        [  6, 289]]),
 0.8041958041958042)

In [28]:
dev_cm, dev_acc

(array([[ 27,  31],
        [  3, 124]]),
 0.8162162162162162)