#### Author: Alexis Geslin

In [1]:
import re
import time
import glob
import pandas as pd
import numpy as np
import argparse
from statistics import stdev
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/


/content/drive/MyDrive/cours/cs224n/project/LLM-Prop


In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('-'*20)
    print(f'I have {torch.cuda.device_count()} devices, currently on {torch.cuda.current_device()}')
    print('-'*20)
else:
    print('-'*20)
    print("You are running on CPU only")
    print('-'*20)
    device = torch.device("cpu")

--------------------
I have 1 devices, currently on 0
--------------------


In [5]:
train_labels_data = pd.read_csv(f'./embeddings/pred_labels_train_train15000_200epoch_125098.csv')
valid_labels_data = pd.read_csv(f'./embeddings/pred_labels_valid_train15000_200epoch_9945.csv')
test_labels_data = pd.read_csv(f'./embeddings/pred_labels_test_train15000_200epoch_11531.csv')

train_embeddings  = pd.read_csv(f'./embeddings/embeddings_train_train15000_200epoch_125098.csv', header=None)
valid_embeddings  = pd.read_csv(f'./embeddings/embeddings_valid_train15000_200epoch_9945.csv', header=None)
test_embeddings  = pd.read_csv(f'./embeddings/embeddings_test_train15000_200epoch_11531.csv', header=None)

train_numerical_tokens = pd.read_csv(f'./embeddings/train_numerical_tokens_5_3_1.csv', header=0)
valid_numerical_tokens = pd.read_csv(f'./embeddings/valid_numerical_tokens_5_3_1.csv', header=0)
test_numerical_tokens = pd.read_csv(f'./embeddings/test_numerical_tokens_5_3_1.csv', header=0)

In [6]:
#printing shapes
print(train_labels_data.shape, valid_labels_data.shape, test_labels_data.shape)
print(train_embeddings.shape, valid_embeddings.shape, test_embeddings.shape)
print(train_numerical_tokens.shape, valid_numerical_tokens.shape, test_numerical_tokens.shape)

(125098, 2) (9945, 2) (11531, 2)
(125098, 512) (9945, 512) (11531, 512)
(125098, 9) (9945, 9) (11531, 9)


In [7]:
#concat embeddings and numerical tokens and make it a tensor
X_train = torch.tensor(np.concatenate((train_embeddings, train_numerical_tokens), axis=1),device=device).float()
X_valid = torch.tensor(np.concatenate((valid_embeddings, valid_numerical_tokens), axis=1),device=device).float()
X_test = torch.tensor(np.concatenate((test_embeddings, test_numerical_tokens), axis=1),device=device).float()

Y_train = torch.tensor(train_labels_data.label.values,device=device).float().reshape(-1,1)
Y_valid = torch.tensor(valid_labels_data.label.values,device=device).float().reshape(-1,1)
Y_test = torch.tensor(test_labels_data.label.values,device=device).float().reshape(-1,1)

#print shapes
print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)
print(X_test.shape, Y_test.shape)

torch.Size([125098, 521]) torch.Size([125098, 1])
torch.Size([9945, 521]) torch.Size([9945, 1])
torch.Size([11531, 521]) torch.Size([11531, 1])


In [8]:
dim_embeddings = train_embeddings.shape[1]
dim_numerical = train_numerical_tokens.shape[1]
mae_loss_function = nn.L1Loss()


In [26]:

X_train_mini = X_train[:10000]
Y_train_mini = Y_train[:10000]

bs = 256 if torch.cuda.is_available() else 32
print(bs)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
train_dataset_mini = torch.utils.data.TensorDataset(X_train_mini, Y_train_mini)
valid_dataset = torch.utils.data.TensorDataset(X_valid, Y_valid)
test_dataset = torch.utils.data.TensorDataset(X_test, Y_test)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=False)
train_dataloader_mini = torch.utils.data.DataLoader(train_dataset_mini, batch_size=bs, shuffle=False)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=bs, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=bs, shuffle=False)

256


In [10]:
class myDualmodel(nn.Module):
    def __init__(self, embeddings_dim,num_dim,drop_rate=0.3):
        super(myDualmodel, self).__init__()
        self.num_dim = num_dim
        self.embeddings_dim = embeddings_dim

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(embeddings_dim, 32),
            nn.ReLU()
        )

        self.MLP_numerical = nn.Sequential(
            nn.Linear(num_dim, 32),
            nn.Dropout(drop_rate),
            nn.ReLU()
        )

        self.MLP_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(32, 1)
        )

    def forward(self, x1, x2):
        x1_out = self.MLP_embedding(x1)
        x2_out = self.MLP_numerical(x2)
        x_out = torch.cat((x1_out, x2_out), dim=1)
        x_out = self.MLP_head(x_out)
        return x_out

In [21]:
def init_weights(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

class ResidualBlock(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.fc = nn.Linear(in_features, in_features)
        self.bn = nn.BatchNorm1d(in_features)

    def forward(self, x):
        return nn.functional.relu(self.bn(self.fc(x)) + x)

class ResidualBlock2(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.fc = nn.Linear(in_features, in_features)
        self.bn = nn.BatchNorm1d(in_features)

    def forward(self, x):
        return self.bn(self.fc(x)) + x

class myDualmodel2(nn.Module):
    def __init__(self, embeddings_dim,num_dim,drop_rate=0.3):
        super(myDualmodel2, self).__init__()
        self.num_dim = num_dim
        self.embeddings_dim = embeddings_dim

        self.MLP_embedding = nn.Sequential(
            nn.BatchNorm1d(num_features=embeddings_dim),
            nn.Linear(embeddings_dim, 32),
            nn.ReLU(),
            nn.Dropout(drop_rate),
        )

        self.MLP_numerical = nn.Sequential(
            nn.BatchNorm1d(num_features=num_dim),
            nn.Linear(num_dim, 32),
            nn.ReLU()
        )

        self.MLP_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(32, 1)
        )

    def forward(self, x1, x2):
        x1_out = self.MLP_embedding(x1)
        x2_out = self.MLP_numerical(x2)
        x_out = torch.cat((x1_out, x2_out), dim=1)
        x_out = self.MLP_head(x_out)
        return x_out

class myDualmodel3(nn.Module):
    def __init__(self, embeddings_dim,num_dim,drop_rate=0.3):
        super(myDualmodel3, self).__init__()
        self.num_dim = num_dim
        self.embeddings_dim = embeddings_dim

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock(embeddings_dim),
            nn.LeakyReLU(),
            nn.Linear(embeddings_dim, 32),
            nn.LeakyReLU()
        )

        self.MLP_numerical = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock(num_dim),
            nn.LeakyReLU(),
            nn.Linear(num_dim, 32),
            nn.Dropout(drop_rate),
            nn.ReLU()
        )

        self.MLP_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(32, 1)
        )

        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')
        for m in self.MLP_numerical:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')

        for m in self.MLP_head:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, x1, x2):
        x1_out = self.MLP_embedding(x1)
        x2_out = self.MLP_numerical(x2)
        x_out = torch.cat((x1_out, x2_out), dim=1)
        x_out = self.MLP_head(x_out)
        return x_out



class myDualmodel4(nn.Module):
    def __init__(self, embeddings_dim,num_dim,drop_rate=0.3):
        super(myDualmodel4, self).__init__()
        self.num_dim = num_dim
        self.embeddings_dim = embeddings_dim

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock(embeddings_dim),
            nn.LeakyReLU(),
            ResidualBlock(embeddings_dim),
            nn.LeakyReLU(),
            nn.Linear(embeddings_dim, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 32),
            nn.LeakyReLU()
        )

        self.MLP_numerical = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock(num_dim),
            nn.LeakyReLU(),
            nn.Linear(num_dim, 64),
            nn.LeakyReLU(),
            ResidualBlock(64),
            nn.LeakyReLU(),
            nn.Linear(64, 32),
            nn.Dropout(drop_rate),
            nn.ReLU()
        )

        self.MLP_head = nn.Sequential(
            ResidualBlock(64),
            nn.LeakyReLU(),
            nn.Linear(64, 32),
            nn.LeakyReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(32, 1)
        )

        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')
        for m in self.MLP_numerical:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')

        for m in self.MLP_head:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, x1, x2):
        x1_out = self.MLP_embedding(x1)
        x2_out = self.MLP_numerical(x2)
        x_out = torch.cat((x1_out, x2_out), dim=1)
        x_out = self.MLP_head(x_out)
        return x_out

class myDualmodel5(nn.Module):
    def __init__(self, embeddings_dim,num_dim,drop_rate=0.3):
        super(myDualmodel5, self).__init__()
        self.num_dim = num_dim
        self.embeddings_dim = embeddings_dim

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock2(embeddings_dim),
            nn.LeakyReLU(),
            nn.Linear(embeddings_dim, 32),
            nn.LeakyReLU()
        )

        self.MLP_numerical = nn.Sequential(
            nn.Dropout(drop_rate),
            ResidualBlock2(num_dim),
            nn.LeakyReLU(),
            nn.Linear(num_dim, 32),
            nn.LeakyReLU(),
            nn.Dropout(drop_rate)
        )

        self.MLP_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.LeakyReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(32, 1)
        )

        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')
        for m in self.MLP_numerical:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')

        for m in self.MLP_head:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, x1, x2):
        x1_out = self.MLP_embedding(x1)
        x2_out = self.MLP_numerical(x2)
        x_out = torch.cat((x1_out, x2_out), dim=1)
        x_out = self.MLP_head(x_out)
        return x_out



In [12]:
# train_results = pd.DataFrame(index = np.arange(30))
# valid_results = pd.DataFrame(index = np.arange(30))

In [28]:
model = myDualmodel5(dim_embeddings,dim_numerical,drop_rate=0.1)
model.to(device)
best_valid_loss = 5


optimizer = optim.Adam(model.parameters(), lr=0.0015)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
epochs = 100
my_training_loader = train_dataloader

initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)


In [29]:
# optimizer = optim.Adam(model.parameters(), lr=0.00015,weight_decay=0.01)


train_recording = []
valid_recording =[]


for epoch in range(epochs):
    model.train()
    train_loss = 0
    for i, (x, y) in enumerate(my_training_loader):
        optimizer.zero_grad()
        y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        loss = mae_loss_function(y_pred, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # if i % 10000 == 0:
    train_loss /= len(my_training_loader)
    train_recording.append(train_loss)
    print(f'Epoch {epoch}, Training Loss {train_loss}')

    model.eval()
    valid_preds = []
    valid_labels = []
    with torch.no_grad():
        for i, (x, y) in enumerate(valid_dataloader):
            y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
            valid_preds.append(y_pred)
            valid_labels.append(y)
    valid_preds = torch.cat(valid_preds)
    valid_labels = torch.cat(valid_labels)
    valid_loss = mae_loss_function(valid_preds, valid_labels)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './AG_checkpoints/best_model.pth')
        print("Saving a new best model")
    scheduler.step(best_valid_loss)
    valid_recording.append(best_valid_loss.item())
    print(f'Epoch {epoch}, Validation Loss {valid_loss.item()}')


Epoch 0, Training Loss 1.5202254071557448
Saving a new best model
Epoch 0, Validation Loss 0.3910294771194458
Epoch 1, Training Loss 0.40574746196499145
Saving a new best model
Epoch 1, Validation Loss 0.35130855441093445
Epoch 2, Training Loss 0.37102817035525854
Saving a new best model
Epoch 2, Validation Loss 0.3508126437664032
Epoch 3, Training Loss 0.36003783994291455
Saving a new best model
Epoch 3, Validation Loss 0.3373352885246277
Epoch 4, Training Loss 0.3521557155470907
Epoch 4, Validation Loss 0.3423433005809784
Epoch 5, Training Loss 0.34818900742230974
Epoch 5, Validation Loss 0.3378608226776123
Epoch 6, Training Loss 0.3445480981639802
Epoch 6, Validation Loss 0.34153681993484497
Epoch 7, Training Loss 0.34211749332082786
Saving a new best model
Epoch 7, Validation Loss 0.3364161252975464
Epoch 8, Training Loss 0.33910858460486787
Saving a new best model
Epoch 8, Validation Loss 0.3349840044975281
Epoch 9, Training Loss 0.33819935725266703
Epoch 9, Validation Loss 0.3354

In [30]:
best_model = myDualmodel5(dim_embeddings,dim_numerical)
best_model.to(device)
best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))

best_model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for i, (x, y) in enumerate(test_dataloader):
        y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        test_preds.append(y_pred)
        test_labels.append(y)
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    test_loss = mae_loss_function(test_preds, test_labels)
    print(f'Test Loss {round(test_loss.item(),3)}')

initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)
Test Loss 0.347


  best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))


## Testing best model ever

In [32]:
best_model = myDualmodel5(dim_embeddings,dim_numerical)
best_model.to(device)
best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))



best_model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for i, (x, y) in enumerate(test_dataloader):
        y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        test_preds.append(y_pred)
        test_labels.append(y)
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    test_loss = mae_loss_function(test_preds, test_labels)
    print(f'Test Loss {round(test_loss.item(),3)}')


#validation
best_model.eval()
with torch.no_grad():
    valid_preds = []
    valid_labels = []
    for i, (x, y) in enumerate(valid_dataloader):
        y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        valid_preds.append(y_pred)
        valid_labels.append(y)
    valid_preds = torch.cat(valid_preds)
    valid_labels = torch.cat(valid_labels)
    valid_loss = mae_loss_function(valid_preds, valid_labels)
    print(f'Valid Loss {round(valid_loss.item(),3)}')


#training
best_model.train()
with torch.no_grad():
    train_preds = []
    train_labels = []
    for i, (x, y) in enumerate(train_dataloader):
        y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        train_preds.append(y_pred)
        train_labels.append(y)
    train_preds = torch.cat(train_preds)
    train_labels = torch.cat(train_labels)
    train_loss = mae_loss_function(train_preds, train_labels)
    print(f'Train Loss {round(train_loss.item(),3)}')



initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)
Test Loss 0.347


  best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))


Valid Loss 0.327
Train Loss 0.348


## Dropout experiment results gathering

In [68]:
chosen_dr = [0,0.1,0.3,0.5,0.7]
num_epochs = 30
train_results = pd.DataFrame(index = np.arange(num_epochs))
valid_results = pd.DataFrame(index = np.arange(num_epochs))


In [69]:
for dr in chosen_dr:
  print(f"doing dropout rate {dr}")
  print('-'*50)

  #load the models and its hyperparams
  model = myDualmodel3(dim_embeddings,dim_numerical,drop_rate=dr)
  model.to(device)
  best_valid_loss = 5
  optimizer = optim.Adam(model.parameters(), lr=0.0015)
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
  epochs = num_epochs
  my_training_loader = train_dataloader

  #initiating our lists for recording
  train_recording = []
  valid_recording =[]

  #train
  for epoch in range(epochs):
      model.train()
      train_loss = 0
      for i, (x, y) in enumerate(my_training_loader):
          optimizer.zero_grad()
          y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
          loss = mae_loss_function(y_pred, y)
          train_loss += loss.item()
          loss.backward()
          optimizer.step()
          # if i % 10000 == 0:
      train_loss /= len(my_training_loader)
      train_recording.append(train_loss)
      print(f'Epoch {epoch}, Training Loss {train_loss}')

      model.eval()
      valid_preds = []
      valid_labels = []
      with torch.no_grad():
          for i, (x, y) in enumerate(valid_dataloader):
              y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
              valid_preds.append(y_pred)
              valid_labels.append(y)
      valid_preds = torch.cat(valid_preds)
      valid_labels = torch.cat(valid_labels)
      valid_loss = mae_loss_function(valid_preds, valid_labels)
      if valid_loss < best_valid_loss:
          best_valid_loss = valid_loss
          torch.save(model.state_dict(), './AG_checkpoints/best_model.pth')
          print("Saving a new best model")
      scheduler.step(best_valid_loss)
      valid_recording.append(best_valid_loss.item())
      print(f'Epoch {epoch}, Validation Loss {valid_loss.item()}')

  #populating the dataframes
  train_results[str(dr)] = train_recording
  valid_results[str(dr)] = valid_recording

doing dropout rate 0
--------------------------------------------------
initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)




Epoch 0, Training Loss 0.6325349856495613
Saving a new best model
Epoch 0, Validation Loss 0.4965791404247284
Epoch 1, Training Loss 0.4174899172136876
Saving a new best model
Epoch 1, Validation Loss 0.3955647647380829
Epoch 2, Training Loss 0.39557726943297866
Saving a new best model
Epoch 2, Validation Loss 0.37607163190841675
Epoch 3, Training Loss 0.37995777426931265
Epoch 3, Validation Loss 0.42486509680747986
Epoch 4, Training Loss 0.38252044239049066
Epoch 4, Validation Loss 0.4446166455745697
Epoch 5, Training Loss 0.3778694518565644
Epoch 5, Validation Loss 0.41750410199165344
Epoch 6, Training Loss 0.3678942091290692
Epoch 6, Validation Loss 0.4219253659248352
Epoch 7, Training Loss 0.3661458725852469
Epoch 7, Validation Loss 0.4265061020851135
Epoch 8, Training Loss 0.35839695238140706
Epoch 8, Validation Loss 0.43299445509910583
Epoch 9, Training Loss 0.3461207546948899
Saving a new best model
Epoch 9, Validation Loss 0.3550279140472412
Epoch 10, Training Loss 0.3376966331



Epoch 0, Training Loss 1.1702557227484776
Saving a new best model
Epoch 0, Validation Loss 0.36298882961273193
Epoch 1, Training Loss 0.3944293512583754
Saving a new best model
Epoch 1, Validation Loss 0.3531750738620758
Epoch 2, Training Loss 0.38297799851265424
Epoch 2, Validation Loss 0.35537591576576233
Epoch 3, Training Loss 0.380289577831398
Saving a new best model
Epoch 3, Validation Loss 0.3483071029186249
Epoch 4, Training Loss 0.37853303949342915
Epoch 4, Validation Loss 0.3557131290435791
Epoch 5, Training Loss 0.37831311381911453
Saving a new best model
Epoch 5, Validation Loss 0.33821117877960205
Epoch 6, Training Loss 0.37435484130567814
Epoch 6, Validation Loss 0.34517523646354675
Epoch 7, Training Loss 0.3763316992388425
Epoch 7, Validation Loss 0.35057333111763
Epoch 8, Training Loss 0.3729814965818801
Epoch 8, Validation Loss 0.35785090923309326
Epoch 9, Training Loss 0.37243408167350023
Epoch 9, Validation Loss 0.3431842029094696
Epoch 10, Training Loss 0.37283462641



Epoch 0, Training Loss 1.405675346195576
Saving a new best model
Epoch 0, Validation Loss 0.3496880829334259
Epoch 1, Training Loss 0.4536055251498895
Epoch 1, Validation Loss 0.38400447368621826
Epoch 2, Training Loss 0.44189240852016615
Epoch 2, Validation Loss 0.37217652797698975
Epoch 3, Training Loss 0.4388245259752547
Saving a new best model
Epoch 3, Validation Loss 0.347782164812088
Epoch 4, Training Loss 0.43459464085492133
Epoch 4, Validation Loss 0.361983060836792
Epoch 5, Training Loss 0.4349367919936014
Epoch 5, Validation Loss 0.35247620940208435
Epoch 6, Training Loss 0.4322215696541566
Epoch 6, Validation Loss 0.348047137260437
Epoch 7, Training Loss 0.4310197805212563
Saving a new best model
Epoch 7, Validation Loss 0.346542626619339
Epoch 8, Training Loss 0.43162898545616246
Epoch 8, Validation Loss 0.35078004002571106
Epoch 9, Training Loss 0.43074689552584067
Epoch 9, Validation Loss 0.349194198846817
Epoch 10, Training Loss 0.430480010883696
Epoch 10, Validation Los



Epoch 0, Training Loss 2.5568195060352608
Saving a new best model
Epoch 0, Validation Loss 0.38506606221199036
Epoch 1, Training Loss 0.5193821200015355
Epoch 1, Validation Loss 0.3981247842311859
Epoch 2, Training Loss 0.49437152668986095
Epoch 2, Validation Loss 0.4437078833580017
Epoch 3, Training Loss 0.4847427411557219
Epoch 3, Validation Loss 0.4170193076133728
Epoch 4, Training Loss 0.4820701071577326
Epoch 4, Validation Loss 0.4215184450149536
Epoch 5, Training Loss 0.47279399992985716
Saving a new best model
Epoch 5, Validation Loss 0.372670441865921
Epoch 6, Training Loss 0.470936687598443
Epoch 6, Validation Loss 0.4095340371131897
Epoch 7, Training Loss 0.47148287704629643
Epoch 7, Validation Loss 0.4066239297389984
Epoch 8, Training Loss 0.4676756662581352
Saving a new best model
Epoch 8, Validation Loss 0.36551621556282043
Epoch 9, Training Loss 0.4675393149043397
Epoch 9, Validation Loss 0.36647894978523254
Epoch 10, Training Loss 0.4685049998309227
Epoch 10, Validation 



Epoch 0, Training Loss 5.869405160530457
Saving a new best model
Epoch 0, Validation Loss 0.4676188826560974
Epoch 1, Training Loss 0.6660302631206552
Epoch 1, Validation Loss 0.47001758217811584
Epoch 2, Training Loss 0.6278243145328358
Epoch 2, Validation Loss 0.5532776117324829
Epoch 3, Training Loss 0.6046063870130629
Saving a new best model
Epoch 3, Validation Loss 0.45388346910476685
Epoch 4, Training Loss 0.586269042730819
Epoch 4, Validation Loss 0.4744335114955902
Epoch 5, Training Loss 0.5792560621273298
Epoch 5, Validation Loss 0.4993826448917389
Epoch 6, Training Loss 0.5736806837571423
Saving a new best model
Epoch 6, Validation Loss 0.4255141615867615
Epoch 7, Training Loss 0.571100195431758
Epoch 7, Validation Loss 0.4857250154018402
Epoch 8, Training Loss 0.5640749069566863
Epoch 8, Validation Loss 0.45575833320617676
Epoch 9, Training Loss 0.5669764809325673
Epoch 9, Validation Loss 0.4369828999042511
Epoch 10, Training Loss 0.5628135263432267
Epoch 10, Validation Loss

In [70]:
train_results.tail(2)

Unnamed: 0,0,0.1,0.3,0.5,0.7
28,0.298027,0.357821,0.407758,0.457181,0.55116
29,0.297132,0.357748,0.407837,0.458101,0.555635


In [71]:
valid_results.tail(2)

Unnamed: 0,0,0.1,0.3,0.5,0.7
28,0.328888,0.329762,0.331274,0.350054,0.385536
29,0.328888,0.329762,0.331274,0.349689,0.385536


In [72]:
train_results.to_pickle('./train_results2.pkl')
valid_results.to_pickle('./valid_results2.pkl')

In [None]:
# #test loss
# test_losses_dr = [0.38685,0.3915,0.3912]
# dr = [0.1,0.3,0.5,0.7]

## Running Batchsize experiment

In [22]:
bs_sizes = [32,64,128,256,512,1024]
bs_df  = pd.DataFrame(index = bs_sizes, columns=['train_loss','valid_loss','test_loss'])

In [23]:
for bs in bs_sizes:
    print(f'STARTING WITH BATCH SIZE {bs}')
    print("--"*50)

    #Making the dataloader
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=False)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=bs, shuffle=False)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=bs, shuffle=False)

    #initiating the model and optimizer
    model = myDualmodel5(dim_embeddings,dim_numerical,drop_rate=0.1)
    model.to(device)

    best_valid_loss = 5
    optimizer = optim.Adam(model.parameters(), lr=0.0015* (bs/128))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
    epochs = 30
    my_training_loader = train_dataloader

    #training
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i, (x, y) in enumerate(my_training_loader):
            optimizer.zero_grad()
            y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
            loss = mae_loss_function(y_pred, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            # if i % 10000 == 0:
        train_loss /= len(my_training_loader)
        print(f'Epoch {epoch}, Training Loss {train_loss}')

        model.eval()
        valid_preds = []
        valid_labels = []
        with torch.no_grad():
            for i, (x, y) in enumerate(valid_dataloader):
                y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
                valid_preds.append(y_pred)
                valid_labels.append(y)
        valid_preds = torch.cat(valid_preds)
        valid_labels = torch.cat(valid_labels)
        valid_loss = mae_loss_function(valid_preds, valid_labels)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'./AG_checkpoints/best_model_bs2_{str(bs)}.pth')
            print("Saving a new best model")
        scheduler.step(best_valid_loss)
        print(f'Epoch {epoch}, Validation Loss {valid_loss.item()}')


    #evaluation
    best_model = myDualmodel5(dim_embeddings,dim_numerical)
    best_model.to(device)

    best_model.load_state_dict(torch.load(f'./AG_checkpoints/best_model_bs2_{str(bs)}.pth'))
    best_model.eval()
    with torch.no_grad():
        test_preds = []
        test_labels = []
        for i, (x, y) in enumerate(test_dataloader):
            y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
            test_preds.append(y_pred)
            test_labels.append(y)
        test_preds = torch.cat(test_preds)
        test_labels = torch.cat(test_labels)
        test_loss = mae_loss_function(test_preds, test_labels)
        print(f'Test Loss {test_loss.item()}')

    #recording
    bs_df.loc[bs,'train_loss'] = train_loss
    bs_df.loc[bs,'valid_loss'] = valid_loss.item()
    bs_df.loc[bs,'test_loss'] = test_loss.item()

STARTING WITH BATCH SIZE 32
----------------------------------------------------------------------------------------------------
initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)




Epoch 0, Training Loss 1.0349548010303236
Saving a new best model
Epoch 0, Validation Loss 0.3630901277065277
Epoch 1, Training Loss 0.40797450280822145
Saving a new best model
Epoch 1, Validation Loss 0.3504640460014343
Epoch 2, Training Loss 0.3714826003174343
Epoch 2, Validation Loss 0.3598935604095459
Epoch 3, Training Loss 0.35380273364065096
Saving a new best model
Epoch 3, Validation Loss 0.34126463532447815
Epoch 4, Training Loss 0.34276575233758716
Epoch 4, Validation Loss 0.3471960127353668
Epoch 5, Training Loss 0.3376841849063897
Saving a new best model
Epoch 5, Validation Loss 0.33573079109191895
Epoch 6, Training Loss 0.3328779936284589
Saving a new best model
Epoch 6, Validation Loss 0.33557596802711487
Epoch 7, Training Loss 0.33109096046680075
Epoch 7, Validation Loss 0.3362908661365509
Epoch 8, Training Loss 0.32905256498004776
Epoch 8, Validation Loss 0.3407595753669739
Epoch 9, Training Loss 0.3267781442645794
Epoch 9, Validation Loss 0.3359006345272064
Epoch 10, Tr

  best_model.load_state_dict(torch.load(f'./AG_checkpoints/best_model_bs2_{str(bs)}.pth'))


Test Loss 0.35068660974502563
STARTING WITH BATCH SIZE 64
----------------------------------------------------------------------------------------------------
initialized Linear(in_features=512, out_features=32, bias=True)
initialized Linear(in_features=9, out_features=32, bias=True)
initialized Linear(in_features=64, out_features=32, bias=True)
initialized Linear(in_features=32, out_features=1, bias=True)
Epoch 0, Training Loss 0.9353209448211334
Saving a new best model
Epoch 0, Validation Loss 0.360471248626709
Epoch 1, Training Loss 0.39581731062792147
Saving a new best model
Epoch 1, Validation Loss 0.3543114960193634
Epoch 2, Training Loss 0.3747598690488149
Saving a new best model
Epoch 2, Validation Loss 0.34288033843040466
Epoch 3, Training Loss 0.3625934260656767
Epoch 3, Validation Loss 0.35596439242362976
Epoch 4, Training Loss 0.35425072055491036
Epoch 4, Validation Loss 0.3511887490749359
Epoch 5, Training Loss 0.3487266976090953
Saving a new best model
Epoch 5, Validation

In [24]:
bs_df

Unnamed: 0,train_loss,valid_loss,test_loss
32,0.311704,0.331152,0.350687
64,0.31301,0.328517,0.349622
128,0.312969,0.329269,0.349444
256,0.316306,0.330912,0.346489
512,0.323414,0.331122,0.350494
1024,0.324092,0.335916,0.355413


In [25]:
bs_df.to_pickle('./bs_df.pkl')