#### Author: Alexis Geslin

In [1]:
import re
import time
import glob
import pandas as pd
import numpy as np
import argparse
from statistics import stdev
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/


/content/drive/MyDrive/cours/cs224n/project/LLM-Prop


In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('-'*20)
    print(f'I have {torch.cuda.device_count()} devices, currently on {torch.cuda.current_device()}')
    print('-'*20)
else:
    print('-'*20)
    print("You are running on CPU only")
    print('-'*20)
    device = torch.device("cpu")

--------------------
I have 1 devices, currently on 0
--------------------


In [5]:
train_labels_data = pd.read_csv(f'./embeddings/pred_labels_train_train15000_200epoch_125098.csv')
valid_labels_data = pd.read_csv(f'./embeddings/pred_labels_valid_train15000_200epoch_9945.csv')
test_labels_data = pd.read_csv(f'./embeddings/pred_labels_test_train15000_200epoch_11531.csv')

train_embeddings  = pd.read_csv(f'./embeddings/scibert_train_125098.csv', header=None)
valid_embeddings  = pd.read_csv(f'./embeddings/scibert_valid_9945.csv', header=None)
test_embeddings  = pd.read_csv(f'./embeddings/scibert_test_11531.csv', header=None)


In [6]:
#concat embeddings and numerical tokens and make it a tensor
X_train = torch.tensor(train_embeddings.values).float().to(device)
X_valid = torch.tensor(valid_embeddings.values).float().to(device)
X_test = torch.tensor(test_embeddings.values).float().to(device)


Y_train = torch.tensor(train_labels_data.label.values).float().reshape(-1,1).to(device)
Y_valid = torch.tensor(valid_labels_data.label.values).float().reshape(-1,1).to(device)
Y_test = torch.tensor(test_labels_data.label.values).float().reshape(-1,1).to(device)

#print shapes
print(X_train.shape, Y_train.shape,X_train.device,Y_train.device)
print(X_valid.shape, Y_valid.shape,X_valid.device,Y_valid.device)
print(X_test.shape, Y_test.shape,X_test.device,Y_test.device)

torch.Size([125098, 768]) torch.Size([125098, 1]) cuda:0 cuda:0
torch.Size([9945, 768]) torch.Size([9945, 1]) cuda:0 cuda:0
torch.Size([11531, 768]) torch.Size([11531, 1]) cuda:0 cuda:0


In [7]:

dim_embeddings = train_embeddings.shape[1]
mae_loss_function = nn.L1Loss()
mseloss = nn.MSELoss()


In [8]:
X_train_mini = X_train[:15000]
Y_train_mini = Y_train[:15000]
bs = 256 if torch.cuda.is_available() else 32
print(bs)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
train_dataset_mini = torch.utils.data.TensorDataset(X_train_mini, Y_train_mini)
valid_dataset = torch.utils.data.TensorDataset(X_valid, Y_valid)
test_dataset = torch.utils.data.TensorDataset(X_test, Y_test)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=False)
train_dataloader_mini = torch.utils.data.DataLoader(train_dataset_mini, batch_size=bs, shuffle=False)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=bs, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=bs, shuffle=False)

256


In [9]:

def init_weights(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

class ResidualBlock(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.fc = nn.Linear(in_features, in_features)
        self.bn = nn.BatchNorm1d(in_features)

    def forward(self, x):
        return nn.functional.relu(self.bn(self.fc(x)) + x)

class perceptronHead(nn.Module):
    def __init__(self, embeddings_dim,mydroprate = 0.1):
        super(perceptronHead, self).__init__()

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(mydroprate),
            nn.Linear(embeddings_dim, 512),
            nn.ReLU(),
            ResidualBlock(512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            ResidualBlock(128),
            nn.ReLU(),
            nn.Linear(128, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
        # initialize weights with Kaiming
        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, X):
        x_out = self.MLP_embedding(X)
        return x_out



class perceptronHead2(nn.Module):
    def __init__(self, embeddings_dim,mydroprate = 0.1):
        super(perceptronHead2, self).__init__()

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(mydroprate),
            ResidualBlock(embeddings_dim),
            nn.ReLU(),
            nn.Linear(embeddings_dim, 512),
            nn.ReLU(),
            ResidualBlock(512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        )
        # initialize weights with Kaiming
        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, X):
        x_out = self.MLP_embedding(X)
        return x_out
class perceptronHead3(nn.Module):
    def __init__(self, embeddings_dim,mydroprate = 0.1):
        super(perceptronHead3, self).__init__()

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(mydroprate),
            ResidualBlock(embeddings_dim),
            nn.LeakyReLU(),
            nn.Linear(embeddings_dim, 512),
            nn.LeakyReLU(),
            nn.Dropout(mydroprate),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 1),
        )
        # initialize weights with Kaiming
        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, X):
        x_out = self.MLP_embedding(X)
        return x_out

class perceptronHead4(nn.Module):
    def __init__(self, embeddings_dim,mydroprate = 0.1):
        super(perceptronHead4, self).__init__()

        self.MLP_embedding = nn.Sequential(
            nn.Dropout(mydroprate),
            ResidualBlock(embeddings_dim),
            nn.LeakyReLU(),
            nn.Linear(embeddings_dim, 512),
            nn.LeakyReLU(),
            nn.Dropout(mydroprate),
            ResidualBlock(512),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )
        # initialize weights with Kaiming
        for m in self.MLP_embedding:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                print(f'initialized {m}')


    def forward(self, X):
        x_out = self.MLP_embedding(X)
        return x_out


In [19]:
model = perceptronHead4(dim_embeddings,mydroprate=0.1)
model.to(device)

# optimizer = optim.Adam(model.parameters(), lr=0.0015,weight_decay=0.01)
optimizer = optim.Adam(model.parameters(), lr=0.0015)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
my_training_loader = train_dataloader_mini
my_training_loader = train_dataloader

initialized Linear(in_features=768, out_features=512, bias=True)
initialized Linear(in_features=512, out_features=256, bias=True)
initialized Linear(in_features=256, out_features=128, bias=True)
initialized Linear(in_features=128, out_features=64, bias=True)
initialized Linear(in_features=64, out_features=8, bias=True)
initialized Linear(in_features=8, out_features=1, bias=True)


In [20]:
epochs =100

best_valid_loss = 1000000000
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for i, (x, y) in enumerate(my_training_loader):
        optimizer.zero_grad()
        y_pred = model(x)
        loss = mae_loss_function(y_pred, y)
        train_loss += loss.item()
        before_update = {name: param.clone().detach() for name, param in model.named_parameters()}
        loss.backward()
        # for name, param in model.named_parameters():
        #     if param.grad is not None and torch.all(param.grad == 0):
        #         print(f"All-zero gradient for {name}")
        optimizer.step()

        # for name, param in model.named_parameters():
        #     if torch.equal(before_update[name], param):
        #         print(f"No update for {name}")
        # if i % 10000 == 0:
    train_loss /= len(my_training_loader)
    print(f'Epoch {epoch}, Training Loss {round((train_loss),3)}')




    model.eval()

    valid_preds = []
    valid_labels = []
    with torch.no_grad():
        for i, (x, y) in enumerate(valid_dataloader):
            y_pred = model(x)
            valid_preds.append(y_pred)
            # valid_preds.append(y_pred.detach().cpu())
            valid_labels.append(y)
    valid_preds = torch.cat(valid_preds)
    valid_labels = torch.cat(valid_labels)
    valid_loss = mae_loss_function(valid_preds, valid_labels)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './AG_checkpoints/best_model_scibert.pth')
        print("Saving a new best model")
    scheduler.step(best_valid_loss)
    print(f'Epoch {epoch}, Validation Loss {valid_loss.item()}')

Epoch 0, Training Loss 0.81
Saving a new best model
Epoch 0, Validation Loss 0.7249667048454285
Epoch 1, Training Loss 0.733
Saving a new best model
Epoch 1, Validation Loss 0.6730741262435913
Epoch 2, Training Loss 0.711
Saving a new best model
Epoch 2, Validation Loss 0.6491271257400513
Epoch 3, Training Loss 0.695
Epoch 3, Validation Loss 0.6510100960731506
Epoch 4, Training Loss 0.685
Saving a new best model
Epoch 4, Validation Loss 0.6390699148178101
Epoch 5, Training Loss 0.674
Saving a new best model
Epoch 5, Validation Loss 0.6273000836372375
Epoch 6, Training Loss 0.666
Epoch 6, Validation Loss 0.6296814680099487
Epoch 7, Training Loss 0.662
Saving a new best model
Epoch 7, Validation Loss 0.6252003312110901
Epoch 8, Training Loss 0.654
Saving a new best model
Epoch 8, Validation Loss 0.6141462922096252
Epoch 9, Training Loss 0.647
Epoch 9, Validation Loss 0.6208116412162781
Epoch 10, Training Loss 0.641
Saving a new best model
Epoch 10, Validation Loss 0.6103888154029846
Epoc

In [22]:
best_model = perceptronHead4(dim_embeddings)
best_model.to(device)
best_model.load_state_dict(torch.load('./AG_checkpoints/best_model_scibert.pth'))
# best_model = model

best_model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for i, (x, y) in enumerate(test_dataloader):
        y_pred = best_model(x)
        test_preds.append(y_pred)
        test_labels.append(y)
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    test_loss = mae_loss_function(test_preds, test_labels)
    print(f'Test Loss {test_loss.item()}')

initialized Linear(in_features=768, out_features=512, bias=True)
initialized Linear(in_features=512, out_features=256, bias=True)
initialized Linear(in_features=256, out_features=128, bias=True)
initialized Linear(in_features=128, out_features=64, bias=True)
initialized Linear(in_features=64, out_features=8, bias=True)
initialized Linear(in_features=8, out_features=1, bias=True)
Test Loss 0.5062068104743958


  best_model.load_state_dict(torch.load('./AG_checkpoints/best_model_scibert.pth'))


In [None]:
# prompt: # get the device best_model and x are on

print(f"Device for best_model and x: {device}")


Device for best_model and x: cuda


In [None]:
print(next(best_model.parameters()).device)

cuda:0


Training with scheduler, for 200 epochs,  lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

### TESTING BEST MODEL

In [11]:
#model loading
best_model = perceptronHead3(dim_embeddings)
best_model.to(device)
best_model.load_state_dict(torch.load(f'./AG_checkpoints/best_model_011_scibert.pth'))
# best_model.load_state_dict(torch.load(f'./AG_checkpoints/best_model_scibert.pth'))


best_model.train()
#train
with torch.no_grad():
    train_preds = []
    train_labels = []
    for i, (x, y) in enumerate(train_dataloader):
        y_pred = best_model(x)
        train_preds.append(y_pred)
        train_labels.append(y)
    train_preds = torch.cat(train_preds)
    train_labels = torch.cat(train_labels)
    train_loss = mae_loss_function(train_preds, train_labels)
    print(f'Training Loss {round(train_loss.item(),3)}')

#valid
best_model.eval()
with torch.no_grad():
    valid_preds = []
    valid_labels = []
    for i, (x, y) in enumerate(valid_dataloader):
        y_pred = best_model(x)
        valid_preds.append(y_pred)
        valid_labels.append(y)
    valid_preds = torch.cat(valid_preds)
    valid_labels = torch.cat(valid_labels)
    valid_loss = mae_loss_function(valid_preds, valid_labels)
    print(f'Validation Loss {round(valid_loss.item(),3)}')

#train
with torch.no_grad():
    test_preds = []
    test_labels = []
    for i, (x, y) in enumerate(test_dataloader):
        y_pred = best_model(x)
        test_preds.append(y_pred)
        test_labels.append(y)
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    test_loss = mae_loss_function(test_preds, test_labels)
    print(f'Test Loss {round(test_loss.item(),3)}')

initialized Linear(in_features=768, out_features=512, bias=True)
initialized Linear(in_features=512, out_features=128, bias=True)
initialized Linear(in_features=128, out_features=64, bias=True)
initialized Linear(in_features=64, out_features=1, bias=True)


  best_model.load_state_dict(torch.load(f'./AG_checkpoints/best_model_011_scibert.pth'))


Training Loss 0.416
Validation Loss 0.493
Test Loss 0.504


TESTING

In [None]:
best_model = myDualmodel2(dim_embeddings,dim_numerical)
best_model.to(device)

best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))
model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for i, (x, y) in enumerate(test_dataloader):
        y_pred = best_model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        test_preds.append(y_pred)
        test_labels.append(y)
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    test_loss = mae_loss_function(test_preds, test_labels)
    print(f'Test Loss {test_loss.item()}')

In [None]:
###OLD TRAINING

optimizer = optim.Adam(model.parameters(), lr=0.00015,weight_decay=0.01)
# optimizer = optim.Adam(model.parameters(), lr=0.000015)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
epochs = 25
my_training_loader = train_dataloader_mini

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for i, (x, y) in enumerate(my_training_loader):
        optimizer.zero_grad()
        y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
        loss = mae_loss_function(y_pred, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # if i % 10000 == 0:
    train_loss /= len(my_training_loader)
    print(f'Epoch {epoch}, Training Loss {loss.item()}')

    model.eval()
    valid_preds = []
    valid_labels = []
    with torch.no_grad():
        for i, (x, y) in enumerate(valid_dataloader):
            y_pred = model(x[:,:dim_embeddings], x[:,dim_embeddings:])
            valid_preds.append(y_pred)
            valid_labels.append(y)
    valid_preds = torch.cat(valid_preds)
    valid_labels = torch.cat(valid_labels)
    valid_loss = mae_loss_function(valid_preds, valid_labels)
    scheduler.step(valid_loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './AG_checkpoints/best_model.pth')
        print("Saving a new best model")
    print(f'Epoch {epoch}, Validation Loss {valid_loss.item()}')


Epoch 0, Training Loss 0.3552802801132202
Epoch 0, Validation Loss 0.35032400488853455
Epoch 1, Training Loss 0.23885880410671234
Epoch 1, Validation Loss 0.34637391567230225
Epoch 2, Training Loss 0.23440074920654297
Epoch 2, Validation Loss 0.3513180911540985
Epoch 3, Training Loss 0.1513446867465973
Epoch 3, Validation Loss 0.34749555587768555
Epoch 4, Training Loss 0.19549678266048431
Epoch 4, Validation Loss 0.3530590236186981
Epoch 5, Training Loss 0.24011489748954773
Epoch 5, Validation Loss 0.35078033804893494
Epoch 6, Training Loss 0.39733004570007324
Epoch 6, Validation Loss 0.3494114279747009
Epoch 7, Training Loss 0.4102326035499573
Epoch 7, Validation Loss 0.3571268916130066
Epoch 8, Training Loss 0.2927051782608032
Epoch 8, Validation Loss 0.3475220203399658
Epoch 9, Training Loss 0.23760411143302917
Epoch 9, Validation Loss 0.3474278450012207
Epoch 10, Training Loss 0.2625439763069153
Epoch 10, Validation Loss 0.34945937991142273
Epoch 11, Training Loss 0.305087089538574

  best_model.load_state_dict(torch.load('./AG_checkpoints/best_model.pth'))


Test Loss 0.49804964661598206
