In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
os.chdir("/content/drive/MyDrive/Data/docnli")
os.getcwd()

'/content/drive/MyDrive/Data/docnli'

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pickle
import json
import random
import numpy as np
from glob import glob

from tqdm.notebook import tqdm
from transformers import BertModel, AutoTokenizer

In [3]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/179.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [4]:
'''
Variable Declaration
'''

train_data_path = 'data/graph/train'
dev_data_path = 'data/graph/dev'

log_path = 'logs/graph_nn_train.log'
model_path = 'models/graph_nn.pt'

log_dir = 'logs/'
model_dir = 'models/'

batch_save_frequency = 20

max_num_sentences = 10
node_feature_size = 768
channel_size = 256
batch_size = 512
learning_rate = 4.2e-4
total_epochs = 5

In [5]:
'''
Folder Creation
'''
if not os.path.exists(model_dir) :
    os.mkdir(model_dir)


if not os.path.exists(log_dir) :
    os.mkdir(log_dir)

In [6]:
'''
Defining Graph
'''

import torch
import torch.nn.functional as F
from torch import nn

In [7]:
class GCN(nn.Module) :


    def __init__(self, in_shape, hidden_shape, out_shape, max_num_sentences=10) -> None:

        super(GCN, self).__init__()

        # ~~~~ Layers ~~~~
        self.input_layer = nn.Linear(in_shape, hidden_shape)
        self.output_layer = nn.Linear(hidden_shape, out_shape)
        self.mini_dense = nn.Linear(max_num_sentences, out_shape)

        # ~~~~ Activations ~~~~
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()


    def forward(self, x, adj_matrix) -> None :

        # First Graph Convolution
        x = self.input_layer(x)
        x = torch.bmm(adj_matrix , x)
        x = self.relu(x)
        x = F.normalize(x, dim=1)


        # Second Graph Convolution
        x = self.output_layer(x)
        x = torch.bmm(adj_matrix , x)
        x = self.relu(x)
        x = F.normalize(x, dim=1)

        # Dense to look at all nodes and figure importance
        x = torch.flatten(x , start_dim=1)
        x = self.mini_dense(x)
        x = self.sigmoid(x)

        return x

In [8]:
'''
Function Definition
'''

def read_data_from_disk(data_path) :
    return pickle.load(open(data_path , 'rb'))

def batch(iterable) :
    l = len(iterable)
    for i in range(0, l, batch_size) :
        yield iterable[i:min(l, i+batch_size)]

def save_dict_as_pickle(d , save_path) :
    with open(save_path, 'wb') as f :
        pickle.dump(d, f)


In [11]:
'''
Read Data
'''

train_data = [os.path.join(train_data_path, tdp) for tdp in os.listdir(train_data_path)]
train_data = [read_data_from_disk(data_path) for data_path in train_data]

In [12]:
from torch.optim import Adam
from torch.nn import BCELoss
from torcheval import metrics

model = GCN(in_shape=768, hidden_shape=256, out_shape=1)
model = model.to("cuda")

optimizer = Adam(model.parameters() , lr=learning_rate)

if os.path.exists(model_path) :
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    batch_start_epoch = checkpoint['batch_epoch']
    current_loss = checkpoint['loss']
else :
    start_epoch = 0
    batch_start_epoch = 0
    current_loss = 0.0

model.train()
loss = BCELoss()
f1_score = metrics.BinaryF1Score()

In [13]:
train_batches = [b for b in batch(train_data)]
print("Total number of batches : " , len(train_batches))
print("Start Epoch : " , start_epoch)
print("Start Batch Epoch : " , batch_start_epoch)
print("Current Loss : " , current_loss )
# print(optimizer.device)

Total number of batches :  98
Start Epoch :  0
Start Batch Epoch :  0
Current Loss :  0.0


In [None]:
print(train_batches[0])

In [None]:
for epoch in range(start_epoch , total_epochs) :
    for batch_epoch , batch_data in tqdm(enumerate(train_batches[batch_start_epoch:])) :

        # batch_data = [read_data_from_disk(data_path) for data_path in batch_]

        batch_repr = []
        batch_adj_matrix = []
        batch_labels = []

        for data in batch_data :
            batch_repr.append(data['repr'])
            batch_adj_matrix.append(data['adj_matrix'])

            if data['label'] == 'entailment' :
                batch_labels.append(1)
            else :
                batch_labels.append(0)

        batch_repr = torch.from_numpy(np.array(batch_repr)).to(torch.float32).to("cuda")
        batch_adj_matrix = torch.from_numpy(np.array(batch_adj_matrix)).to(torch.float32).to("cuda")
        batch_labels = torch.from_numpy(np.array(batch_labels)).to(torch.float32).to("cuda")

        optimizer.zero_grad()
        train_outputs = model(batch_repr, batch_adj_matrix)
        train_outputs = torch.flatten(train_outputs).to(torch.float32)

        train_loss = loss(train_outputs, batch_labels)
        train_loss.backward()
        optimizer.step()

        current_loss += train_loss.item()
        f1_score.update(train_outputs, batch_labels)

        if batch_epoch % batch_save_frequency == 0 :

            print(f'Batch Epoch : {batch_epoch} , \
            F1 Score : {f1_score.compute().item()} \
            Avg Batch Loss : {current_loss/(batch_epoch+1)}')

            log_dict = {"epoch": epoch,
                    "batch_epoch" : batch_epoch,
                    "f1_score" : f1_score.compute().item(),
                    "avg_batch_loss" : current_loss/(batch_epoch+1)}

            with open(log_path , 'a') as f :
                f.write(json.dumps(log_dict))
                f.write('\n')

            torch.save({
                'epoch': epoch,
                'batch_epoch': batch_epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': current_loss,
                },
                model_path)

    current_loss = 0
    batch_start_epoch = 0

In [28]:
f1_score.compute()

tensor(0.)

In [9]:
val_data = [os.path.join(dev_data_path, dp) for dp in os.listdir(dev_data_path)]
val_data = [read_data_from_disk(data_path) for data_path in val_data]

In [10]:
log_path = 'logs/graph_nn_val.log'

In [11]:
model = GCN(in_shape=768, hidden_shape=256, out_shape=1)
model = model.to("cuda")

checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [13]:
from torch.nn import BCELoss
from torcheval import metrics

val_batches = [b for b in batch(val_data)]
loss = BCELoss()
f1_score = metrics.BinaryF1Score()
current_loss = 0

print("Total number of batches : " , len(val_batches))

Total number of batches :  20


In [14]:
with torch.no_grad() :
    for batch_epoch , batch_data in tqdm(enumerate(val_batches)) :

        batch_repr = []
        batch_adj_matrix = []
        batch_labels = []

        for data in batch_data :
            batch_repr.append(data['repr'])
            batch_adj_matrix.append(data['adj_matrix'])

            if data['label'] == 'entailment' :
                batch_labels.append(1)
            else :
                batch_labels.append(0)


        batch_repr = torch.from_numpy(np.array(batch_repr)).to(torch.float32).to("cuda")
        batch_adj_matrix = torch.from_numpy(np.array(batch_adj_matrix)).to(torch.float32).to("cuda")
        batch_labels = torch.from_numpy(np.array(batch_labels)).to(torch.float32).to("cuda")

        val_outputs = model(batch_repr, batch_adj_matrix)
        val_outputs = torch.flatten(val_outputs).to(torch.float32)

        val_loss = loss(val_outputs, batch_labels)

        current_loss += val_loss
        f1_score.update(val_outputs, batch_labels)


print(f'F1 Score : {f1_score.compute().item()} \
        Avg Batch Loss : {current_loss/(batch_epoch+1)}')

log_dict = {
        "f1_score" : f1_score.compute().item(),
        "avg_batch_loss" : current_loss/(batch_epoch+1)}

with open(log_path , 'a') as f :
    f.write(json.dumps(log_dict))
    f.write('\n')

0it [00:00, ?it/s]

F1 Score : 0.6668890714645386         Avg Batch Loss : 0.6957718729972839


TypeError: ignored