In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir("/content/drive/MyDrive/Data/docnli")
os.getcwd()

'/content/drive/MyDrive/Data/docnli'

In [None]:
!pip install spacy spacy-experimental
!python -m spacy download en_core_web_lg
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl
!pip install torch_geometric

In [4]:
!python -m spacy validate

2023-12-19 01:10:04.551986: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-19 01:10:04.552038: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-19 01:10:04.553516: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-19 01:10:04.561806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-19 01:10:07.631533: I external/local_

In [None]:
!python -m spacy download en_core_web_lg

In [6]:
import spacy
# spacy.download('en_core_web_sm')
nlp = spacy.load("en_core_web_lg")
nlp_coref = spacy.load("en_coreference_web_trf")

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import pickle
import json
import random
import numpy as np
from itertools import permutations
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm
from transformers import BertModel, AutoTokenizer

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [9]:
'''
Variable Declaration
'''

train_data_path = 'data/docnli/train_5sent_50ksample.json'
dev_data_path = 'data/docnli/dev_5sent_10ksample.json'

save_dir = 'data/graph'
train_data_output_dir = 'data/graph/train'
dev_data_output_dir = 'data/graph/dev'
batch_size = 2

model_name = 'bert-base-cased'
tokenizer_name = 'bert-base-cased'

In [10]:
'''
Folder Creation
'''

if not os.path.exists(save_dir) :
    os.mkdir(save_dir)

if not os.path.exists(train_data_output_dir) :
    os.mkdir(train_data_output_dir)

if not os.path.exists(dev_data_output_dir) :
    os.mkdir(dev_data_output_dir)

In [11]:
'''
Load Models
'''

model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

model = model.to("cuda")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [12]:
'''
Function Definition
'''

def get_sent_num(doc, coref_span) :

    for sent_idx, sent in enumerate(doc.sents):
        if (coref_span.start >= sent.start) and (coref_span.end <= sent.end):
            return sent_idx

def get_adj_matrix(doc , coref_doc , max_number_sentences=10) :

    adjacency_matrix = np.zeros((max_number_sentences,max_number_sentences))
    adjacencies = []

    for cluster, spans in coref_doc.spans.items() :
        adjacencies.append([get_sent_num(doc, span)
                            for span in spans
                            if get_sent_num(doc, span) != None])

    for adjacency in adjacencies :
        for c in permutations(adjacency, 2) :
            adjacency_matrix[c] += 1

    return adjacency_matrix



def save_dict_as_pickle(d , save_path) :
    with open(save_path, 'wb') as f :
        pickle.dump(d, f)

def get_cls(text) :

    tokens = tokenizer(text, padding='max_length' , truncation=True, return_tensors='pt')

    input_ids = tokens.input_ids.to("cuda")
    attention_mask = tokens.attention_mask.to("cuda")

    cls = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:,0,:]
    cls = cls.cpu().detach().numpy()

    return cls

def get_sentence_lvl_repr(text, max_num_sentences=5 , model_dim=768) :

    sentence_lvl_repr = np.zeros((max_num_sentences, model_dim))

    # sent_tokenized_text = sent_tokenize(text)
    sent_tokenized_text = [sent.text for sent in text.sents]
    if len(sent_tokenized_text) > max_num_sentences :
        print(f'Encountered sent with >{max_num_sentences} sentences')
        sent_tokenized_text = random.sample(sent_tokenized_text, k=max_num_sentences)

    cls = get_cls(sent_tokenized_text)
    sentence_lvl_repr[:cls.shape[0]] = cls

    return sentence_lvl_repr

In [13]:
'''
Read data
'''

train_data = json.load(open(train_data_path))
dev_data = json.load(open(dev_data_path))

max_num_sentences = 10

In [None]:
start_index = len(os.listdir(train_data_output_dir))


for data in tqdm(train_data[start_index:]) :

    try :

        text = data['premise'] + data['hypothesis']
        doc = nlp(text)
        coref = nlp_coref(text)

        sent_tokenized_text = [sent.text for sent in doc.sents]
        if len(sent_tokenized_text) > max_num_sentences :
            print(f'Encountered sent with >{max_num_sentences} sentences')
            sent_tokenized_text = random.sample(sent_tokenized_text,
                                                k=max_num_sentences)
            doc = nlp(' '.join(sent_tokenized_text))


        adjacency_matrix = get_adj_matrix(doc , coref , max_number_sentences=10)
        repr = get_sentence_lvl_repr(doc, max_num_sentences=10 , model_dim=768)

        data['adj_matrix'] = adjacency_matrix
        data['repr'] = repr

        save_path = os.path.join(train_data_output_dir, f"{data['id']}.pkl")
        save_dict_as_pickle(data , save_path)

    except Exception as e :
        print(e)

In [14]:
start_index = len(os.listdir(dev_data_output_dir))


for data in tqdm(dev_data[start_index:]) :

    try :

        text = data['premise'] + data['hypothesis']
        doc = nlp(text)
        coref = nlp_coref(text)

        sent_tokenized_text = [sent.text for sent in doc.sents]
        if len(sent_tokenized_text) > max_num_sentences :
            print(f'Encountered sent with >{max_num_sentences} sentences')
            sent_tokenized_text = random.sample(sent_tokenized_text,
                                                k=max_num_sentences)
            doc = nlp(' '.join(sent_tokenized_text))

        adjacency_matrix = get_adj_matrix(doc , coref , max_number_sentences=10)
        repr = get_sentence_lvl_repr(doc, max_num_sentences=10 , model_dim=768)

        data['adj_matrix'] = adjacency_matrix
        data['repr'] = repr

        save_path = os.path.join(dev_data_output_dir, f"{data['id']}.pkl")
        save_dict_as_pickle(data , save_path)

    except Exception as e :
        print(e)


  0%|          | 0/10000 [00:00<?, ?it/s]

Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
index 10 is out of bounds for axis 1 with size 10
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences


Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 sentences
Encountered sent with >10 se

In [None]:
# class GCN(torch.nn.Module):
#     def __init__(self, hidden_channels):
#         super().__init__()
#         torch.manual_seed(1234567)
#         self.conv1 = GCNConv(768, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, 1)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index)
#         x = x.relu()
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index)
#         return x

In [None]:
# gcn = GCN(256)

True

In [None]:
d = nlp("Hello world. How are you doing today")

In [None]:
t = [a.text for a in d.sents]

In [None]:
t

['Hello world.', 'How are you doing today']

In [None]:
type(nlp(' '.join(t)))

spacy.tokens.doc.Doc

In [15]:
len(os.listdir(dev_data_output_dir))

9997

In [16]:
len(os.listdir(train_data_output_dir))

49867