In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset directory used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the project folder on Drive; every path
# declared later in this notebook is relative and resolves against it.
import os
os.chdir("/content/drive/MyDrive/Data/docnli")
# Echo the working directory as the cell output to confirm the switch.
os.getcwd()

'/content/drive/MyDrive/Data/docnli'

In [3]:
# Fetch the NLTK 'punkt' package (presumably the sentence-splitting data
# required by `sent_tokenize`, which is used further down -- confirm).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import pickle
import json
import random
import numpy as np
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm
from transformers import BertModel, AutoTokenizer

In [5]:
'''
Variable Declaration
'''

# Input files (relative to the working directory set earlier).
train_data_path = 'data/docnli/train_5sent_50ksample.json'
dev_data_path = 'data/docnli/dev_5sent_10ksample.json'

# Output locations. The per-split directories are derived from `save_dir`
# so that changing the root cannot leave them pointing at a stale location.
save_dir = 'data/bert'
train_data_output_dir = f'{save_dir}/train'
dev_data_output_dir = f'{save_dir}/dev'

# NOTE(review): `batch_size` is not referenced by any cell visible in this
# notebook; kept in case other code depends on it.
batch_size = 2

# Hugging Face identifiers for the encoder weights and its tokenizer.
model_name = 'bert-base-cased'
tokenizer_name = 'bert-base-cased'

In [6]:
'''
Folder Creation
'''

# Create the output directories. `os.makedirs(..., exist_ok=True)` replaces
# the exists-check + `os.mkdir` pair: it is a no-op when the directory is
# already there, creates missing parents, and has no check-then-create race.
for output_dir in (save_dir, train_data_output_dir, dev_data_output_dir) :
    os.makedirs(output_dir, exist_ok=True)

In [9]:
'''
Load Models
'''

# Tokenizer and encoder are loaded independently of each other; the encoder
# is moved onto the GPU in the same expression that loads it.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = BertModel.from_pretrained(model_name).to("cuda")

In [6]:
'''
Function Definition
'''

def save_dict_as_pickle(d , save_path) :
    """Serialize `d` with pickle and write it to `save_path`.

    The file is opened in binary write mode, so an existing file at
    `save_path` is overwritten.
    """
    with open(save_path, 'wb') as out_file :
        pickle.Pickler(out_file).dump(d)

def get_cls(text) :
    """Encode `text` with the notebook-level `model` and return the hidden
    state at token position 0 (the [CLS] slot) for each input.

    Args:
        text: a string or a list of strings accepted by the notebook-level
            `tokenizer`.

    Returns:
        A NumPy array with one row per encoded input, taken from
        `last_hidden_state[:, 0, :]`.
    """
    # Function-scope import: torch is not imported at the top of this notebook.
    import torch

    # Follow the model's own device instead of hard-coding "cuda", so the
    # inputs always land wherever the model was placed.
    device = model.device

    # NOTE(review): padding='max_length' pads every input to the tokenizer's
    # maximum length; padding=True would be cheaper but is left unchanged here
    # to keep outputs bit-compatible with features that were already saved.
    tokens = tokenizer(text, padding='max_length' , truncation=True, return_tensors='pt')

    input_ids = tokens.input_ids.to(device)
    attention_mask = tokens.attention_mask.to(device)

    # Pure inference: disable autograd so no graph / activation buffers are
    # retained for a backward pass that never happens.
    with torch.no_grad() :
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    cls = outputs.last_hidden_state[:,0,:]
    return cls.cpu().numpy()

def get_sentence_lvl_repr(text, max_num_sentences=5 , model_dim=768) :
    """Build a fixed-size matrix of per-sentence [CLS] vectors for `text`.

    The text is split into sentences with `sent_tokenize`; each sentence is
    encoded via `get_cls` and written to consecutive rows of the result.
    Rows beyond the number of sentences stay zero.

    Args:
        text: the document to encode.
        max_num_sentences: number of rows in the result. When the text has
            more sentences, a random subset of this size is kept, in the
            order the sentences appear in the document.
        model_dim: width of each row; must match the width of the vectors
            returned by `get_cls`.

    Returns:
        A NumPy array of shape (max_num_sentences, model_dim).
    """
    sentence_lvl_repr = np.zeros((max_num_sentences, model_dim))

    sent_tokenized_text = sent_tokenize(text)

    # Nothing to encode: return the all-zero matrix instead of handing an
    # empty batch to the tokenizer/model.
    if not sent_tokenized_text :
        return sentence_lvl_repr

    if len(sent_tokenized_text) > max_num_sentences :
        print(f'Encountered sent with >{max_num_sentences} sentences')
        # Sample positions rather than sentences, then sort them, so the kept
        # sentences retain their original document order (random.sample on
        # the list itself returns them in selection order, i.e. shuffled).
        kept_positions = sorted(
            random.sample(range(len(sent_tokenized_text)), k=max_num_sentences)
        )
        sent_tokenized_text = [sent_tokenized_text[pos] for pos in kept_positions]

    cls = get_cls(sent_tokenized_text)
    sentence_lvl_repr[:cls.shape[0]] = cls

    return sentence_lvl_repr

In [7]:
'''
Read data
'''

# Load both splits. The context managers close the file handles right after
# parsing (the bare `json.load(open(...))` form leaves them open), and the
# encoding is pinned so the result does not depend on the locale default.
with open(train_data_path, encoding='utf-8') as train_file :
    train_data = json.load(train_file)

with open(dev_data_path, encoding='utf-8') as dev_file :
    dev_data = json.load(dev_file)

In [None]:
# Resume support: skip every example whose pickle already exists. Matching on
# the output file name (rather than slicing the list by the number of files
# in the directory) stays correct even if files were not written strictly in
# list order or the directory contains unrelated entries.
already_saved = set(os.listdir(train_data_output_dir))
pending_examples = [
    example for example in train_data
    if f"{example['id']}.pkl" not in already_saved
]

for data in tqdm(pending_examples) :

    # Work on a shallow copy so the (large) arrays are not attached to the
    # dicts held in `train_data`; otherwise every representation computed in
    # this loop would stay in memory until the kernel is restarted.
    record = dict(data)
    record['premise_repr'] = get_sentence_lvl_repr(data['premise'])
    record['hypothesis_repr'] = get_sentence_lvl_repr(data['hypothesis'])

    save_path = os.path.join(train_data_output_dir, f"{data['id']}.pkl")
    save_dict_as_pickle(record , save_path)

In [10]:
# Resume support: skip every example whose pickle already exists. Matching on
# the output file name (rather than slicing the list by the number of files
# in the directory) stays correct even if files were not written strictly in
# list order or the directory contains unrelated entries.
already_saved = set(os.listdir(dev_data_output_dir))
pending_examples = [
    example for example in dev_data
    if f"{example['id']}.pkl" not in already_saved
]

for data in tqdm(pending_examples) :

    # Work on a shallow copy so the (large) arrays are not attached to the
    # dicts held in `dev_data`; otherwise every representation computed in
    # this loop would stay in memory until the kernel is restarted.
    record = dict(data)
    record['premise_repr'] = get_sentence_lvl_repr(data['premise'])
    record['hypothesis_repr'] = get_sentence_lvl_repr(data['hypothesis'])

    save_path = os.path.join(dev_data_output_dir, f"{data['id']}.pkl")
    save_dict_as_pickle(record , save_path)

  0%|          | 0/1686 [00:00<?, ?it/s]

Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences
Encountered sent with >5 sentences


In [11]:
# Sanity check: number of entries in the train output directory (the loop
# above writes one .pkl per example).
len(os.listdir(train_data_output_dir))

50000

In [12]:
# Sanity check: number of entries in the dev output directory (the loop
# above writes one .pkl per example).
len(os.listdir(dev_data_output_dir))

10000