## Dependency Parsing as a Preprocessing Step for Logical Reasoning

In [1]:
# Installs HBOX for Jupyer Notebooks
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !jupyter nbextension enable --py widgetsnbextension
# drive_path = ''

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
drive_path = '/content/drive/MyDrive/FinalProject/'
!pip install transformers

Mounted at /content/drive
Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 21.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 465 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, 

In [2]:
import os
import gc
import re
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import RobertaTokenizer
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaConfig, RobertaForSequenceClassification, AdamW

# Enable CUDA Blocking Debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
CUDA_LAUNCH_BLOCKING="1"

# Print GPU Information
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
else:
    print(gpu_info)

Wed Dec 15 16:22:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### Load Data

In [3]:
def load_data(path, test):
    df = pd.DataFrame({'prompt': [], 'label': []})
    data = json.load(open(path))
    for i in range(len(data['context'])):
        # Add BERT tokens to prompt
        prompt = data['context'][str(i)] + ' </s> ' + data['question'][str(i)] + ' </s> '# + data['dep_context'][str(i)] + ' </s> '
        for j in range(4):
            # Add BERT tokens to answer
            answer = data['answers'][str(i)][j]# + ' <d> ' + data['dep_answers'][str(i)][j] + ' <s>'
            
            # Attach 0, 1 label as array
            label = [1] if not test and j == data['label'][str(i)] else [0]
            
            # Append question, answer pair to dataframe
            df = df.append({'prompt': prompt + answer, 'label': [label]}, ignore_index=True)            
    return df

In [6]:
train_data = load_data(drive_path + 'reclor_data_with_dependencies/train.json', False)
val_data = load_data(drive_path + 'reclor_data_with_dependencies/val.json', False)
test_data = load_data(drive_path + 'reclor_data_with_dependencies/test.json', True)

In [7]:
# Describe the token lengths of training data
train_data['prompt'].apply(lambda x: len(re.findall(r'\w+', x))).describe()

count    18552.000000
mean       103.285630
std         23.085243
min         36.000000
25%         88.000000
50%        102.000000
75%        116.000000
max        230.000000
Name: prompt, dtype: float64

#### Initialize tokenizer

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.add_special_tokens({'additional_special_tokens': ['<d>']})

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

1

#### Tokenize the data

In [9]:
# Creates a dataloader (which includes an attention mask)
def preprocess(in_, tokenizer, max_len, batch_size, data_class='train'):
    encoded_input = tokenizer(in_['prompt'].values.tolist(), padding=True, max_length=max_len, truncation=True, return_tensors="pt")
    
    if data_class != 'test':
        labels = torch.tensor(in_['label'].values.tolist())
    dataset_tensor = TensorDataset(encoded_input['input_ids'], encoded_input['attention_mask'], labels)
    sampler = SequentialSampler(dataset_tensor)
    #sampler = RandomSampler(dataset_tensor) if data_class == "train" else SequentialSampler(dataset_tensor)
    dataloader = DataLoader(dataset_tensor, sampler=sampler, batch_size=batch_size)
    return dataloader

In [10]:
max_len = 512 # should be 1024
batch_size = 4

train_dataloader = preprocess(train_data, tokenizer, max_len, batch_size)
val_dataloader = preprocess(val_data, tokenizer, max_len, batch_size, data_class="val")
# test_dataloader = preprocess(test_data, tokenizer, max_len, batch_size, data_class="test")

In [None]:
# fi = iter(val_dataloader)
# for i in range(100):
#   l = next(fi)
#   #print(l)
#   #print(l.numpy().tolist()[0])
#   k = tokenizer.decode(l.numpy().tolist()[0])
#   print(k)
#   j = tokenizer.decode(l.numpy().tolist()[1])
#   print(j)
#   print(l)
# print(next(iter(val_dataloader)))
# print(train_data)

# fi = iter(val_dataloader)
# for i in range(10):
#   l = next(fi)
#   print(l)
#   print(tokenizer.decode(l[0]))
#   print(tokenizer.decode(l[1]))


for step, batch in enumerate(val_dataloader):
  labels = batch[0]
  print(tokenizer.decode(labels.numpy().tolist()[0]))
  print(tokenizer.decode(labels.numpy().tolist()[1]))

#### Train and Evaluate RoBERTa

In [11]:
def ClearTorch():
    torch.no_grad()
    torch.cuda.empty_cache()
    gc.collect()
    
def Eval(model, dataloader):
    ClearTorch()
    model.eval()
    predictions, true_labels = [], []
    for step, batch in enumerate(tqdm(dataloader)):
        # Call model on batch
        input_ids, attention_mask, labels = batch[0].cuda(), batch[1].cuda(), batch[2].cuda()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Convert output logit to predictions using softmax
        #print(labels)

        predictions.append(torch.nn.functional.softmax(outputs.logits).argmax(0)[1].cpu().numpy().tolist())
        true_labels.append(labels.argmax(0).cpu().numpy().tolist()[0][0])
      
        ClearTorch()

    return float(sum([predictions[i] == true_labels[i] for i in range(len(predictions)) ])) / float(len(predictions))

def Train(model, train_data, lr, n_epoch):
    ClearTorch()
    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(n_epoch):
        print(f"Epoch {epoch}")
        model.train()
        nb_tr_examples, nb_tr_steps, tr_loss = 0, 0, 0

        for step, batch in enumerate(tqdm(train_data)):
            # RoBERTa fine-tuning
            input_ids, attention_mask, labels = batch[0].cuda(), batch[1].cuda(), batch[2].cuda()
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()
            
            ClearTorch()
            
            tr_loss += float(outputs.loss)
            nb_tr_steps += 1
            
        print(f"Train loss on epoch {epoch}: {tr_loss / nb_tr_steps}\n")

In [12]:
ClearTorch()

config = RobertaConfig.from_pretrained('roberta-base')
# config.max_position_embeddings = max_len
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    model.to(device)
    model = torch.nn.DataParallel(model)
else:
    model.cuda()
    
learning_rate = 2e-4
num_epoch = 1

Train(model, train_dataloader, learning_rate, num_epoch)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch 0


100%|██████████| 4638/4638 [1:07:25<00:00,  1.15it/s]


Train loss on epoch 0: 0.5659007900629438

Epoch 1


100%|██████████| 4638/4638 [1:07:00<00:00,  1.15it/s]


Train loss on epoch 1: 0.5647305440303836

Epoch 2


100%|██████████| 4638/4638 [1:06:33<00:00,  1.16it/s]

Train loss on epoch 2: 0.5647531466630061






In [13]:
val_dataloader = preprocess(val_data, tokenizer, max_len, batch_size)#, data_class="val")
print(f"Accuracy: {Eval(model, val_dataloader)}")

100%|██████████| 500/500 [03:07<00:00,  2.66it/s]

Accuracy: 0.314



