In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
%pwd
%cd /content/gdrive/'My Drive'/eswc-nmvls2023
%ls -la

In [None]:
# Install library using pip
!pip install --quiet transformers==4.1.1
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94
!pip install --quiet tqdm

In [None]:
# Check we have a GPU and check the memory size of the GUP
!nvidia-smi

Fri Dec 16 08:27:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# import libraries
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger("new_tb_logs", name="new_model")

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    T5Config
)
pl.seed_everything(42)

INFO:lightning_lite.utilities.seed:Global seed set to 42


42

# Datasets preparation

# LIMES Silver Dataset

In [None]:
limes_silver_train = pd.read_table("datasets/limes-silver/train.txt", sep='\t')
limes_silver_dev = pd.read_table("datasets/limes-silver/dev.txt", sep='\t')

In [None]:
train = limes_silver_train
dev = limes_silver_dev

# LIMES Annotated Dataset

In [None]:
limes_annotated_train = pd.read_table("datasets/limes-annotated/train.txt", sep='\t')
limes_annotated_dev = pd.read_table("datasets/limes-annotated/dev.txt", sep='\t')

# Combine LIMES Silver and Annotated

In [None]:
train_frame = [limes_silver_train, limes_annotated_train]
train = pd.concat(train_frame)
dev_frame = [limes_silver_dev, limes_annotated_dev]
dev = pd.concat(dev_frame)

# LIMES manipulated dataset

In [None]:
limes_manipulated_train = pd.read_table("datasets/limes-manipulated/train.txt", sep='\t')
limes_manipulated_dev = pd.read_table("datasets/limes-manipulated/dev.txt", sep='\t')

# Combine LIMES Silver, Annotated and Manipulated

In [None]:
train_frame = [limes_silver_train, limes_annotated_train, limes_manipulated_train]
train = pd.concat(train_frame)
dev_frame = [limes_silver_dev, limes_annotated_dev, limes_manipulated_dev]
dev = pd.concat(dev_frame)

# SILK Human Annotated dataset

In [None]:
silk_train = pd.read_table("datasets/silk-human-annotated/train.txt", sep='\t')
silk_dev = pd.read_table("datasets/silk-human-annotated/dev.txt", sep='\t')

# Combine LIMES Silver, Annotated, Manipulated and SILK

In [None]:
train_frame = [limes_silver_train, limes_annotated_train, limes_manipulated_train, silk_train]
train = pd.concat(train_frame)
dev_frame = [limes_silver_dev, limes_annotated_dev, limes_manipulated_dev, silk_dev]
dev = pd.concat(dev_frame)

# Dataset overview

In [None]:
train_datasets = list(train.itertuples(index=False, name=None))
print(train_datasets[0])
print(len(train_datasets))

('AND(OR(cosine(x.givenName,y.streetName)|0.45,AND(OR(qgrams(x.givenName,y.streetName)|0.25,jaroWinkler(x.streetName,y.streetName)|0.45)|0.45,ratcliff(x.givenName,y.streetName)|0.25)|0.45)|0.45,qgrams(x.streetName,y.streetName)|0.25)', 'a link will be generated if the givenName of the source and the streetName of the target have a Cosine similarity of 45% or a Qgrams similarity of 25% or the streetNames of the source and the target have a Jarowinkler similarity of 45% and the givenName of the source and the streetName of the target have a Ratcliff similarity of 25% and the streetNames of the source and the target have a Qgrams similarity of 25%')
10500


In [None]:
dev_datasets = list(dev.itertuples(index=False, name=None))
print(dev_datasets[0])
print(len(dev_datasets))

('AND(AND(ratcliff(x.givenName,y.givenName)|0.62,AND(AND(mongeElkan(x.givenName,y.description)|0.25,jaccard(x.givenName,y.givenName)|0.62)|0.62,ratcliff(x.givenName,y.givenName)|0.25)|0.25)|0.62,mongeElkan(x.givenName,y.givenName)|0.25)', 'a link will be generated if the givenNames of the source and the target have a Ratcliff similarity of 62% and the givenName of the source and the description of the target have a Mongeelkan similarity of 25% and the givenNames of the source and the target have a Jaccard similarity of 62% and a Ratcliff similarity and a Mongeelkan similarity of a 25%')
1485


# Model

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from tqdm.notebook import tqdm
import copy

class LSVerbalizeDataset(Dataset):
    def __init__(self, tokenizer, tf_list, max_len_inp=256,max_len_out=256):

        self.ls_datasets = tf_list

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self.skippedcount =0
        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for inputs,outputs in self.ls_datasets:
          input_sent = "verbalize ls: "+inputs
          ouput_sent = outputs

          # tokenize inputs
          tokenized_inputs = self.tokenizer.batch_encode_plus(
              [input_sent], max_length=self.max_len_input, pad_to_max_length=True, return_tensors="pt"
          )
          # tokenize targets
          tokenized_targets = self.tokenizer.batch_encode_plus(
              [ouput_sent], max_length=self.max_len_output, pad_to_max_length=True,return_tensors="pt"
          )

          self.inputs.append(tokenized_inputs)
          self.targets.append(tokenized_targets)

In [None]:
train_dataset = LSVerbalizeDataset(t5_tokenizer, train_datasets)
validation_dataset = LSVerbalizeDataset(t5_tokenizer, dev_datasets)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, argparams, t5model, t5tokenizer):
    super(T5FineTuner, self).__init__()
    self.argparams = argparams
    self.model = t5model
    self.tokenizer = t5tokenizer
  def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
        output_attentions=True
    )
    return outputs
  def training_step(self, batch, batch_idx):
    outputs = self.forward(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        decoder_input_ids = batch["target_ids"],
        decoder_attention_mask=batch['target_mask'],
        lm_labels=batch['labels']
    )
    loss = outputs[0]
    self.log('train_loss',loss)
    return loss
  def validation_step(self, batch, batch_idx):
    outputs = self.forward(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        decoder_input_ids = batch["target_ids"],
        decoder_attention_mask=batch['target_mask'],
        lm_labels=batch['labels']
    )
    loss = outputs[0]
    self.log("val_loss",loss)
    return loss
  def train_dataloader(self):
    return DataLoader(train_dataset, batch_size=self.argparams.batch_size, num_workers=4)
  def val_dataloader(self):
    return DataLoader(validation_dataset, batch_size=self.argparams.batch_size, num_workers=4)
  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=3e-5, eps=1e-8)
    return optimizer
  def save_core_model(self):
    self.model.save_pretrained("models")
    self.tokenizer.save_pretrained("models")

# Model training

In [None]:
import argparse

args_dict = dict(
    batch_size=8,
)
args = argparse.Namespace(**args_dict)

model = T5FineTuner(args, t5_model, t5_tokenizer)
model.to("cuda")
trainer = pl.Trainer(min_epochs=3, max_epochs = 5, gpus=1, logger=logger)
trainer.fit(model)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [None]:
model.save_core_model()

In [None]:
%load_ext tensorboard
%tensorboard --logdir new_tb_logs/

## Testing dataset

In [None]:
#limes annotated
test_df = pd.read_table("datasets/limes-annotated/test.txt", sep='\t')
test = test_df
len(test)

In [None]:
# limes silver
test_df = pd.read_table("datasets/limes-silver/test.txt", sep='\t')
test = test_df[:200]
len(test)

In [None]:
# limes manipulated
test_df = pd.read_table("datasets/limes-manipulated/test.txt", sep='\t')
test = test_df
len(test)

In [None]:
# SILK human annotated
test_df = pd.read_table("datasets/silk-human-annotated/test.txt", sep='\t')
test = test_df
len(test)

# Load the existing model

In [None]:
# if the model is already exists
import argparse

args_dict = dict(
    batch_size=1,
)
args = argparse.Namespace(**args_dict)
t5_model.load_state_dict(torch.load("models/model/pytorch_model.bin"))
model = T5FineTuner(args, t5_model, t5_tokenizer)
model.to("cuda")
model.model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

# Predictions

In [None]:
from tqdm import tqdm
new_model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
model.model.eval()
with open("results.txt", 'w', encoding='utf-8') as f_out:
  pass
scores = []
print('START Generating predictions')
for idx, input in tqdm(enumerate(test['0'])):
  test_sent = f'verbalize ls: {input} </s>'
  test_tokenized = t5_tokenizer.encode_plus(test_sent, return_tensors="pt")

  test_input_ids  = test_tokenized["input_ids"]
  test_attention_mask = test_tokenized["attention_mask"]

  beam_outputs = model.model.generate(
    input_ids=test_input_ids.to("cuda"),
    attention_mask=test_attention_mask.to("cuda"),
    max_length=256,
    early_stopping=True,
    num_beams=15,
    num_return_sequences=1,
    no_repeat_ngram_size=6
  )
  sent = t5_tokenizer.decode(beam_outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
  with open("results.txt", 'a', encoding='utf-8') as f_out:
    f_out.write(f"{sent}\n")
print('END Generating predictions')
print('START Generating labels')
with open("labels.txt", "w", encoding="utf-8") as f_labels:
  pass
for label in tqdm(test['1']):
  with open("labels.txt", "a", encoding="utf-8") as f_labels:
    f_labels.write(f"{label}\n")
print('END Generating labels')
print('START Generating inputs')
with open("ls.txt", "w", encoding="utf-8") as f:
  pass
for input in tqdm(test['0']):
  with open("ls.txt", "a", encoding="utf-8") as f:
    f.write(f"{input}\n")
print('END Generating inputs')

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


START Generating predictions


2it [00:07,  3.54s/it]


END Generating predictions
START Generating labels


100%|██████████| 2/2 [00:00<00:00, 562.20it/s]


END Generating labels
START Generating inputs


100%|██████████| 2/2 [00:00<00:00, 519.61it/s]

END Generating inputs



