In [1]:
from pathlib import Path
from torch.utils.data import DataLoader
import torch
import torch.optim as optim
import pandas as pd
import json
import tqdm
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset, DatasetDict
import evaluate
import os
from pprint import pprint

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
NUM_EPOCHS = 50
EXPERIMENT_NAME = "t5-small_falcon2-1eg0r"
EXPERIMENT_DIR = Path('experiments')
MODEL_ARTIFACTS = EXPERIMENT_DIR / EXPERIMENT_NAME
WEIGHTS_DIR = MODEL_ARTIFACTS / 'weights'
VALS_DIR = MODEL_ARTIFACTS / 'validations'
LINKS_PATH = 'falcon_links/1ents-gold_0rels/link_28246.json'

Make appropriate directoreis

In [2]:
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
VALS_DIR.mkdir(parents=True, exist_ok=True)

Defining the model and tokenizer

In [3]:
model_path = "t5-small"
tokenizer_path = "t5-small"

model = T5ForConditionalGeneration.from_pretrained(model_path, device_map ='auto')
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)

In [4]:
from pprint import pprint
pprint(model.hf_device_map)

{'': 0}


Define dataset maker

In [5]:
def split_dataframe(df):
  # ratios from Bannerjee
  train = 0.7
  dev = 0.1
  test = 0.2
  assert train + dev + test == 1.0
  data_len = len(df)
  train_set = Dataset.from_pandas(df[:round(data_len * train)])
  dev_set = Dataset.from_pandas(df[round(data_len * train):round(data_len* (train + dev))])
  test_set = Dataset.from_pandas(df[round(data_len * (train + dev)):])
  
  dataset = DatasetDict()
  dataset['train'] = train_set
  dataset['dev'] = dev_set
  dataset['test'] = test_set

  return dataset

Define dataset tokenizer

In [6]:
def tokenize_data(dataset, column):
  model_inputs = tokenizer(dataset[column], padding=True, truncation=True, return_tensors="pt")
  return model_inputs

Define unmasker

In [7]:
from pipeline import T5Converter
converter = T5Converter()

Defining the validation function

In [8]:
def val(val_dataloader, val_path = None):
  model.eval()
  eval_dict = []

  iters = len(val_dataloader)

  # progress_bar = tqdm.tqdm(iters, bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}")
  # progress_bar.set_description(f"Eval")

  correct_preds = 0
  total_preds = 0

  for val_batch in val_dataloader:
    batch = {}
    for k,v in val_batch.items():
      if k in {"input_ids", "labels", "attention_mask"}:
        batch[k] = v.to("cuda")

    with torch.no_grad():
      outputs = model(**batch)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    for i, pred in enumerate(tokenizer.batch_decode(predictions)):
      gold = val_batch['gold'][i]
      gold = gold.strip().replace(" ","")
      gold2 = gold.replace(">", "> ").replace("<"," <").replace("  ", " ").strip()
      pred = pred.replace(" ","").replace("</s>", "").replace("<pad>","").replace('<unk>','').replace('<s>','').strip().replace(" ","")
      pred2 = pred.replace(">", "> ").replace("<"," <").replace("  ", " ").strip()
      entry_dict = {
        "Utte": val_batch['utterance'][i],
        "Anno": val_batch['annotated'][i],
        "Gold": val_batch['gold'][i],
        "Gene": pred, # THIS NEEDS TO BE UNMASKED
        "Gol2": converter._unmask_generic(gold2),
        "Gen2": converter._unmask_generic(pred2),
      }
      eval_dict.append(entry_dict)
      total_preds += 1
      if entry_dict['Gol2'] == entry_dict['Gen2']:
        correct_preds += 1
    # progress_bar.update(1)
  
  if val_path:
    with open(val_path, "w") as f:
      json.dump(eval_dict, f, indent=2)

  accuracy = correct_preds/total_preds

  meta = {
    'accuracy': f"{accuracy:.5f}"
  }
  
  model.train()
  return eval_dict, meta

In [9]:
def training_loop(df):
  print("beginning training")

  assert 'utterance' in df.columns
  assert 'annotated' in df.columns
  assert 'gold' in df.columns

  dataset = split_dataframe(df)
  tokenized_dataset = dataset \
    .map(lambda x: tokenize_data(x, 'gold'), batched=True) \
    .rename_column('input_ids', 'labels') \
    .map(lambda x: tokenize_data(x, 'annotated'), batched=True)

  tokenized_dataset.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)
  print("data loaded")
  
  train_dataset = tokenized_dataset["train"]
  dev_dataset = tokenized_dataset["dev"]
  test_dataset = tokenized_dataset["test"]

  train_dataloader = DataLoader(train_dataset, batch_size = 10)
  dev_dataloader = DataLoader(dev_dataset, batch_size = 10)

  scalar = 0

  optimizer = optim.AdamW(model.parameters(), lr = 0.0015)
  lr_scheduler=transformers. \
    get_polynomial_decay_schedule_with_warmup(optimizer, 5000, 30000, power=0.5)
  
  epoch_data = {}

  for epoch in range(NUM_EPOCHS):
    print("\nBeginning Epoch:", epoch)
    i = 0
    iters = len(train_dataloader)
    for batch in train_dataloader:
      newbatch = {}
      for k,v in batch.items():
        if k in ["labels", "input_ids", "attention_mask"]:
          newbatch[k] = v.to("cuda")
      
      batch = newbatch
      newbatch = {}

      outputs = model(**batch)
      loss = outputs.loss
      scalar += loss.mean().item()

      if (i+1) % 100 == 0:
        print(f'iteration = {i+1}/{iters}, training loss={scalar/100}')
        scalar = 0

      loss /= 10 
      loss.mean().backward()
      if (i+1) % 1 == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
      
      del batch
      i += 1
    
    print(f"Validating epoch {epoch}")
    val_filename = f"val_{epoch}.json"
    _, meta = val(dev_dataloader, VALS_DIR / val_filename)
    pprint(meta)
    assert val_filename not in epoch_data
    epoch_data[val_filename] = meta

    with open(MODEL_ARTIFACTS / "meta_data.json", "w") as f:
      json.dump(epoch_data, f, indent=2)

    torch.save(model.state_dict(),
      WEIGHTS_DIR / f"cp_{epoch}.pth")
  print(f"\nValidating final")
  val_filename = f"val_final.json"

  _, meta = val(dev_dataloader, VALS_DIR / val_filename)

  epoch_data[val_filename] = meta

  pprint(meta)
  with open(MODEL_ARTIFACTS / "meta_data.json", "w") as f:
    json.dump(epoch_data, f, indent=2)

  torch.save(model.state_dict(),
    WEIGHTS_DIR / f"cp_final.pth")

Main

In [10]:
df_json = []
with open(LINKS_PATH) as f:
  data_json = json.load(f)

print(data_json[0])


[{'utterance': 'What periodical literature does Delta Air Lines use as a moutpiece?', 'ents': [{'uri': 'http://www.wikidata.org/entity/Q1002697', 'prefix': 'wd:', 'id': 'Q1002697'}, {'uri': 'http://www.wikidata.org/entity/Q188920', 'prefix': 'wd:', 'id': 'Q188920'}, {'uri': 'http://www.wikidata.org/entity/Q416938', 'prefix': 'wd:', 'id': 'Q416938'}], 'rels': []}, {'utterance': 'What periodical literature does Delta Air Lines use as a moutpiece?', 'fragments': ['[DEF]', 'wd:', 'Q1002697 periodical literature', '[DEF]', 'wd:', 'Q188920 Delta', '[DEF]', 'wd:', 'Q416938 Mouthpiece']}, {'inputs': 'What periodical literature does Delta Air Lines use as a moutpiece? <extra_id_59> <extra_id_53> Q1002697 periodical literature <extra_id_59> <extra_id_53> Q188920 Delta <extra_id_59> <extra_id_53> Q416938 Mouthpiece', 'labels': '<extra_id_6> <extra_id_21> <extra_id_39> <extra_id_19> <extra_id_33> <extra_id_53> q188920 <extra_id_54> p2813 <extra_id_39> <extra_id_38> <extra_id_39> <extra_id_54> p31 

In [11]:
# df_json = []
# with open('weekend.json') as f:
#   data_json = json.load(f)

for data in data_json:
  data_dict = {
    "utterance": data[0]["utterance"],
    "annotated": data[2]["inputs"],
    "gold": data[2]["labels"]
  }
  df_json.append(data_dict)

In [12]:
df = pd.DataFrame.from_dict(df_json)
df.head()

Unnamed: 0,utterance,annotated,gold
0,What periodical literature does Delta Air Line...,What periodical literature does Delta Air Line...,<extra_id_6> <extra_id_21> <extra_id_39> <extr...
1,Who is the child of Ranavalona Is husband?,Who is the child of Ranavalona Is husband? <ex...,<extra_id_6> <extra_id_39> <extra_id_19> <extr...
2,Is it true Jeff_Bridges occupation Lane Chandl...,Is it true Jeff_Bridges occupation Lane Chandl...,<extra_id_4> <extra_id_19> <extra_id_33> <extr...
3,What is the pre-requisite of phase matter of G...,What is the pre-requisite of phase matter of G...,<extra_id_6> <extra_id_39> <extra_id_19> <extr...
4,Which is the operating income for Qantas?,Which is the operating income for Qantas? <ext...,<extra_id_6> <extra_id_21> <extra_id_39> <extr...


In [13]:
training_loop(df)

beginning training


Map:   0%|          | 0/19771 [00:00<?, ? examples/s]

Map:   0%|          | 0/2824 [00:00<?, ? examples/s]

Map:   0%|          | 0/5649 [00:00<?, ? examples/s]

Map:   0%|          | 0/19771 [00:00<?, ? examples/s]

Map:   0%|          | 0/2824 [00:00<?, ? examples/s]

Map:   0%|          | 0/5649 [00:00<?, ? examples/s]

data loaded

Beginning Epoch: 0


iteration = 100/1978, training loss=11.375450992584229


iteration = 200/1978, training loss=4.416922595500946


iteration = 300/1978, training loss=2.117065905332565


iteration = 400/1978, training loss=1.2882678866386414


iteration = 500/1978, training loss=1.086832088828087


iteration = 600/1978, training loss=0.9539824879169464


iteration = 700/1978, training loss=0.8412085312604904


iteration = 800/1978, training loss=0.7387849116325378


iteration = 900/1978, training loss=0.650204561650753


iteration = 1000/1978, training loss=0.5742827352881431


iteration = 1100/1978, training loss=0.5264252337813378


iteration = 1200/1978, training loss=0.47143656194210054


iteration = 1300/1978, training loss=0.4158672150969505


iteration = 1400/1978, training loss=0.4059744003415108


iteration = 1500/1978, training loss=0.34937031492590903


iteration = 1600/1978, training loss=0.32352416336536405


iteration = 1700/1978, training loss=0.29328452825546264


iteration = 1800/1978, training loss=0.2714674296975136


iteration = 1900/1978, training loss=0.11165088333189488


Validating epoch 0
