In [55]:
!pip3 install datasets



In [56]:
import datasets
from datasets import load_dataset

In [57]:
dataset = load_dataset("tverous/anli-amr", split="train")

In [58]:
dataset[0]

{'uid': '2093cfb3-a15f-4282-81e3-0cb793ffd0d7',
 'premise': 'TOKYO, Dec 18 (Reuters) - Japan’s Shionogi & Co said on Tuesday that it has applied to health regulators in the United States, Canada and Europe for approval of its HIV drug Dolutegravir. Shionogi developed Dolutegravir with a Viiv Healthcare, an AIDS drug joint venture between GlaxoSmithKline and Pfizer, in exchange for its rights to the drug.',
 'hypothesis': 'The article was written on December 18th.',
 'label': 0,
 'reason': 'TOKYO, Dec 18 (Reuters) is when the article was written as it states in the first words of the sentence',
 'claim_cleaned_amr': '( z0 write :ARG1 ( z1 article ) :time ( z2 date-entity :day 18 :month 12 ) )',
 'amr_penman': '(z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))',
 'amr_tokens': ['The',
  'article',
  'was',
  'written',
  'on',
  'December',
  '18th',
  '.'],
 'amr_nodes': "{'z1': 'article', 'z0': 'write-01', 'z2': 'dat

In [59]:
dataset.shape

(100459, 11)

In [60]:
dataset.column_names

['uid',
 'premise',
 'hypothesis',
 'label',
 'reason',
 'claim_cleaned_amr',
 'amr_penman',
 'amr_tokens',
 'amr_nodes',
 'amr_alignments',
 'amr_edges']

In [61]:
#Extract amr graphs with text for paired training

def extract_amr_and_text(data):
  amr_text_pairs = []
  for row in data:
    amr_graph = row.get("amr_penman", None)
    text = row.get("hypothesis", None)
    if amr_graph and text:
      amr_text_pairs.append({"amr_graph": amr_graph,"text": text})
  return amr_text_pairs

In [62]:
amr_text_pairs = extract_amr_and_text(dataset)

In [63]:
amr_text_pairs[0]

{'amr_graph': '(z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))',
 'text': 'The article was written on December 18th.'}

In [64]:
amrs = []
text = []

for i in range(len(dataset)):
  amrs.append(amr_text_pairs[i]["amr_graph"])
  text.append(amr_text_pairs[i]["text"])

In [65]:
#Create dataframe with amr_graph and text columns

import pandas as pd

df = pd.DataFrame({"amr_graph": amrs, "text": text})

In [66]:
df.head()

Unnamed: 0,amr_graph,text
0,(z0 / write-01\n :ARG1 (z1 / article)\n ...,The article was written on December 18th.
1,(z0 / urge-01\n :ARG0 (z1 / person\n ...,Gillum was on TV urging residents to stay out ...
2,(z0 / and\n :op1 (z1 / beat-03\n ...,Carlton beat Melbourne in 2016 and will attemp...
3,(z0 / close-01\n :ARG1 (z1 / road)\n :du...,The road was closed for more than two hours af...
4,(z0 / advise-01\n :ARG2 (z1 / slow-down-03)),Its advisible to slow down


In [67]:
import torch
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Trainer, AutoModelForCausalLM
import re
import pandas as pd

In [68]:
#Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [69]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [70]:
small_dataset = dataset.select([i for i in range(25000)])

In [71]:
#Define prompt and answer templates
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: {instruction}\n Response:"""
answer_template = """{response}"""

In [72]:
#Define function to add keys in the dictionary for prompt, answer and combined text
def _add_text(rec):
  instruction = rec["amr_graph"]
  response = rec["text"]

  #Check if both exist; raise error if not
  if not instruction:
    raise ValueError("instruction is missing")
  if not response:
    raise ValueError("Expected a response")

  #Create prompt, answer, combined text
  rec["prompt"] = prompt_template.format(instruction=instruction)
  rec["answer"] = answer_template.format(response=response)
  rec["text"] = rec["prompt"] + rec["answer"]
  return rec

In [73]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

In [74]:
small_dataset = small_dataset.map(_add_text)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [75]:
small_dataset[1]

{'amr_graph': '(z0 / urge-01\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Gillum")\n              :medium (z3 / television))\n    :ARG1 (z4 / person\n              :ARG0-of (z5 / reside-01))\n    :ARG2 (z6 / stay-01\n              :ARG1 z4\n              :ARG3 (z7 / out-06\n                        :ARG1 z4\n                        :ARG2 (z8 / storm))))',
 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: (z0 / urge-01\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Gillum")\n              :medium (z3 / television))\n    :ARG1 (z4 / person\n              :ARG0-of (z5 / reside-01))\n    :ARG2 (z6 / stay-01\n              :ARG1 z4\n              :ARG3 (z7 / out-06\n                        :ARG1 z4\n                        :ARG2 (z8 / storm))))\n Response:Gillum was on TV urging residents to stay out of the storm.',
 'prompt': 'Below

In [76]:
tokenizer.pad_token = tokenizer.eos_token

In [77]:
MAX_LENGTH = 256

In [78]:
from typing import Dict, List
import copy
def preprocess_batch(batch: Dict[str,List]):
  model_inputs = tokenizer(batch["text"], padding='max_length', truncation=True, max_length=MAX_LENGTH)
  model_inputs["labels"] = copy.deepcopy(model_inputs["input_ids"])
  return model_inputs

In [79]:
from functools import partial
preprocessing_function = partial(preprocess_batch)

In [80]:
#Define split ratios

train_test_split = small_dataset.train_test_split(test_size=0.2) #20% as test
train_valid_split = train_test_split["train"].train_test_split(test_size=0.1) #From train, split 10% as validation

In [81]:
from datasets import DatasetDict
dataset_dict = DatasetDict({
    "train": train_valid_split["train"],
    "validation": train_valid_split["test"],
    "test": train_test_split["test"]
})

In [82]:
print(f"Train set size: {len(dataset_dict['train'])}")
print(f"Validation set size: {len(dataset_dict['validation'])}")
print(f"Test set size: {len(dataset_dict['test'])}")


Train set size: 18000
Validation set size: 2000
Test set size: 5000


In [83]:
# Example check for first item in each split
print("Sample from train:", dataset_dict['train'][0])
print("Sample from validation:", dataset_dict['validation'][0])
print("Sample from test:", dataset_dict['test'][0])

Sample from train: {'amr_graph': '(z0 / sophomore\n    :domain (z1 / girl\n                :topic-of (z2 / article))\n    :location (z3 / college))', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: (z0 / sophomore\n    :domain (z1 / girl\n                :topic-of (z2 / article))\n    :location (z3 / college))\n Response:The girl in the article is a sophomore in college ', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: (z0 / sophomore\n    :domain (z1 / girl\n                :topic-of (z2 / article))\n    :location (z3 / college))\n Response:', 'answer': 'The girl in the article is a sophomore in college '}
Sample from validation: {'amr_graph': '(z0 / keep-02\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Tom"))\n    :ARG1 (z3 / talk-01\n              :ARG0 z1\n              :ARG2 (z4 /

In [84]:
# Apply the preprocessing function to each batch in the dataset
encoded_train_dataset = dataset_dict['train'].map(
    preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)

encoded_validation_dataset = dataset_dict['validation'].map(
    preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)

encoded_test_dataset = dataset_dict['test'].map(
    preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)
processed_train_dataset = encoded_train_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
processed_validation_dataset = encoded_validation_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
processed_test_dataset = encoded_test_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [85]:
processed_test_dataset.shape

(5000, 3)

In [86]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='/mnt/disks/disk1/results',  ## give the directory name where you want to save the model
    evaluation_strategy='epoch',
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # Accumulate gradients for 4 steps
    warmup_steps=50,
    learning_rate=5e-5,        # Lowered learning rate
    weight_decay=0.1,          # Reduced weight decay to prevent over-penalizing weights
    logging_dir='/mnt/disks/disk1/logs' ## give the directory name where you want to save the model
)



In [87]:
from transformers import DataCollatorForLanguageModeling
# Initialize the data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [88]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_validation_dataset,
    data_collator=data_collator
)

In [89]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.3808,0.325185
2,1.1913,0.305524
3,1.074,0.300351


TrainOutput(global_step=13500, training_loss=1.3143926595052082, metrics={'train_runtime': 1869.0951, 'train_samples_per_second': 28.891, 'train_steps_per_second': 7.223, 'total_flos': 7054884864000000.0, 'train_loss': 1.3143926595052082, 'epoch': 3.0})

In [90]:
# Save the model and tokenizer explicitly
model_output_dir = '/mnt/disks/disk1/results'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

('/mnt/disks/disk1/results/tokenizer_config.json',
 '/mnt/disks/disk1/results/special_tokens_map.json',
 '/mnt/disks/disk1/results/vocab.json',
 '/mnt/disks/disk1/results/merges.txt',
 '/mnt/disks/disk1/results/added_tokens.json',
 '/mnt/disks/disk1/results/tokenizer.json')

In [91]:
def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

In [92]:
import re

def main(input_text):
    model_path = '/mnt/disks/disk1/results'
    model = AutoModelForCausalLM.from_pretrained(model_path)

    #Calculate the number of parameters in the model being used for inference
    num_params = get_model_parameters(model)
    #print(f"Number of parameters in the model: {num_params}")

    #Prepare input text for generation

    inputs = tokenizer(input_text, return_tensors="pt")

    #Generate text
    outputs = model.generate(**inputs, max_length = 500, num_return_sequences=1)

    #Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    #Extract only the response part by splitting based on "Response:"

    match = re.search(r"Response:\s*(.*)", generated_text)
    if match:
        response_text = match.group(1)

        #Remove extra spaces between sentences
        response_text = re.sub(r'\s{2,}', ' ', response_text)

        #Keep only up to the first sensible sentence-ending puntuation

        response_text = re.split(r'[.!?]', response_text)[0].strip() + '.'
        print("Response text:", response_text)
        return response_text
    else:
        return "No response found in the generated text."



In [93]:
# Example input for inference
example_input = """
(z0 / easy-05
    :ARG1 (z1 / scare-01
              :ARG1 (z2 / person
                        :ARG0-of (z3 / have-rel-role-91
                                     :ARG1 (z4 / i)
                                     :ARG2 (z5 / uncle))))
    :mod (z6 / certain))
"""
output = main(example_input)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: My uncle was certain I was scared.


In [94]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(predicted_text, ground_truth_text):

  #Tokenize the texts into lists of words

  reference = [ground_truth_text.split()]
  hypothesis = predicted_text.split()
  if not hypothesis:
    return 0.0

  #Calc BLEU score with smoothing

  smoothie = SmoothingFunction().method4 #For short texts
  bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=smoothie)

  return bleu_score


bleu_score = 0
valid_count = 0
k = 10

#Loop through dataset

for i in range(k):
  example_input = dataset_dict["test"][i]["amr_graph"]
  ground_truth_text = dataset_dict["test"][i]["answer"]

  #Tokenize and check input length
  tokenized_input = tokenizer(example_input, return_tensors='pt')

  input_length = tokenized_input['input_ids'].shape[1]


  if input_length >500:
    continue


  #Generate model output and calculate BLEU score
  model_output_text = main(example_input)
  bleu = calculate_bleu(model_output_text, ground_truth_text)

  # Only add BLEU score if it’s valid (greater than zero)
  if bleu > 0:
      bleu_score += bleu
      valid_count += 1  # Increment count of valid scores

# Calculate the average BLEU score only if there are valid scores
if valid_count > 0:
    avg_bleu_score = bleu_score / valid_count
else:
    avg_bleu_score = 0.0  # Set average to zero if no valid scores were found

print("Average BLEU score:", avg_bleu_score)





Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: This person said he was subpoenaing reporters for testimony :ARG1 (z12 / person.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: Morton Halperin's home was tapped.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: Pelopias became the same mother and sister on the same day.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: Brown fields are rising.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: Maryland gallon is published.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: SPEED X is not an old rock band.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: The 2017 Youngstown State Penguins fans have a fan named george.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: The media wait was left on the media.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response text: The volcano did not erupt before.
Response text: preparing for a girl's protein powder requires making a girl's protein powder.
Average BLEU score: 0.2820280020739547
