In [None]:
!pip install sacrebleu rouge-score

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: rouge-scor

In [None]:
# Single root folder on the mounted Google Drive; change this one value to relocate the project.
DRIVE_ROOT = "/content/drive/MyDrive/cpsc532"

# Input files. The "*_dir" names are kept because later cells refer to them,
# even though each of these four values is a file path.
male_names_dir = f"{DRIVE_ROOT}/male_names.csv"
female_names_dir = f"{DRIVE_ROOT}/female_names.csv"
unisex_names_dir = f"{DRIVE_ROOT}/unisex_names.csv"
events_dir = f"{DRIVE_ROOT}/events_extracted_processed.txt"

# Folder holding the pretrained checkpoint that is passed to Comet(...).
model_dir = f"{DRIVE_ROOT}/comet-atomic_2020_BART"

# Folder that receives the generated inference CSV files.
output_dir = f"{DRIVE_ROOT}/output"

# Script from COMET-ATOMIC

Code from https://github.com/allenai/comet-atomic-2020/blob/master/models/comet_atomic2020_bart/generation_example.py


**You need to place `utils.py` (https://github.com/allenai/comet-atomic-2020/blob/master/models/comet_atomic2020_bart/utils.py) in the notebook's working directory so that the `from utils import ...` line in the next cell can find it.**

In [None]:
# code from https://github.com/allenai/comet-atomic-2020/blob/master/models/comet_atomic2020_bart/generation_example.py
# put the utils.py (https://github.com/allenai/comet-atomic-2020/blob/master/models/comet_atomic2020_bart/utils.py) in the same directory

import json
import torch
import argparse
from tqdm import tqdm
from pathlib import Path
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch


def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; every slice has ``n`` items except
    possibly the last one, which holds whatever remains.
    """
    yield from (lst[offset : offset + n] for offset in range(0, len(lst), n))


class Comet:
    """Wrapper around a pretrained seq2seq model and its tokenizer.

    Both are loaded with ``from_pretrained(model_path)``; ``generate`` then
    turns text queries into decoded model outputs.
    """

    def __init__(self, model_path):
        """Load the model and tokenizer stored at ``model_path``.

        The model is moved to ``"cuda"`` when ``torch.cuda.is_available()``
        and stays on ``"cpu"`` otherwise.
        """
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Helper imported from utils.py; presumably applies the checkpoint's
        # "summarization" generation settings to the model -- confirm in utils.py.
        use_task_specific_params(self.model, "summarization")
        # Queries are processed this many at a time in generate().
        self.batch_size = 1
        # Passed straight through to model.generate(); None is what the original script uses.
        self.decoder_start_token_id = None

    def generate(self, queries, decode_method="beam", num_generate=5):
        """Generate decoded outputs for ``queries``.

        Args:
            queries: Sliceable sequence of input strings; it is split into
                batches of ``self.batch_size``.
            decode_method: Accepted for interface compatibility; the body
                does not read it.
            num_generate: Used both as ``num_beams`` and as
                ``num_return_sequences`` in the underlying ``model.generate`` call.

        Returns:
            list: One entry per batch, each entry being the list of strings
            returned by ``tokenizer.batch_decode`` for that batch.
        """
        per_batch_outputs = []

        # Inference only: no gradients are tracked.
        with torch.no_grad():
            for query_batch in chunks(queries, self.batch_size):
                encoded = self.tokenizer(
                    query_batch,
                    return_tensors="pt",
                    truncation=True,
                    padding="max_length",
                ).to(self.device)
                # trim_batch comes from utils.py; presumably it strips the
                # padding-only columns added by padding="max_length" -- confirm.
                input_ids, attention_mask = trim_batch(
                    **encoded, pad_token_id=self.tokenizer.pad_token_id
                )

                generated = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_start_token_id=self.decoder_start_token_id,
                    num_beams=num_generate,
                    num_return_sequences=num_generate,
                )

                per_batch_outputs.append(
                    self.tokenizer.batch_decode(
                        generated,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False,
                    )
                )

        return per_batch_outputs


# Relation labels handed to get_inferences() further down; each label is
# inserted into a "<event> <relation> [GEN]" query string.
# NOTE(review): both "HasSubEvent" and "HasSubevent" are listed (they differ
# only in capitalization) -- confirm that both are intended.
all_relations = [
    "AtLocation",
    "CapableOf",
    "Causes",
    "CausesDesire",
    "CreatedBy",
    "DefinedAs",
    "DesireOf",
    "Desires",
    "HasA",
    "HasFirstSubevent",
    "HasLastSubevent",
    "HasPainCharacter",
    "HasPainIntensity",
    "HasPrerequisite",
    "HasProperty",
    "HasSubEvent",
    "HasSubevent",
    "HinderedBy",
    "InheritsFrom",
    "InstanceOf",
    "IsA",
    "LocatedNear",
    "LocationOfAction",
    "MadeOf",
    "MadeUpOf",
    "MotivatedByGoal",
    "NotCapableOf",
    "NotDesires",
    "NotHasA",
    "NotHasProperty",
    "NotIsA",
    "NotMadeOf",
    "ObjectUse",
    "PartOf",
    "ReceivesAction",
    "RelatedTo",
    "SymbolOf",
    "UsedFor",
    "isAfter",
    "isBefore",
    "isFilledBy",
    "oEffect",
    "oReact",
    "oWant",
    "xAttr",
    "xEffect",
    "xIntent",
    "xNeed",
    "xReact",
    "xReason",
    "xWant",
    ]


# Generate Inferences

In [None]:
import random
import pandas as pd
import torch
import os
import numpy as np
import datetime
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; the input, model and output paths
# configured near the top of the notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Control the randomness

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number source this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-benchmarking mode.

    NOTE: ``PYTHONHASHSEED`` is read at interpreter start-up, so exporting it
    here only matters for Python processes launched afterwards.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The remaining seeders all take the seed as their only argument.
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Favor reproducibility over cuDNN auto-tuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

Load the COMET model (this notebook loads the BART checkpoint stored at `model_dir`)

In [None]:
# Build the Comet wrapper from the checkpoint folder configured in model_dir.
print("model loading ...")
comet = Comet(model_dir)
# Clear any stored gradients on the underlying model; the cells shown here
# only run generation, so this is presumably just a precaution.
comet.model.zero_grad()
print("model loaded")

model loading ...
model loaded


Load files

In [None]:
def load_names(file_path):
    """Load names from the first column of a CSV file.

    The file is read without header detection and the first row is then
    discarded, i.e. the first row is treated as a header line.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list: Values of the first column in file order. Missing cells
        (blank or NA) are removed so they cannot later be formatted into
        events as the literal text ``nan``.
    """
    first_column = pd.read_csv(file_path, header=None)[0]
    # Drop the header row first, then any cells pandas parsed as missing.
    return first_column.iloc[1:].dropna().tolist()

def load_events(file_path):
    """Read events from a UTF-8 text file, stripping the leading numbering.

    Each line is cut at its first ``". "``; the text after it (whitespace
    trimmed) is kept as the event. Lines without ``". "`` are skipped.
    """
    parsed_events = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            _numbering, separator, remainder = raw_line.partition(". ")
            if separator:
                parsed_events.append(remainder.strip())
    return parsed_events

def assign_names_to_events(names, events):
  """Prefix every event with a name drawn at random from ``names``.

  One ``random.choice(names)`` draw is made per event, in event order, and
  the result for each event is ``"<name> <event>"``.
  """
  return [f"{random.choice(names)} {event}" for event in events]



In [None]:
# Read the three name pools and the event list from the configured paths.
male_names = load_names(male_names_dir)
female_names = load_names(female_names_dir)
unisex_names = load_names(unisex_names_dir)
events = load_events(events_dir)

# Fail fast: an empty input would otherwise only surface during or after the
# long generation run (random.choice on an empty pool, or an empty output CSV).
assert male_names, f"no names loaded from {male_names_dir}"
assert female_names, f"no names loaded from {female_names_dir}"
assert unisex_names, f"no names loaded from {unisex_names_dir}"
assert events, f"no events loaded from {events_dir}"


Get and store the inferences

In [None]:
def get_inferences(events, relations, model):
  """Query ``model`` once for every (event, relation) pair.

  Args:
    events: Iterable of event strings; progress is reported per event.
    relations: Relation labels combined with each event.
    model: Object exposing ``generate(queries, decode_method=..., num_generate=...)``.

  Returns:
    list: One dict per pair, in event-major order, with the keys
    ``"Event"``, ``"Relation"``, ``"Query"`` and ``"Inference"`` (the value
    returned by ``model.generate`` for that single query).
  """
  def build_record(head, rel):
    # The query format is "<event> <relation> [GEN]".
    query = "{} {} [GEN]".format(head, rel)
    return {
        "Event": head,
        "Relation": rel,
        "Query": query,
        "Inference": model.generate([query], decode_method="beam", num_generate=5),
    }

  records = []
  for head in tqdm(events, desc = "Events"):
    records.extend(build_record(head, rel) for rel in relations)
  return records

In [None]:
# Baseline run: every event is prefixed with the placeholder name "PersonX".
names = ["PersonX"]

# Create the output folder up front so the long generation run below cannot
# be lost to a missing directory when the CSV is written.
os.makedirs(output_dir, exist_ok=True)

assigned_events = assign_names_to_events(names, events)
results = get_inferences(assigned_events, all_relations, comet)

# Timestamp the file name so repeated runs do not overwrite each other.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
df = pd.DataFrame(results)
output_file = f"{output_dir}/comet_inferences_PersonX_{timestamp}.csv"
df.to_csv(output_file, index=False)
print(f"Batch inferences saved to '{output_file}'")


Events: 100%|██████████| 400/400 [35:46<00:00,  5.37s/it]


Batch inferences saved to '/content/drive/MyDrive/cpsc532/output/comet_inferences_PersonX_2025-03-28_18-59-02.csv'


In [None]:
genders = ["female", "male", "unisex"]

# Name pool used for each gender label.
names_by_gender = {
    "female": female_names,
    "male": male_names,
    "unisex": unisex_names,
}

# Create the output folder up front so a long generation run cannot be lost
# to a missing directory when its CSV is written.
os.makedirs(output_dir, exist_ok=True)

for gender in genders:
  names = names_by_gender[gender]
  assigned_events = assign_names_to_events(names, events)
  results = get_inferences(assigned_events, all_relations, comet)
  # Timestamp the file name so repeated runs do not overwrite each other.
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  df = pd.DataFrame(results)
  output_file = f"{output_dir}/comet_inferences_{gender}_{timestamp}.csv"
  df.to_csv(output_file, index=False)
  print(f"Batch inferences saved to '{output_file}'")


Events: 100%|██████████| 400/400 [33:42<00:00,  5.06s/it]


Batch inferences saved to '/content/drive/MyDrive/cpsc532/output/comet_inferences_female_2025-03-25_00-06-56.csv'


Events: 100%|██████████| 400/400 [33:33<00:00,  5.03s/it]


Batch inferences saved to '/content/drive/MyDrive/cpsc532/output/comet_inferences_male_2025-03-25_00-40-30.csv'


Events: 100%|██████████| 400/400 [33:07<00:00,  4.97s/it]


Batch inferences saved to '/content/drive/MyDrive/cpsc532/output/comet_inferences_unisex_2025-03-25_01-13-38.csv'
