# 1.0 Libraries

In [1]:
!pip install langid
!pip install gensim
!pip install -U spacy
!python -m spacy download it_core_news_sm
!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
!tar -xzvf s2v_reddit_2015_md.tar.gz
!pip install sense2vec
!pip install spacy_fastlang

Collecting it-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.7.0/it_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
--2024-04-04 23:05:15--  https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/gith

In [6]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/gdrive')

import re
import json
import gensim.downloader as api

import langid
from gensim.models import Word2Vec
import spacy
import spacy_fastlang

from sense2vec import Sense2Vec
import random

import torch
from transformers import RobertaTokenizer
from transformers import RobertaForMultipleChoice
from torch.distributions import Categorical
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

import time
import warnings
warnings.filterwarnings('ignore')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2.0 Set up

In [39]:
def load_data(data_path, gold_path):
  count = 0
  hypernyms_dict = {}
  with open(data_path, "r", encoding = 'utf-8') as data_file, open(gold_path, "r", encoding = 'utf-8') as gold_file:
    for data_line, gold_line in zip(data_file, gold_file):
      term_list = [term for term in data_line.split()[:-1]]
      term = " ".join(term_list)
      hypernyms = [hypernym.replace("\n", "") for hypernym in gold_line.split("\t")]
      hypernyms_dict[term] = hypernyms
      count += 1

      if count == 20:
        break

  return hypernyms_dict

In [40]:
#  PARTIAL italian training data
train_hypernyms = load_data("/content/1B.italian.training.data.txt", "/content/1B.italian.training.gold.txt")

# PARTIAL italian test data
test_hypernyms = load_data("/content/1B.italian.test.data.txt", "/content/1B.italian.test.gold.txt")

# 3.0 Find Distractors

## 3.1 Fasttext


In [9]:
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!sudo python setup.py install

fatal: destination path 'fastText' already exists and is not an empty directory.
python3: can't open file '/content/setup.py': [Errno 2] No such file or directory


### Functions

In [10]:
import fasttext.util
# fasttext.util.download_model('it', if_exists='ignore')

In [11]:
model = fasttext.load_model('cc.it.300.bin')



In [52]:
from scipy.spatial.distance import cosine

# Function that calculates cosine similarity
def cosine_similarity(embedding_1, embedding_2):
    return 1 - cosine(embedding_1, embedding_2)


def find_distractors(hypernym, num_distractors):
  distractors = []

  # Get word embedding for golden label
  golden_embedding = model.get_word_vector(hypernym)

  # Generate three distractors
  candidates = model.get_nearest_neighbors(hypernym, k = 10)
  for distractor in candidates:
    distractor = distractor[1]
    distractor_embedding = model.get_word_vector(distractor)
    similarity = cosine_similarity(golden_embedding, distractor_embedding)
    # Set the range threshold for the cosine similarity
    if 0.1 <= similarity <= 0.6 and distractor not in distractors:
      distractors.append(distractor)

  return distractors[:num_distractors]

### Example

In [53]:
model.get_nearest_neighbors('numero ordinale')

[(0.69635009765625, 'dinale'),
 (0.6637539267539978, 'tecnicoeconomica'),
 (0.6575295329093933, 'numeroe'),
 (0.6210983991622925, 'ordinal'),
 (0.617618203163147, 'Ordinale'),
 (0.6075699925422668, 'Numeroordinale'),
 (0.6015861630439758, 'Numerocardinale'),
 (0.5819660425186157, 'estetologo'),
 (0.5808798670768738, 'Sindacatore'),
 (0.5799951553344727, 'anticardinale')]

In [54]:
model.get_analogies("numero ordinale", "carinale", "millesimo")

[(0.4903110861778259, 'decimillesimo'),
 (0.46558308601379395, 'millesimi'),
 (0.430698961019516, 'duecentesimo'),
 (0.4299042224884033, 'ordinalità'),
 (0.4258556365966797, 'centesimo'),
 (0.421135812997818, 'decimo'),
 (0.4166191816329956, 'trecentesimo'),
 (0.40764376521110535, 'decimillesimi'),
 (0.4020084738731384, 'cinquantesimo'),
 (0.40015530586242676, 'milionesimo')]

In [55]:
distractors = find_distractors("numero ordinale", num_distractors = 3)

In [56]:
for i, distractor in enumerate(distractors):
  print(f"Distractor {i + 1}: {distractor}")

  # Get word embedding for distractor
  distractor_embedding = model.get_word_vector(distractor)

  # Calculate cosine similarity between golden label and distractor embeddings
  golden_embedding = model.get_word_vector("numero ordinale")
  similarity = cosine_similarity(golden_embedding, distractor_embedding)
  print('Cosine similarity: {:.2}'.format(similarity))

Distractor 1: estetologo
Cosine similarity: 0.58
Distractor 2: Sindacatore
Cosine similarity: 0.58
Distractor 3: anticardinale
Cosine similarity: 0.58


# 4.0 Create entries

In [57]:
def save_jsonl(file_path, data):
  id_seq = 0
  with open(file_path, "w") as output_file:
    for term, hypernyms in data.items():
      for hypernym in hypernyms:
        distractors = find_distractors(hypernym, num_distractors = 3)
        entries = (hypernym, *distractors)
        choices = list(entries)
        random.shuffle(choices) # to create randomness
        reformatted_json_data = {
              'id' : id_seq,
              'text': term,
              'choices': choices,
              'label' : choices.index(hypernym)
        }
        json.dump(reformatted_json_data, output_file)
        output_file.write("\n")
        id_seq +=1

def read_lines_jsonl(file_path, num_lines):
  with open(file_path, 'r') as f:
    json_list = list(f)
    for line in json_list[:num_lines]:
      data = json.loads(line)
      print(data)

In [59]:
# train jsonl file
save_jsonl("hypernym_discovery-task26-train-data.jsonl", train_hypernyms)
read_lines_jsonl("hypernym_discovery-task26-train-data.jsonl", num_lines = 10) # preview of the first 10 lines

{'id': 0, 'text': 'sesto', 'choices': ['grado', 'grado.', 'ingrado', 'poter'], 'label': 0}
{'id': 1, 'text': 'sesto', 'choices': ['anticardinale', 'numero ordinale', 'estetologo', 'Sindacatore'], 'label': 1}
{'id': 2, 'text': 'sesto', 'choices': ['frazione', 'frazionale', '-frazione', 'ex-frazione'], 'label': 0}
{'id': 3, 'text': 'sesto', 'choices': ['cariche', 'carica.La', 'ricoperta', 'carica'], 'label': 3}
{'id': 4, 'text': 'Sigillo', 'choices': ['denominatore', 'comune', 'comunee', 'comune.Il'], 'label': 1}
{'id': 5, 'text': 'Sigillo', 'choices': ['Principalità', 'città-provincia', 'municipalità', 'Municipalities'], 'label': 2}
{'id': 6, 'text': 'Sigillo', 'choices': ['comunee', 'monale', 'ComuneAlba', 'comune italiano'], 'label': 3}
{'id': 7, 'text': 'Sigillo', 'choices': ['-frazione', 'frazionale', 'ex-frazione', 'frazione'], 'label': 3}
{'id': 8, 'text': 'Sigillo', 'choices': ['paese'], 'label': 0}
{'id': 9, 'text': 'Sigillo', 'choices': ['quartiere'], 'label': 0}


In [60]:
a = model.get_word_vector("grado")
b = model.get_word_vector("grado.")
cosine_similarity(a, b)

0.558272123336792

In [None]:
# test jsonl file
save_jsonl("hypernym_discovery-task26-test-data.jsonl", test_hypernyms)
read_lines_jsonl("hypernym_discovery-task26-test-data.jsonl", num_lines = 10) # preview of the first 10 lines

# 5.0 Prompt formulation

In [None]:
prompts = [
    "Il termine '{text}' può essere iperonimo di: \n a) {option1} \n b) {option2} \n c) {option3} \n d) {option4}",
    "Dato il termine '{text}', quale tra le seguenti parole è un suo iperonimo? \n a) {option1} \n b) {option2} \n c) {option3} \n d) {option4}",
    "Scegli l'iperonimo del termine '{text}': \n a) {option1} \n b) {option2} \n c) {option3} \n d) {option4}"
]

In [None]:
print(' '.join(prompt + '\n\n' for prompt in prompts), end='')

In [None]:
def save_prompts_jsonl(prompts, file_path):
  json_prompts = []
  for prompt in prompts:
    json_prompts.append({"prompt": prompt})

  with open(file_path, "w") as output_file:
    for json_prompt in json_prompts:
      json.dump(json_prompt, output_file)
      output_file.write("\n")

In [None]:
save_prompts_jsonl(prompts, "hypernym_discovery-task26-json.jsonl")
read_lines_jsonl("hypernym_discovery-task26-json.jsonl", num_lines = 3)

# 6.0 Prompts Evaluation

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
def evaluate_prompt(tokenizer, model, prompt, data_file, zero_shot_classification = False):
  formatted_prompts, y_true, y_pred, score = [], [], [], []
  lines = 0

  with open(data_file, "r") as f:
    json_data = list(f)
    for line in json_data:
      pair = json.loads(line)
      id = pair['id']
      text = pair['text']
      choices = pair['choices']
      label = pair['label']

      formatted_prompt = prompt.format(text = text, option1 = choices[0], option2 = choices[1], option3 = choices[2], option4 = choices[3])
      inputs = tokenizer([formatted_prompt] * len(choices), choices,
                         padding = True, return_tensors = "pt").to(device)

      if zero_shot_classification:
        output = model(inputs, candidate_labels=choices, hypothesis_template="Questo esempio è {}.")
        predicted_output = output['labels'][0]
        predicted_label = choices.index(predicted_output)
        prediction_score = output['scores'][0]
        lines += 1

        if lines == 20:
          break

      else:
        labels = torch.tensor(len(choices) - 1).unsqueeze(0)
        with torch.no_grad():
          output = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels = labels)

        probabilities = torch.softmax(output["logits"][0], -1).tolist()
        predicted_label = np.argmax(probabilities)
        prediction_score = probabilities[predicted_label]
        lines += 1

        if lines == 20:
          break

      formatted_prompts.append(formatted_prompt)
      y_true.append(label)
      y_pred.append(predicted_label)
      score.append(prediction_score)

    return formatted_prompts, y_true, y_pred, score


def visualize_results(results, num_results):
  for prompt in range(len(results)):
    for n in range(num_results):
      print("Prompt: ", results[prompt][0][n])
      print("True label: ", results[prompt][1][n])
      print("Predicted label: ", results[prompt][2][n])
      print("Prediction score: ", round(results[prompt][3][n], 3))
      print(" ")

In [None]:
def compute_metrics(y_true, y_pred):
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
  recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
  f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)
  cf_matrix = confusion_matrix(y_true, y_pred)
  return accuracy, precision, recall, f1, cf_matrix


def print_confusion_matrix(metrics, type):
  for n in range(len(metrics)):
    print(f"{type} Confusion Matrix for the Prompt {n}")
    print("Prompt: ", prompts[n])

    # Confusion Matrix Plot
    cf_matrix = metrics[n][4]
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(cf_matrix, annot = True, fmt = '.0f')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    plt.show()

    print("Total Predictions: ", np.sum(cf_matrix))
    print("Correct Predictions: ", np.trace(cf_matrix))
    print("----------------------------------------------------------------")
    print(" ")

def print_overall_statistics(train_metrics, test_metrics, prompts):
  comparison_table = []
  for id, prompt in enumerate(prompts):
    data = {}
    for dtype, metrics in zip(['Train', 'Test'], [train_metrics, test_metrics]):
      accuracy, precision, recall, f1, cf_matrix = metrics[id]
      data[f'{dtype} Accuracy'] = round(accuracy, 3)
      data[f'{dtype} Precision'] = round(precision, 3)
      data[f'{dtype} Recall'] = round(recall, 3)
      data[f'{dtype} F1-score'] = round(f1, 3)
    comparison_table.append(data)
  return pd.DataFrame(comparison_table).transpose()

## 6.1 RoBERTa For Multiple Choice

In [None]:
model_name = "LIAMF-USP/roberta-large-finetuned-race"
roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = RobertaForMultipleChoice.from_pretrained(model_name).to(device)

In [None]:
start_time = time.time()
roberta_train_results, roberta_test_results = [], []

for prompt in prompts:
  formatted_prompt, y_true, y_pred, score = evaluate_prompt(roberta_tokenizer, roberta_model, prompt, "hypernym_discovery-task26-train-data.jsonl")
  roberta_train_results.append([formatted_prompt, y_true, y_pred, score])

  formatted_prompt, y_true, y_pred, score = evaluate_prompt(roberta_tokenizer, roberta_model, prompt, "hypernym_discovery-task26-test-data.jsonl")
  roberta_test_results.append([formatted_prompt, y_true, y_pred, score])

end_time = time.time()
print("Execution Time: ", (end_time - start_time)/60, "minutes")

Execution Time:  1.9569334228833517 minutes


In [None]:
visualize_results(roberta_train_results, num_results = 2) # preview of the first 2 train results for each prompt

In [None]:
roberta_train_metrics, roberta_test_metrics = [], []

for prompt in range(len(prompts)):
  accuracy, precision, recall, f1, cf_matrix = compute_metrics(roberta_train_results[prompt][1], roberta_train_results[prompt][2])
  roberta_train_metrics.append([accuracy, precision, recall, f1, cf_matrix])

  accuracy, precision, recall, f1, cf_matrix = compute_metrics(roberta_test_results[prompt][1], roberta_test_results[prompt][2])
  roberta_test_metrics.append([accuracy, precision, recall, f1, cf_matrix])

### Overall Statistics

In [None]:
# Train Statistics for each prompt
print_confusion_matrix(roberta_train_metrics, "Train")

In [None]:
# Test Statistics for each prompt
print_confusion_matrix(roberta_test_metrics, "Test")

In [None]:
print_overall_statistics(roberta_train_metrics, roberta_test_metrics, prompts)

## 6.2 Zero Shot Text Classification

In [None]:
classifier_names = ["xlm-roberta-large", "facebook/bart-large-mnli"]
#roberta_classifier = pipeline("zero-shot-classification", model=classifier_names[0], batch_size = 8, truncation=True, device = device)

bert_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
bert_classifier = pipeline("zero-shot-classification", model=classifier_names[1], batch_size = 8, truncation=True, device = device)

In [None]:
start_time = time.time()
classifier_train_results, classifier_test_results = [], []

for prompt in prompts:
  formatted_prompt, y_true, y_pred, score = evaluate_prompt(bert_tokenizer, bert_classifier, prompt, "hypernym_discovery-task26-train-data.jsonl", True)
  classifier_train_results.append([formatted_prompt, y_true, y_pred, score])

  formatted_prompt, y_true, y_pred, score = evaluate_prompt(bert_tokenizer, bert_classifier, prompt, "hypernym_discovery-task26-test-data.jsonl", True)
  classifier_test_results.append([formatted_prompt, y_true, y_pred, score])

end_time = time.time()
print("Execution Time: ", (end_time - start_time)/60, "minutes")

Execution Time:  0.9809847831726074 minutes


In [None]:
visualize_results(classifier_train_results, num_results = 2) # preview of the first 2 train results for each prompt

In [None]:
classifier_train_metrics, classifier_test_metrics = [], []

for prompt in range(len(prompts)):
  accuracy, precision, recall, f1, cf_matrix = compute_metrics(classifier_train_results[prompt][1], classifier_train_results[prompt][2])
  classifier_train_metrics.append([accuracy, precision, recall, f1, cf_matrix])

  accuracy, precision, recall, f1, cf_matrix = compute_metrics(classifier_test_results[prompt][1], classifier_test_results[prompt][2])
  classifier_test_metrics.append([accuracy, precision, recall, f1, cf_matrix])

### Overall Statistics

In [None]:
# Train Confusion Matrix for each prompt
print_confusion_matrix(classifier_train_metrics, "Train")

In [None]:
# Test Statistics for each prompt
print_confusion_matrix(classifier_test_metrics, "Test")

In [None]:
print_overall_statistics(classifier_train_metrics, classifier_test_metrics, prompts)