In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

import re
import json
import gensim.downloader as api

!pip install langid
import langid

!pip install gensim
from gensim.models import Word2Vec

!pip install -U spacy
!python -m spacy download it_core_news_sm
import spacy

!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
!tar -xzvf s2v_reddit_2015_md.tar.gz
!pip install sense2vec
from sense2vec import Sense2Vec

import random

Mounted at /content/gdrive
Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941172 sha256=e94db685cd64cba0fca4582f1e05c0f09af0390d543fab559cbdc1c330074ee5
  Stored in directory: /root/.cache/pip/wheels/23/c8/c6/eed80894918490a175677414d40bd7c851413bbe03d4856c3c
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6
Collecting it-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.7.0/it_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m95.

# 1. Set up

In [2]:
def load_data(data_path, gold_path):
  hypernyms_dict = {}
  with open(data_path, "r", encoding = 'utf-8') as data_file, open(gold_path, "r", encoding = 'utf-8') as gold_file:
    for data_line, gold_line in zip(data_file, gold_file):
      term_list = [term for term in data_line.split()[:-1]]
      term = " ".join(term_list)
      hypernyms = [hypernym.replace("\n", "") for hypernym in gold_line.split("\t")]
      hypernyms_dict[term] = hypernyms
  return hypernyms_dict


def load_vocabulary(vocabulary_path):
  with open(vocabulary_path, "r", encoding = 'utf-8') as vocabulary_file:
    vocabulary = vocabulary_file.read().splitlines()
  return vocabulary


def clean_vocabulary(vocabulary):
  new_vocabulary = []
  for word in vocabulary:
    if word.isalpha(): # filter out numeric terms
      language, confidence = langid.classify(word)
      if language == 'it': # filter out non-italian words
        new_vocabulary.append(word)
  return new_vocabulary

In [3]:
data_path = "/content/gdrive/MyDrive/Colab Notebooks/MNLP/Homework/{split}/data/1B.italian.{split}.data.txt"
gold_path = "/content/gdrive/MyDrive/Colab Notebooks/MNLP/Homework/{split}/gold/1B.italian.{split}.gold.txt"
vocabulary_path = "/content/gdrive/MyDrive/Colab Notebooks/MNLP/Homework/vocabulary/1B.italian.vocabulary.txt"

# italian training data
train_data_path = data_path.format(split = 'training')
train_gold_path = gold_path.format(split = 'training')
train_hypernyms = load_data(train_data_path, train_gold_path)

# italian test data
test_data_path = data_path.format(split = 'test')
test_gold_path = gold_path.format(split = 'test')
test_hypernyms = load_data(test_data_path, test_gold_path)

# italian trial data ... maybe not useful
trial_data_path = data_path.format(split = 'trial')
trial_gold_path = gold_path.format(split = 'trial')
trial_hypernyms = load_data(trial_data_path, trial_gold_path)

In [4]:
# italian vocabulary
vocabulary = load_vocabulary(vocabulary_path)
print("Number of terms in the vocabulary: ", len(vocabulary))

vocabulary = clean_vocabulary(vocabulary)
for sublist_train, sublist_test in zip(train_hypernyms.values(), test_hypernyms.values()):
  vocabulary.extend(sublist_train)
  vocabulary.extend(sublist_test)

vocabulary = list(set(vocabulary)) # to avoid duplicates

Number of terms in the vocabulary:  79715


In [5]:
print("Number of terms in the vocabulary: ", len(vocabulary))
vocabulary[:20]

Number of terms in the vocabulary:  15786


['percolare',
 'levitazione',
 'seccatore',
 'indipendenza',
 'distensione',
 'autorialità',
 'permutazione',
 'gioiella',
 'artistico',
 'amministrazione',
 'complicazione',
 'cristallizzare',
 'ristabilimento',
 'sveltezza',
 'intossicazione',
 'tentennamento',
 'impressionabilità',
 'recrudescenza',
 'superfamiglia',
 'cercis']

# 2. Create entries

In [6]:
# word2vec model
sentences = [vocabulary]
w2v_model = Word2Vec(sentences, min_count=1)

In [7]:
# sense2vec model
s2v_model = Sense2Vec().from_disk("./s2v_old")

In [8]:
def find_distractors(hypernym, data, term, topn):
  # sense = s2v_model.get_best_sense(hypernym)
  # if sense:
  #   distractors = s2v_model.most_similar(sense, n=5)
  #   filtered_distractors = [distractor[0].split("|")[0].replace("_", " ") for distractor in distractors if distractor[0].split("|")[0].replace("_", " ") not in data[term]]
  #   return filtered_distractors[:topn]
  # else:
  distractors = w2v_model.wv.most_similar(hypernym, topn=5)
  filtered_distractors = [distractor[0] for distractor in distractors if distractor[0] not in data[term]]
  return filtered_distractors[:topn]

def create_entries(data, vocabulary):
  entries = []
  for id,term in enumerate(data):
    for hypernym in data[term]:
      distractors = find_distractors(hypernym, data, term, 3)
      choices = [hypernym, *distractors]
      random.shuffle(choices)
      entry = {
            'id' : id,
            'text': term,
            'choices': choices,
            'label' : choices.index(hypernym)
       }
      entries.append(entry)
  return entries


def save_entries_json(file_path, entries):
  with open(file_path, "w") as output_file:
      json.dump(entries, output_file, indent = 2)


In [9]:
train_entries = create_entries(train_hypernyms, vocabulary)
save_entries_json("train_dataset.json", train_entries)

In [10]:
train_entries[:10]

[{'id': 0,
  'text': 'sesto',
  'choices': ['grado', 'casa reale', 'confetto', 'fulminazione'],
  'label': 0},
 {'id': 0,
  'text': 'sesto',
  'choices': ['innovazione',
   'casalmaggiore',
   'specialmente',
   'numero ordinale'],
  'label': 3},
 {'id': 0,
  'text': 'sesto',
  'choices': ['frazione', 'avvinghiare', 'frascheggiare', 'particella'],
  'label': 0},
 {'id': 0,
  'text': 'sesto',
  'choices': ['percorribile', 'settantesimo', 'carica', 'alpeggio'],
  'label': 2},
 {'id': 1,
  'text': 'Sigillo',
  'choices': ['oppugnazione', 'laminato', 'comune', 'giocolare'],
  'label': 2},
 {'id': 1,
  'text': 'Sigillo',
  'choices': ['precauzionale', 'supremazia', 'municipalità', 'inoppugnabile'],
  'label': 2},
 {'id': 1,
  'text': 'Sigillo',
  'choices': ['comune italiano', 'osservatorio', 'lacuna', 'attuabilità'],
  'label': 0},
 {'id': 1,
  'text': 'Sigillo',
  'choices': ['frascheggiare', 'avvinghiare', 'frazione', 'particella'],
  'label': 2},
 {'id': 1,
  'text': 'Sigillo',
  'choic

In [11]:
test_entries = create_entries(test_hypernyms, vocabulary)
save_entries_json("test_dataset.json", test_entries)

In [12]:
test_entries[:10]

[{'id': 0,
  'text': 'tecnologia',
  'choices': ['installare', 'disciplina', 'impiegatizio', 'scarto'],
  'label': 1},
 {'id': 0,
  'text': 'tecnologia',
  'choices': ['pusillanimità', 'produzione', 'acculturazione', 'spogliare'],
  'label': 1},
 {'id': 0,
  'text': 'tecnologia',
  'choices': ['muggine', 'metallaro', 'atomicità', 'scienza'],
  'label': 3},
 {'id': 1,
  'text': 'schermata',
  'choices': ['politica estera', 'battipanni', 'schematizzare', 'immagine'],
  'label': 3},
 {'id': 1,
  'text': 'schermata',
  'choices': ['zibetto', 'illustrazione', 'sermenza', 'piroetta'],
  'label': 1},
 {'id': 1,
  'text': 'schermata',
  'choices': ['poscia', 'riproduzione', 'disconoscere', 'infetto'],
  'label': 1},
 {'id': 1,
  'text': 'schermata',
  'choices': ['ampolla', 'metilazione', 'malacreanza', 'rappresentazione'],
  'label': 3},
 {'id': 1,
  'text': 'schermata',
  'choices': ['inventariazione',
   'capillarità',
   'obbligazionista',
   'raffigurazione'],
  'label': 3},
 {'id': 1,
  

# 3. Prompt formulation

In [13]:
prompts = [
    "Il termine '{{text}}' può essere iperonimo di: {{choices}}",
    "Dato il termine '{{text}}', quale tra le seguenti parole è un suo iperonimo? {{choices}}",
    "Scegli l'iperonimo del termine ''{{text}}: {{choices}}"
]

In [14]:
def save_prompts_json(prompts, file_path):
  json_prompts = []
  for prompt in prompts:
    json_prompts.append({"prompt": prompt})

  with open(file_path, "w") as output_file:
    json.dump(json_prompts, output_file, indent=2)

In [15]:
save_prompts_json(prompts, "prompts.json")

In [16]:
with open( "prompts.json", "r") as output_file:
  print(output_file.read())

[
  {
    "prompt": "Il termine '{{text}}' pu\u00f2 essere iperonimo di: {{choices}}"
  },
  {
    "prompt": "Dato il termine '{{text}}', quale tra le seguenti parole \u00e8 un suo iperonimo? {{choices}}"
  },
  {
    "prompt": "Scegli l'iperonimo del termine ''{{text}}: {{choices}}"
  }
]


# 4. Llama 2

In [17]:
# Device agnostic code
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [18]:
# Configuration
runtimeFlag = device
cache_dir = None
scaling_factor = 1.0

In [19]:
!pip install -q -U transformers peft accelerate optimum

!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.9/409.9 kB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [20]:
!pip install -q -U pdfminer.six # could maybe add pre-built wheels to speed this up.

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/5.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m5.4/5.6 MB[0m [31m78.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [22]:
# Load Model
import transformers
import torch
import json
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer_llama = AutoTokenizer.from_pretrained(model_id)
model_llama = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

## Perfomance Evaluation with Llama 2

In [23]:
import json
import torch
import torch.nn.functional as F

with open('train_dataset.json', 'r') as file:
    entities = json.load(file)[:6]
    for entity in entities:
      print(entity)
      id = entity['id']
      text = entity['text']
      choices = entity['choices']
      label = entity['label']

      prompt_formatted = f"Il termine '{text}' può essere iperonimo di: {choices}"
      input = tokenizer_llama(prompt_formatted, return_tensors="pt").to(device)
      output = model_llama(**input)

      probabilities = F.softmax(output.logits, dim=-1)
      probabilities = probabilities.squeeze().tolist()

      # Find the index of the choice with the highest probability
      for choice, prob in zip(choices, probabilities):
            print(f"Choice: {choice}, Probability: {prob}\n")

      # logits = output.logits[0].detach().cpu().numpy()

      # probabilities = torch.softmax(torch.tensor(logits), -1).detach().cpu().numpy()
      # print(f"Probabilities: {probabilities}")
      # prediction_index = probabilities.argmax()
      # prediction_label = choices[prediction_index]
      # prediction_confidence = probabilities[prediction_index] * 100
      # prediction = {"label": prediction_index, "confidence": round(prediction_confidence, 1)}

      # is_correct = prediction_index == label

      # print("Prompt:", prompt_formatted)
      # print("Prediction:", prediction)
      # print("Actual Label:", label)
      # print("Correct Prediction:", is_correct)
      # print()

{'id': 0, 'text': 'sesto', 'choices': ['grado', 'casa reale', 'confetto', 'fulminazione'], 'label': 0}
Choice: grado, Probability: 0.0006242584204301238

Choice: casa reale, Probability: 0.9990449547767639

Choice: confetto, Probability: 0.00033071625512093306

{'id': 0, 'text': 'sesto', 'choices': ['innovazione', 'casalmaggiore', 'specialmente', 'numero ordinale'], 'label': 3}
Choice: innovazione, Probability: 0.0004342744068708271

Choice: casalmaggiore, Probability: 0.9993900060653687

Choice: specialmente, Probability: 0.00017576207756064832

{'id': 0, 'text': 'sesto', 'choices': ['frazione', 'avvinghiare', 'frascheggiare', 'particella'], 'label': 0}
Choice: frazione, Probability: 0.0008978174300864339

Choice: avvinghiare, Probability: 0.9987058639526367

Choice: frascheggiare, Probability: 0.00039627825026400387

{'id': 0, 'text': 'sesto', 'choices': ['percorribile', 'settantesimo', 'carica', 'alpeggio'], 'label': 2}
Choice: percorribile, Probability: 0.0007527181878685951

Choic

# 5. Roberta

In [24]:
import json
import torch
from torch.distributions import Categorical
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("LIAMF-USP/roberta-large-finetuned-race")
model = AutoModelForSequenceClassification.from_pretrained("LIAMF-USP/roberta-large-finetuned-race")

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at LIAMF-USP/roberta-large-finetuned-race and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
