In [None]:
!pip install transformers



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


In [None]:
!rm -rf predictions.txt
!rm -rf raw_predictions.tsv

In [None]:
from transformers import AutoTokenizer, RobertaForMaskedLM
import torch
import json

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base").cuda()

# Load dataset
dataset = None
with open("dev.json", 'r') as file:
    dataset = json.load(file)['data']

file_data = []

count = 0
for sample in dataset:
    if not sample['qas'][0]["answers"] or len(sample['qas'][0]["answers"][0]) == 0:
        print("Skipping sample with no answers")
        continue

    # Ensure entities are considered for predictions
    entities_indexes = sample['passage']['entities']
    entities = []
    for index in entities_indexes:
        entities.append(sample['passage']['text'][index['start']:(index['end']+1)])

    # Mask the query
    masked_query = sample['qas'][0]['query'].replace("@placeholder", "<mask>")
    # Combine the passage and query
    query = sample['passage']['text'] + "\n\n" + masked_query

    with torch.no_grad():
        inputs = tokenizer(query, return_tensors="pt").to('cuda')
        # Should be only one sample in the dev.json file with more than 512 tokens
        if (len(inputs['input_ids'][0]) > 512):
            print("Skipping sample with too long query")
            continue

        # Get the logits from the model of the masked token
        outputs = model(**inputs)
        mask_token_index = (inputs['input_ids'] == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        logits = outputs.logits[0, mask_token_index][0]

        # Get logits for each entity
        entity_ids = [tokenizer.encode(entity, add_special_tokens=False) for entity in entities]

        # Calculate likelihoods for each entity (average of the logits for each token in the entity)
        likelihoods = []
        for entity_id in entity_ids:
            likelihood = 0
            for token in entity_id:
                likelihood += logits[token]
            likelihoods.append(likelihood / len(entity_id))

        # Get the entity with the highest likelihood
        prediction = entities[likelihoods.index(max(likelihoods))]

        file_data.append((query, prediction, sample['qas'][0]["answers"][0]["text"]))

        # Write to file and log progress, every 100 samples
        if count % 100 == 0:
            print(f"Processed {count} samples")
            # Write the results to a file
            with open("predictions.txt", "a") as f:
                for query, prediction, answer in file_data:
                    f.write(f"Query: {query}\n")
                    f.write(f"Prediction: {prediction}\n")
                    f.write(f"Answer: {answer}\n")
                    f.write("\n")

            # Write raw results to a file (for further metric calculation)
            with open("raw_predictions.tsv", "a") as f:
                for _, prediction, answer in file_data:
                    f.write(f"{prediction}\t{answer}\n")

            file_data = []

        count += 1



Processed 0 samples
Processed 100 samples
Processed 200 samples
Processed 300 samples
Processed 400 samples
Processed 500 samples
Processed 600 samples
Processed 700 samples
Processed 800 samples
Processed 900 samples
Processed 1000 samples
Processed 1100 samples
Processed 1200 samples
Processed 1300 samples
Processed 1400 samples
Processed 1500 samples
Processed 1600 samples
Processed 1700 samples
Processed 1800 samples
Processed 1900 samples
Processed 2000 samples
Processed 2100 samples
Processed 2200 samples
Processed 2300 samples
Processed 2400 samples
Processed 2500 samples
Processed 2600 samples
Processed 2700 samples
Processed 2800 samples
Processed 2900 samples
Processed 3000 samples
Processed 3100 samples
Processed 3200 samples
Processed 3300 samples
Processed 3400 samples
Processed 3500 samples
Processed 3600 samples
Processed 3700 samples
Processed 3800 samples
Processed 3900 samples
Processed 4000 samples
Processed 4100 samples
Processed 4200 samples
Processed 4300 samples


Token indices sequence length is longer than the specified maximum sequence length for this model (1002 > 512). Running this sequence through the model will result in indexing errors


Skipping sample with too long query
Processed 4700 samples
Processed 4800 samples
Processed 4900 samples
Processed 5000 samples
Processed 5100 samples
Processed 5200 samples
Processed 5300 samples
Processed 5400 samples
Processed 5500 samples
Processed 5600 samples
Processed 5700 samples
Processed 5800 samples
Processed 5900 samples
Processed 6000 samples
Processed 6100 samples
Processed 6200 samples
Processed 6300 samples
Processed 6400 samples
Processed 6500 samples
Processed 6600 samples
Processed 6700 samples
Processed 6800 samples
Processed 6900 samples
Processed 7000 samples
Processed 7100 samples
Processed 7200 samples
Processed 7300 samples
Processed 7400 samples


In [3]:
!pip install scikit-learn



In [8]:
import csv
from sklearn.metrics import f1_score

def load_data(filename):
    predictions, actuals = [], []
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if len(row) == 2:  # Ensure there are exactly two columns
                predictions.append(row[0])
                actuals.append(row[1])
            else:
                raise ValueError("Each row must contain exactly two columns.")

    return predictions, actuals

def calculate_metrics(predictions, actuals):
    f1 = f1_score(actuals, predictions, average='weighted')

    # Exact match calculation
    exact_matches = sum(1 for i in range(len(predictions)) if predictions[i] == actuals[i])
    exact_match_score = exact_matches / len(predictions)

    return f1, exact_match_score

def main():
    predictions, actuals = load_data('raw_predictions.tsv')
    f1, exact_match_score = calculate_metrics(predictions, actuals)

    print("F1 Score:", f1)
    print("Exact Match Score:", exact_match_score)

if __name__ == "__main__":
    main()


F1 Score: 0.406759466760207
Exact Match Score: 0.4403458992028104
