In [1]:
import pandas as pd
import ast
import re
import matplotlib.pyplot as plt
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phimini = pd.read_csv('/home/saracandussio/tesi-risultati/results/baseline-phimini-hotpotqa.csv')
phimedium = pd.read_csv('/home/saracandussio/tesi-risultati/results/baseline-phimedium-hotpotqa.csv')
llama3 = pd.read_csv('/home/saracandussio/tesi-risultati/results/baseline-llama-3.1-8b-it-hotpotqa.csv')
gemma2b = pd.read_csv('/home/saracandussio/tesi-risultati/results/baseline-gemma-2-2b-it-hotpotqa.csv')
gemma9b = pd.read_csv('/home/saracandussio/tesi-risultati/results/baseline-gemma-2-9b-it-hotpotqa.csv')

In [3]:
def clean_text_final(text):
    text = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)  # Rimuove i caratteri speciali
    text = re.sub(r"['\"-]", '', text)  # Rimuove apostrofi, virgolette e trattini
    text = text.lower()  # Converte in minuscolo
    return text

In [4]:
# phi-mini
phimini['correct'] = phimini['correct'].apply(clean_text_final)
phimini['thesis'] = phimini['thesis'].apply(clean_text_final)
phimini['synthesis'] = phimini['synthesis'].apply(clean_text_final)
phimini['goat'] = phimini['goat'].apply(clean_text_final)

# phi-medium
phimedium['correct'] = phimedium['correct'].apply(clean_text_final)
phimedium['thesis'] = phimedium['thesis'].apply(clean_text_final)
phimedium['synthesis'] = phimedium['synthesis'].apply(clean_text_final)
phimedium['goat'] = phimedium['goat'].apply(clean_text_final)

# llama3
llama3['correct'] = llama3['correct'].apply(clean_text_final)
llama3['thesis'] = llama3['thesis'].apply(clean_text_final)
llama3['synthesis'] = llama3['synthesis'].apply(clean_text_final)

# gemma-2b
gemma2b['correct'] = gemma2b['correct'].apply(clean_text_final)
gemma2b['thesis'] = gemma2b['thesis'].apply(clean_text_final)
gemma2b['synthesis'] = gemma2b['synthesis'].apply(clean_text_final)

# gemma-9b
gemma9b['correct'] = gemma9b['correct'].apply(clean_text_final)
gemma9b['thesis'] = gemma9b['thesis'].apply(clean_text_final)
gemma9b['synthesis'] = gemma9b['synthesis'].apply(clean_text_final)

# Impact of the best suggestion

# TL;DR

|which model|baseline - thesis|baseline - synthesis|bart - synthesis|roberta - synthesis| oracle - synthesis| nocontext - thesis | nocontext - synthesis | CoT as antithesis |
|:---:|:---:|:---:|:---:|:---:|:---:| :---:|:---:| :---:| 
| phi-mini | 53.41% | 80.68% | 77.27% | 76.70% | 89.20% | 51.70% | 74.72% | 87.22% |
| phi-medium | 50.00% | 75.28% | 79.26% | 78.69% | 83.24% | 48.01% | 80.97% | 78.69% |
| llama-3.1-it-8b | 48.30% | 77.84% | / | / | 85.23% | / | / | / |
| gemma-2-2b-it | 52.84% | 57.67% | / | / | 55.11% | / | / | / |
| gemma-2-9b-it | 59.66% | 88.35% | / | / | 94.89% | / | / | / |

|to-do models| status |
|:---:|:---:|
| gemma-2-27b-it | running |
| phi-small | to be considered? |

# Comparison of success rates

In [8]:
def evaluation(correct_answer, candidate):
    matches = 0
    total = len(correct_answer)

    for i in range(total):
        
        if correct_answer[i] == candidate[i]:
            matches += 1

    return round(matches/total*100, 2)

In [9]:
print("Portion of correct: ")
print("- thesis answers: {}%".format(evaluation(phimini['correct'], phimini['thesis'])))
print("- baseline synthesis answers: {}%".format(evaluation(phimini['correct'], phimini['synthesis'])))
print("- oracle synthesis answers: {}%".format(evaluation(phimini['correct'], phimini['goat'])))
print("- BART synthesis answers: {}%".format(evaluation(phimini['correct'], phimini['synthesis'])))
print("- RoBERTa synthesis answers: {}%".format(evaluation(phimini['correct'], phimini['synthesis'])))

Portion of correct: 
- thesis answers: 53.41%
- baseline synthesis answers: 80.68%
- oracle synthesis answers: 89.2%
- BART synthesis answers: 80.68%
- RoBERTa synthesis answers: 80.68%


In [10]:
print("Portion of correct: ")
print("- thesis answers: {}%".format(evaluation(phimedium['correct'], phimedium['thesis'])))
print("- baseline synthesis answers: {}%".format(evaluation(phimedium['correct'], phimedium['synthesis'])))
print("- oracle synthesis answers: {}%".format(evaluation(phimedium['correct'], phimedium['goat'])))
print("- BART synthesis answers: {}%".format(evaluation(phimedium['correct'], phimedium['synthesis'])))
print("- RoBERTa synthesis answers: {}%".format(evaluation(phimedium['correct'], phimedium['synthesis'])))

Portion of correct: 
- thesis answers: 50.0%
- baseline synthesis answers: 75.28%
- oracle synthesis answers: 83.24%
- BART synthesis answers: 75.28%
- RoBERTa synthesis answers: 75.28%


In [12]:
print("Portion of correct: ")
print("- thesis answers: {}%".format(evaluation(llama3['correct'], llama3['thesis'])))
print("- baseline synthesis answers: {}%".format(evaluation(llama3['correct'], llama3['synthesis'])))
print("- oracle synthesis answers: {}%".format(evaluation(llama3['correct'], llama3['oracle'])))
# print("- BART synthesis answers: {}%".format(evaluation(llama3['correct'], llama3['synthesis'])))
# print("- RoBERTa synthesis answers: {}%".format(evaluation(llama3['correct'], llama3['synthesis'])))

Portion of correct: 
- thesis answers: 48.3%
- baseline synthesis answers: 77.84%
- oracle synthesis answers: 85.23%


In [13]:
# dati strani. devo rifare la run...

print("Portion of correct: ")
print("- thesis answers: {}%".format(evaluation(gemma2b['correct'], gemma2b['thesis'])))
print("- baseline synthesis answers: {}%".format(evaluation(gemma2b['correct'], gemma2b['synthesis'])))
print("- oracle synthesis answers: {}%".format(evaluation(gemma2b['correct'], gemma2b['oracle'])))
# print("- BART synthesis answers: {}%".format(evaluation(gemma2b['correct'], gemma2b['synthesis'])))
# print("- RoBERTa synthesis answers: {}%".format(evaluation(gemma2b['correct'], gemma2b['synthesis'])))

Portion of correct: 
- thesis answers: 52.84%
- baseline synthesis answers: 57.67%
- oracle synthesis answers: 55.11%


In [14]:
print("Portion of correct: ")
print("- thesis answers: {}%".format(evaluation(gemma9b['correct'], gemma9b['thesis'])))
print("- baseline synthesis answers: {}%".format(evaluation(gemma9b['correct'], gemma9b['synthesis'])))
print("- oracle synthesis answers: {}%".format(evaluation(gemma9b['correct'], gemma9b['oracle'])))
# print("- BART synthesis answers: {}%".format(evaluation(gemma9b['correct'], gemma9b['synthesis'])))
# print("- RoBERTa synthesis answers: {}%".format(evaluation(gemma9b['correct'], gemma9b['synthesis'])))

Portion of correct: 
- thesis answers: 59.66%
- baseline synthesis answers: 88.35%
- oracle synthesis answers: 94.89%
