# Punto 4 - Italiano
Context Q&A system about COVID in a general setting. Type: Transformer.

In [23]:
from transformers import BertForQuestionAnswering, AutoTokenizer
from transformers import pipeline
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle as pk
import string, re
import torch
import json

In [24]:
## Configuraciones generales

In [25]:
do_fine_tuning_locally = False

'''
Se deben crear los directorios en caso de no existir
'''

lang = "italian"

#HPC calculated files
input_encodings_path = f"data_hpc/{lang}/train_encoding_{lang}.pkl"
input_model_path = f"data_hpc/{lang}/input_model_{lang}.pkl"
output_model_path = f"data_hpc/{lang}/model_post_tunning_{lang}.pkl"

#Models path
model_folder_path = f'models/{lang}/qna_english_custom'

#Dataset
dataset_path = f"qna_database/{lang}/{lang}.json"

#BERT model name
modelname = 'mrm8488/bert-italian-finedtuned-squadv1-it-alfa'

## Proceso de carga del Dataset traducido

In [26]:
contexts = []
questions = []
answers = []

with open(dataset_path, "r") as file:
    for line in file:
        doc = json.loads(line)
        #print(doc)
        #print(doc["context"])
        contexts.append(doc["context"])
        answers.append(doc["answer"])
        questions.append(doc["question"])

print(f"Contextos {str(len(contexts))}, Preguntas {str(len(questions))}, Respuestas {str(len(answers))}")


Contextos 940, Preguntas 940, Respuestas 940


Separación en training y test

In [27]:
#Versión completa
# num_docs = len(contexts)
# ochenta = int(num_docs*0.8)
#
# train_contexts = contexts[0:ochenta]
# train_questions = questions[0:ochenta]
# train_answers = answers[0:ochenta]
# test_contexts = contexts[ochenta+1:num_docs]
# test_questions = questions[ochenta+1:num_docs]
# test_answers = answers[ochenta+1:num_docs]

#Verisón corta
num_docs = 100
ochenta = int(num_docs*0.8)

train_contexts = contexts[0:ochenta]
train_questions = questions[0:ochenta]
train_answers = answers[0:ochenta]
test_contexts = contexts[ochenta+1:num_docs]
test_questions = questions[ochenta+1:num_docs]
test_answers = answers[ochenta+1:num_docs]

In [28]:
print(len(train_contexts))
print(len(test_contexts))

80
19


## Aproximación 1 - Already Fine-Tuned BERT transformer

In [29]:
model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)


### Creación del pipeline

In [30]:
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

### Pruebas Genéricas

In [31]:
context = "L’incidenza sull’intero territorio nazionale continua a diminuire e ha raggiunto valori che, attraverso l’attivazione di intense attività di tracciamento sistematico, possono consentire una gestione basata sul contenimento ovvero sull’identificazione dei casi e sul tracciamento dei loro contatti. La pressione sui servizi ospedalieri si conferma al di sotto della soglia critica in tutte le Regioni/PA e la stima dell’indice di trasmissibilità Rt medio calcolato sui casi sintomatici è stabilmente al di sotto della soglia epidemica.La prevalente circolazione in Italia della variante B.1.1.7 (nota come variante inglese) e la presenza di altre varianti che possono eludere parzialmente la risposta immunitaria, richiede tuttavia di continuare a monitorare con attenzione la situazione e mantenere cautela e gradualità nella gestione dell’epidemia."

nlp({
    'question': "¿Cual'è la variante prevalente?",
    'context': context
})

{'score': 0.4643324315547943,
 'start': 605,
 'end': 621,
 'answer': 'variante inglese'}

In [32]:
context = "Il Piano, elaborato da Ministero della Salute, Commissario Straordinario per l’Emergenza, Istituto Superiore di Sanità, Agenas e Aifa, è stato adottato con Decreto del 12 marzo 2021. Il 13 marzo 2021 è stato diffuso il Piano vaccinale del Commissario straordinario per l’esecuzione della campagna vaccinale nazionale. Elaborato in armonia con il Piano strategico nazionale del Ministero della Salute, fissa le linee operative per completare al più presto la campagna vaccinale."

nlp({
    'question': "¿Quando si è adottato il Piano?",
    'context': context
})

{'score': 0.9873538613319397,
 'start': 168,
 'end': 181,
 'answer': '12 marzo 2021'}

In [33]:
def get_prediction(qid, questions, contexts):
    specific_question = questions[qid]
    specific_context = contexts[qid]

    answer = nlp({
    'question': specific_question,
    'context': specific_context
    })
    #print(answer)
    return answer["answer"]

### Métricas para Q&A
#### F1 Score y Exact Match

In [34]:
#Adaptado de: Evaluating QA: Metrics, Predictions, and the Null Response
#https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""


    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


## Dataset
### Dataset COVID-QA
COVID-QA es un conjunto de datos de respuestas a preguntas que consta de 2,019 pares de preguntas / respuestas anotados por expertos biomédicos voluntarios en artículos científicos relacionados con COVID-19. Un total de 147 artículos científicos del conjunto de datos CORD-19 fueron anotados por 15 expertos.

Tomado de: https://huggingface.co/datasets/covid_qa_deepset
Traducido automaticamente al español

In [35]:
# El formato del dataset tenía listas en vez de string e ints

def ajustar_answers(answers):
    for answer in answers:
         answer['text'] = answer['text'][0]
         answer['answer_start'] = int(answer['answer_start'][0])
    return answers

train_answers = ajustar_answers(train_answers)
test_answers = ajustar_answers(test_answers)

## Evaluación Aproximación 1
Modelo ya Fine-Tuned para Q&A

- Prueba Unitaria:

In [36]:
n = 0
prediction = get_prediction(n, questions=test_questions, contexts=test_contexts)
#print(prediction)
answer = test_answers[n]["text"]
#print(answer)

em_score = compute_exact_match(prediction, answer)
f1_score = compute_f1(prediction, answer)

print(f"Question: {test_questions[n]}")
print(f"Prediction: {prediction}")
print(f"Answer: {answer}")
print(f"EM: {em_score} \t F1: {f1_score}")

Question: Quale invenzione tecnologica ha prodotto anticorpi che sono cloni di un'unica cellula madre?
Prediction: ibridomi
Answer: negli anni '70 con lo sviluppo della tecnologia degli ibridomi per produrre anticorpi monoclonali
EM: 0 	 F1: 0.13333333333333333


- Prueba con todos los datos de test:

In [37]:
def average_metrics(contexts, answers, questions):
    if len(contexts)!= len(answers) or len(contexts)!= len(questions):
        print("Hay un error en el tamaño de los arreglos")
        return
    test_size = len(contexts)
    em_average = 0
    f1_average = 0
    for n in range(test_size):
        prediction = get_prediction(n, questions, contexts)
        answer = answers[n]["text"]
        em_score = compute_exact_match(prediction, answer)
        f1_score = 1 if em_score == 1 else compute_f1(prediction, answer)
        em_average += em_score
        f1_average += f1_score
    em_average_tot = em_average/test_size
    f1_average_tot = f1_average/test_size
    return em_average_tot, f1_average_tot

In [38]:
em_average, f1_score = average_metrics(test_contexts, test_answers, test_questions)

In [39]:
print(f"Average F1: {f1_score} \nAverage EM: {em_average}")


Average F1: 0.25972826803851734 
Average EM: 0.10526315789473684


## Aproximación 2 - Fine Tunnind with COVID-19 Q&A Questions

Aproximación inspirada en
https://gist.github.com/jamescalam/55daf50c8da9eb3a7c18de058bc139a3

In [40]:
#Basado en:

def add_end_idx(answers, contexts):
    count = 0
    print(len(answers))
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
            count = count+1
        else:
            for n in range(4):
                problema = True
                if gold_text == context[start_idx+n:end_idx+n]:
                    # this means the answer is off by 'n' tokens
                    answer['answer_start'] = start_idx + n
                    answer['answer_end'] = end_idx + n
                    #print("entra4")
                    count = count+1
                    problema = False
                    break
                if gold_text == context[start_idx-n:end_idx-n]:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
                    count = count+1
                    problema = False
                    break
            if problema:
                print("1",gold_text)
                print("2",context[start_idx+n:end_idx+n])
    print(count)

add_end_idx(train_answers, train_contexts)
add_end_idx(test_answers, test_contexts)

80
80
19
19


In [41]:
# Nuevamente se importa el modelo

model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [42]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift one token forward
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)

In [46]:
class SquadDataset(torch.utils.data.Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)

In [47]:
with open(input_encodings_path, "wb") as file:
    pk.dump(train_dataset, file)

with open(input_model_path, "wb") as file:
    pk.dump(model, file)

In [44]:
'''
Fine tune your model locally or excecute it with the HPC
'''
if do_fine_tuning_locally:

    # setup GPU/CPU
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # move model over to detected device
    model.to(device)
    # activate training mode of model
    model.train()
    # initialize adam optimizer with weight decay (reduces chance of overfitting)
    optim = AdamW(model.parameters(), lr=5e-5)

    # initialize data loader for training data
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    for epoch in range(3):
        # set model to train mode
        model.train()
        # setup loop (we use tqdm for the progress bar)
        loop = tqdm(train_loader, leave=True)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all the tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            # train model on batch and return outputs (incl. loss)
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            # extract loss
            loss = outputs[0]
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
else:
    with open(output_model_path, "rb") as file:
        model = pk.load(file)


model.save_pretrained(model_folder_path)
tokenizer.save_pretrained(model_folder_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data_hpc/italian/model_post_tunning_italian.pkl'

In [None]:
model2 = BertForQuestionAnswering.from_pretrained(model_folder_path)
tokenizer2 = AutoTokenizer.from_pretrained(model_folder_path)

### Creación del pipeline

In [None]:
nlp = pipeline('question-answering', model=model2, tokenizer=tokenizer2)


### Pruebas genéricas

In [None]:
context = "Masks should be used as part of a comprehensive strategy of measures to suppress transmission and save lives; the use of a mask alone is not sufficient to provide an adequate level of protection against COVID-19. If COVID-19 is spreading in your community, stay safe by taking some simple precautions, such as physical distancing, wearing a mask, keeping rooms well ventilated, avoiding crowds, cleaning your hands, and coughing into a bent elbow or tissue. Check local advice where you live and work. Do it all! Make wearing a mask a normal part of being around other people. The appropriate use, storage and cleaning or disposal of masks are essential to make them as effective as possible."

nlp({
    'question': 'What precautions can you take to stay safe?',
    'context': context
})

In [None]:
# Taken from: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/covid-19-vaccines/advice

context = "The world is in the midst of a COVID-19 pandemic. As WHO and partners work together on the response -- tracking the pandemic, advising on critical interventions, distributing vital medical supplies to those in need--- they are racing to develop and deploy safe and effective vaccines. Vaccines save millions of lives each year. Vaccines work by training and preparing the body’s natural defences – the immune system – to recognize and fight off the viruses and bacteria they target. After vaccination, if the body is later exposed to those disease-causing germs, the body is immediately ready to destroy them, preventing illness. There are several safe and effective vaccines that prevent people from getting seriously ill or dying from COVID-19. This is one part of managing COVID-19, in addition to the main preventive measures of staying at least 1 metre away from others, covering a cough or sneeze in your elbow, frequently cleaning your hands, wearing a mask and avoiding poorly ventilated rooms or opening a window."

nlp({
    'question': 'What happens after vaccination?',
    'context': context
})

## Validación Aproximación 2

 - Prueba de ejemplo

In [None]:
n = 3
prediction = get_prediction(n, questions=test_questions, contexts=test_contexts)
#print(prediction)
answer = test_answers[n]["text"]
#print(answer)

em_score = compute_exact_match(prediction, answer)
f1_score = 1 if em_score == 1 else compute_f1(prediction, answer)

print(f"Question: {test_questions[n]}")
print(f"Prediction: {prediction}")
print(f"Answer: {answer}")
print(f"EM: {em_score} \t F1: {f1_score}")

In [None]:
def average_metrics(contexts, answers, questions):
    if len(contexts)!= len(answers) or len(contexts)!= len(questions):
        print("Hay un error en el tamaño de los arreglos")
        return
    test_size = len(contexts)
    em_average = 0
    f1_average = 0
    for n in range(test_size):
        #print(n)
        prediction = get_prediction(n, questions, contexts)
        #print("Prediction: ",prediction)
        answer = answers[n]["text"]
        #print("Answer: ",answer)
        em_score = compute_exact_match(prediction, answer)
        f1_score = 1 if em_score == 1 else compute_f1(prediction, answer)
        #print(f"EM: {em_score} \t F1: {f1_score}")
        em_average += em_score
        f1_average += f1_score
    em_average_tot = em_average/test_size
    f1_average_tot = f1_average/test_size
    return em_average_tot, f1_average_tot

In [None]:
em_average, f1_score = average_metrics( test_contexts, test_answers, test_questions)

In [None]:
print(f"Average F1: {f1_score} \nAverage EM: {em_average}")


