<a href="https://colab.research.google.com/github/e184633/nlu_paper/blob/main/Reproduce_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of a QA System

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb)

To make a statement about the performance of a question-answering system, it is important to evaluate it. Furthermore, evaluation makes it possible to determine which parts of the system can be improved.

In [None]:
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
# grpcio-tools is pinned to 1.34.1 and installed before Haystack.
# NOTE(review): Haystack is installed from the unpinned default branch, so the
# API this notebook sees can change between runs — consider pinning a commit.
!pip install grpcio-tools==1.34.1
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install datasets
# If you run this notebook on Google Colab, you might need to
# restart the runtime after installing haystack.
!pip install "ray[default]"

In [None]:
from haystack.modeling.utils import initialize_device_settings

# Ask Haystack for the compute device, requesting CUDA.
# NOTE(review): `device` and `n_gpu` are not referenced again in the visible cells.
device, n_gpu = initialize_device_settings(use_cuda=True)

In [None]:

# Print the GPU status of the current runtime.
!nvidia-smi

In [None]:
# Restrict transformers' log output to errors.
# NOTE: this binds the name `logging` to transformers' logging utilities,
# not to the standard-library `logging` module.
from transformers import logging
logging.set_verbosity_error()

In [None]:

from google.colab import files

# sas_squad.sort_values('sas').to_csv('squad_sas_score.csv') 
# files.download('squad_sas_score.csv')

In [None]:
# url = 'https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset/dev-v2.0.json'


## Bi-encoder

In [None]:
from sentence_transformers import SentenceTransformer
# Bi-encoder whose `encode` method is used below to embed both answers of each pair.
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

In [None]:
# List the contents of the dataset directory.
# NOTE(review): on a fresh top-to-bottom run this cell executes before the
# "Data Import" cell that downloads into `data`, so the directory may not exist yet.
!ls data/data/

## Data Import

In [None]:
from haystack.preprocessor.utils import fetch_archive_from_http

# Download the evaluation data archive and unpack it into `doc_dir`.
# The CSV files read later in this notebook are expected under `data/data/`.
doc_dir = "data"
# The URL previously started with a stray space; it is removed so the request
# does not depend on the HTTP client silently stripping whitespace.
s3_url = "https://semantic-answer-similarity.s3.amazonaws.com/data.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

In [None]:
import pandas as pd
# Read the three answer-pair datasets (SQuAD, NQ-open, GermanQuAD) from the
# extracted archive, in that order.
squad, nq_open, german_quad = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/data/SQuAD_SAS.csv',
        'data/data/NQ-open_SAS.csv',
        'data/data/GermanQuAD_SAS.csv',
    )
)

In [None]:
from sentence_transformers import util

# For every dataset: embed both answers of each pair with the bi-encoder,
# store the cosine similarity of the pair in a `bi_encoder` column, export the
# frame sorted by that score, and keep the scored frame in a dict keyed by name.
dataset_to_bi_encoder_similarity_scores = {}
# The GermanQuAD frame is bound to `german_quad` when the CSV is loaded; the
# previous `german_squad` reference was undefined. The 'german_squad' label is
# kept so file names and dictionary keys do not change.
for data, name in zip([squad, nq_open, german_quad],
                      ('squad', 'nq-open', 'german_squad')):
    scores_bi_encoder = data.copy()

    # Cast to str so non-string cells (numbers, NaN) can be encoded.
    answers1 = [str(answer) for answer in data['answer1']]
    answers2 = [str(answer) for answer in data['answer2']]

    # Compute embeddings for both lists
    embeddings1 = model.encode(answers1, convert_to_tensor=True)
    embeddings2 = model.encode(answers2, convert_to_tensor=True)

    # Compute cosine similarities (all-vs-all matrix)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

    # Entry (i, i) is the similarity of pair i. Assigning the whole diagonal
    # at once replaces the per-element chained assignment into an empty Series.
    scores_bi_encoder['bi_encoder'] = cosine_scores.diagonal().tolist()

    scores_bi_encoder.sort_values('bi_encoder').to_csv(f'{name}_bi_encoder_score.csv')
    files.download(f'{name}_bi_encoder_score.csv')
    dataset_to_bi_encoder_similarity_scores[name] = scores_bi_encoder

In [None]:

# Import here so this cell also works on a fresh kernel: it is otherwise
# executed before the cell that imports `compute_f1`.
from haystack.modeling.evaluation.squad_evaluation import compute_f1

# Quick check of compute_f1 on two different one-character strings.
compute_f1(str(4), str(5))

In [None]:


from haystack.modeling.evaluation.squad_evaluation import compute_f1

In [None]:

from sentence_transformers import util

# For every dataset: add a `bi_encoder` column (cosine similarity of the two
# answer embeddings) and an `f1` column (compute_f1 on the two answers),
# export the frame sorted by `bi_encoder`, and keep it in a dict keyed by name.
dataset_to_bi_encoder_similarity_scores = {}
# The GermanQuAD frame is bound to `german_quad` when the CSV is loaded; the
# previous `german_squad` reference was undefined. The 'german_squad' label is
# kept so file names and dictionary keys do not change.
for data, name in zip([squad, nq_open, german_quad],
                      ('squad', 'nq-open', 'german_squad')):
    scores_bi_encoder = data.copy()

    # Cast to str so non-string cells (numbers, NaN) can be encoded and scored.
    answers1 = [str(answer) for answer in data['answer1']]
    answers2 = [str(answer) for answer in data['answer2']]

    # Compute embeddings for both lists
    embeddings1 = model.encode(answers1, convert_to_tensor=True)
    embeddings2 = model.encode(answers2, convert_to_tensor=True)

    # Compute cosine similarities (all-vs-all matrix)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

    # Entry (i, i) is the similarity of pair i. Assigning the whole diagonal
    # at once replaces the per-element chained assignment into an empty Series.
    scores_bi_encoder['bi_encoder'] = cosine_scores.diagonal().tolist()

    scores_bi_encoder['f1'] = [
        compute_f1(answer1, answer2)
        for answer1, answer2 in zip(answers1, answers2)
    ]

    scores_bi_encoder.sort_values('bi_encoder').to_csv(f'{name}_bi_encoder_score.csv')
    files.download(f'{name}_bi_encoder_score.csv')
    dataset_to_bi_encoder_similarity_scores[name] = scores_bi_encoder


In [None]:

# temp['f1'] = temp.apply(lambda x: compute_f1(str(x.answer1), str(x.answer2)), axis=1)

In [None]:

from transformers import AutoConfig
# Models whose configs are inspected for a sequence-classification head.
model_names = ['sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
               'cross-encoder/stsb-roberta-large',
               'deepset/gbert-base',
               'deepset/gbert-large-sts',
               'T-Systems-onsite/cross-en-de-roberta-sentence-transformer'
              ]
configs = [AutoConfig.from_pretrained(model_name) for model_name in model_names]
# One boolean per declared architecture, in `model_names` order.
# `architectures` is optional on a config, so a missing value is treated as
# "no architectures" instead of raising on None.
[arch.endswith('ForSequenceClassification')
 for config in configs
 for arch in (config.architectures or [])]

## Cross-encoder

In [None]:
from sentence_transformers import CrossEncoder
# Cross-encoder whose `predict` output is stored in the `sas` columns below.
model_cross_encoder = CrossEncoder('deepset/gbert-large-sts')


In [None]:

# Working copy of the SQuAD frame.
# NOTE(review): `cross_encoder_sas` is not referenced again in the visible cells.
cross_encoder_sas = squad.copy()

In [None]:

# Cross-encoder score for every SQuAD answer pair, one pair per predict call
# (same procedure as the GermanQuAD and NQ-open cells).
# This used to be `series = _`, i.e. "whatever the last displayed output was",
# which does not reproduce on a fresh top-to-bottom run.
series = pd.Series(
    [model_cross_encoder.predict([str(answer1), str(answer2)],
                                 show_progress_bar=False)
     for answer1, answer2 in zip(squad.answer1.values, squad.answer2.values)])

In [None]:

# Copy of the SQuAD frame that receives the `sas` column.
sas_squad = squad.copy()

In [None]:

# Store the scores held in `series` as the `sas` column (aligned on the index).
sas_squad['sas'] = series

In [None]:

# Display the SQuAD pairs ordered from lowest to highest `sas`.
sas_squad.sort_values('sas')

In [None]:
from google.colab import files

# Write the SQuAD pairs sorted by `sas` to CSV and download the file through Colab.
sas_squad.sort_values('sas').to_csv('squad_sas_score.csv') 
files.download('squad_sas_score.csv')

In [None]:
from sklearn.metrics import f1_score

In [None]:

# NOTE(review): `_` is IPython's last-output variable, so this value depends on
# which cell was run last; `output` is not referenced again in the visible cells.
output = _

In [None]:

# Cross-encoder score for every GermanQuAD answer pair, one pair per predict call.
# The frame is bound to `german_quad` when the CSV is loaded; the previous
# `german_squad` name was never defined and raised a NameError.
series_german = pd.Series(
    [model_cross_encoder.predict([str(answer1), str(answer2)],
                                 show_progress_bar=False)
     for answer1, answer2 in zip(german_quad.answer1.values,
                                 german_quad.answer2.values)])

In [None]:
# F1 (via compute_f1) between the two answers of every NQ-open pair.
# Answers are cast to `str` first, as in the bi-encoder scoring cell, so that
# non-string cells (numbers, NaN) do not break the string-based scoring.
nq_open['f1'] = nq_open.apply(
    lambda x: compute_f1(str(x.answer1), str(x.answer2)), axis=1)

In [None]:

# Attach the cross-encoder scores to the GermanQuAD frame and export it sorted
# by score. The frame is bound to `german_quad` when the CSV is loaded; the
# previous `german_squad` name was never defined and raised a NameError.
# `assign` returns a new frame, so the loaded frame is not mutated in place.
german_quad = german_quad.assign(sas=series_german)

german_quad.sort_values('sas').to_csv('german_quad_sas.csv')
files.download('german_quad_sas.csv')

In [None]:


# Cross-encoder score for each NQ-open answer pair, scored one pair at a time.
series_nq_open = pd.Series([
    model_cross_encoder.predict([str(first), str(second)], show_progress_bar=False)
    for first, second in zip(nq_open.answer1.values, nq_open.answer2.values)
])

# New frame with the scores as `sas`, exported in ascending score order.
nq_open_sas = nq_open.assign(sas=series_nq_open)
nq_open_sas.sort_values('sas').to_csv('nq_open_sas.csv')
files.download('nq_open_sas.csv')

In [None]:
# series_german_quad = pd.Series(
#     [model_cross_encoder.predict([german_squad.answer1.values[i], 
#                                   german_squad.answer2.values[i]]) 
#                                  for i in range(len(german_squad))])

In [None]:
# Example call: score a single pair of sentences with the cross-encoder.
model_cross_encoder.predict(["sentence one.","sentence two."]) 

In [None]:
# Show the signature and documentation of `predict`.
# The previous bare `predict()` call had no arguments, so it always raised a
# TypeError and stopped a Run All execution at this cell.
help(model_cross_encoder.predict)

In [None]:

# Peek at the `answer1` value at position 5.
# NOTE(review): `data` is the loop variable left over from the bi-encoder cell,
# i.e. presumably the last dataset of that loop — confirm this is intended.
data.answer1.values[5:6]

In [None]:

# Peek at the `answer2` value at position 5.
# NOTE(review): `data` is the loop variable left over from the bi-encoder cell,
# i.e. presumably the last dataset of that loop — confirm this is intended.
data.answer2.values[5:6]

In [None]:
# Score the answer pair at position 5 as ONE (answer1, answer2) input.
# The previous call passed two 1-element arrays, which `predict` treats as two
# separate single-text inputs rather than as one pair, so the result was not
# the similarity of the two answers. Values are cast to `str` as in the other
# cross-encoder cells.
model_cross_encoder.predict([str(data.answer1.values[5]), str(data.answer2.values[5])])

# BERTScore

In [None]:
# Install the bert_score package used in this section.
# NOTE(review): unpinned and installed mid-notebook — consider moving it to the install cell with a version pin.
!pip install bert_score

In [None]:
from bert_score import score


In [None]:

# Show the documentation of bert_score's `score` function.
help(score)

In [None]:

# Display the frame currently bound to `data`.
# NOTE(review): `data` is the loop variable left over from the bi-encoder cell — confirm which dataset is meant.
data

## Vanilla BERTScore

In [None]:
# BERTScore F1 (bert-base-uncased, layer 2) for every SQuAD answer pair.
# All pairs go through a single `score` call: calling it once per pair loads
# the model again on every iteration. Answers are cast to `str`, as in the
# GermanQuAD and NQ-open cells, so non-string cells do not break tokenisation.
_, _, squad_f1 = score(
    [str(answer) for answer in squad.answer1],
    [str(answer) for answer in squad.answer2],
    model_type='bert-base-uncased',
    num_layers=2)
# Keep a list of per-pair tensors so `len(...)` and `.item()` on each element
# keep working in the following cells.
series_bert_score = list(squad_f1)

In [None]:
# Number of scored pairs.
len(series_bert_score)

In [None]:
# New SQuAD frame with the BERTScore values unpacked from their tensors into
# a `bert_score` column.
squad_bs = squad.assign(
    bert_score=[f1_tensor.item() for f1_tensor in series_bert_score])

# Export in ascending score order and download through Colab.
squad_bs.sort_values('bert_score').to_csv('squad_bert_score.csv')
files.download('squad_bert_score.csv')

In [None]:
# BERTScore F1 (deepset/gelectra-base, layer 2) for every GermanQuAD answer pair.
# All pairs go through a single `score` call: calling it once per pair loads
# the model again on every iteration.
_, _, german_f1 = score(
    [str(answer) for answer in german_quad.answer1],
    [str(answer) for answer in german_quad.answer2],
    model_type='deepset/gelectra-base',
    num_layers=2)
# Keep a list of per-pair tensors so `.item()` on each element keeps working
# in the following cell.
german_series_bert_score = list(german_f1)

In [None]:
# New GermanQuAD frame with the BERTScore values unpacked from their tensors
# into a `bert_score` column.
german_quad_bert = german_quad.assign(
    bert_score=[f1_tensor.item() for f1_tensor in german_series_bert_score])

# Export in ascending score order and download through Colab.
german_quad_bert.sort_values('bert_score').to_csv('german_quad_bert.csv')
files.download('german_quad_bert.csv')

In [None]:

# Display the NQ-open frame that carries the `sas` column.
nq_open_sas

In [None]:
# BERTScore F1 (bert-base-uncased, layer 2) for every NQ-open answer pair.
# All pairs go through a single `score` call: calling it once per pair loads
# the model again on every iteration.
_, _, nq_open_f1 = score(
    [str(answer) for answer in nq_open.answer1],
    [str(answer) for answer in nq_open.answer2],
    model_type='bert-base-uncased',
    num_layers=2)
# Plain Python floats, one per pair.
nq_open_series_bert_score = nq_open_f1.tolist()

# Add the scores to a copy of the frame that already carries the `sas` column.
nq_open_bert = nq_open_sas.copy()
nq_open_bert['bert_score'] = nq_open_series_bert_score

nq_open_bert.sort_values('bert_score').to_csv('nq_open_bert.csv')
files.download('nq_open_bert.csv')

## BERTScore Trained 

In [None]:
# BERTScore F1 for every NQ-open pair with bert-base-uncased and no explicit
# `num_layers`, so bert_score selects the layer itself.
# All pairs go through a single `score` call: calling it once per pair loads
# the model again on every iteration.
_, _, nq_open_f1_prime = score(
    [str(answer) for answer in nq_open.answer1],
    [str(answer) for answer in nq_open.answer2],
    model_type='bert-base-uncased')
# Plain Python floats, one per pair.
nq_open_bert_trained = nq_open_f1_prime.tolist()

nq_open_bert_trained_df = nq_open_bert.copy()
# Store the scores computed in THIS cell. The previous code assigned
# `nq_open_series_bert_score`, so `bert_score_prime` duplicated `bert_score`.
nq_open_bert_trained_df['bert_score_prime'] = nq_open_bert_trained

nq_open_bert_trained_df.sort_values('bert_score_prime').to_csv('nq_open_bert_trained.csv')
files.download('nq_open_bert_trained.csv')

In [None]:
from datasets import load_metric
# Load the "bertscore" metric through the datasets library.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the `evaluate` package — confirm against the installed version.
metric = load_metric("bertscore")

In [None]:
# Show the signature and documentation of bert_score's `score`.
# The previous bare `score()` call had no arguments, so it always raised a
# TypeError and stopped a Run All execution at this cell.
help(score)

In [None]:
# Compute the bertscore metric for the first GermanQuAD answer pair with a custom model.
# NOTE(review): no `num_layers` is passed; for a model that bert_score has no
# default layer for, this presumably fails on the layer lookup — confirm and pass `num_layers` if needed.
metric.compute(
        predictions=[str(german_quad.answer1[0])], 
        references=[str(german_quad.answer2[0])], 
        model_type='T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
        # model_type='deepset/gelectra-base')


In [None]:
# BERTScore F1 for every GermanQuAD answer pair using bert_score's default
# model for German (`lang='de'`).
# The scoring loop had been commented out while `german_quad_bert_trained` was
# still used below, which raised a NameError on a fresh run. It is restored
# here as a single batched `score` call with the same arguments.
_, _, german_f1_prime = score(
    [str(answer) for answer in german_quad.answer1],
    [str(answer) for answer in german_quad.answer2],
    # Alternatives that were tried instead of `lang`:
    # model_type='T-Systems-onsite/cross-en-de-roberta-sentence-transformer'
    # model_type='deepset/gelectra-base'
    lang='de',
    verbose=False)
# Plain Python floats, one per pair.
german_quad_bert_trained = german_f1_prime.tolist()

german_quad_bert_trained_df = german_quad.copy()
german_quad_bert_trained_df['bert_score_prime'] = german_quad_bert_trained

german_quad_bert_trained_df.sort_values('bert_score_prime').to_csv('german_quad_bert_trained.csv')
files.download('german_quad_bert_trained.csv')