# Evaluation of Re-written Queries



In this notebook, we load the raw and re-written conversations and evaluate each set of queries with both LMD and the respective BERT re-ranking. The query sets being compared are:

1.   Concatenate the first user utterance to each turn of the dialog.
2.   Concatenate the entities of the first user utterance to each turn of the dialog.
3.   Re-write each turn with the T5 model.
4.   Raw data (no re-writing).


#### 1. Mount drive and load necessary libraries

In [None]:
# Install the google-colab package, which provides the `google.colab` module imported in the setup cell.
# NOTE(review): presumably already available on a hosted Colab runtime — confirm whether this cell is needed.
!pip install google-colab



In [None]:
# Colab Setup
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# After downloading the shared starting point folder as a Zip
# Unzip it and re-upload it to a location on your GDrive

# This command copies the contents from the folder you uploaded to GDrive, to the colab working dir
!cp -r /content/drive/My\ Drive/ProjectoRI2020 /content

# Add working dir to the sys path, so that we can find the aux python files when running the Notebook
import sys
if not '/content/ProjectoRI2020' in sys.path:
  sys.path += ['/content/ProjectoRI2020']

# Finally install required dependencies to run the notebook
!pip install elasticsearch
!pip install bert-serving-client
!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Get the interactive Tools for Matplotlib
# Two backend magics are issued back to back.
# NOTE(review): presumably the later `inline` selection is the effective one — confirm and drop the other.
%matplotlib notebook
%matplotlib inline

# Imports
# Project-local helper modules (the setup cell adds their folder to sys.path)
import TRECCASTeval as trec
from ModelResults import get_LMD_Results
from visualMetrics import *
import ElasticSearchSimpleAPI as es
import pandas as pd
import pprint
# Notebook-wide Elasticsearch helper instance used by the search functions below
elastic = es.ESSimpleAPI()

import numpy as np
# Rebinds the name `pprint` from the module (imported above) to the function
from pprint import pprint 


import matplotlib.pyplot as plt
plt.style.use('ggplot')

from transformers import BertTokenizer, BertModel
import torch

# `BertModel` is imported a second time here; the duplicate is harmless
from transformers import BertTokenizerFast, BertModel

from bert_serving.client import BertClient

import pickle 
import os

In [None]:
# Select TensorFlow 2.x through the Colab magic, then install the pinned t5 release
%tensorflow_version 2.x
!pip install t5==0.5.0
import tensorflow as tf
# Not referenced by name in the visible cells.
# NOTE(review): presumably imported for its import-time side effects — confirm.
import tensorflow_text



#### 2. Load re-written conversations and Logistic regression models

In [None]:
def _load_pickle(pickle_path):
  """Return the object stored in the pickle file at `pickle_path`.

  The file is opened in a `with` block so the handle is closed even when
  unpickling raises.
  """
  # SECURITY: pickle.load can execute arbitrary code; only load files you trust.
  with open(pickle_path, 'rb') as filehandler:
    return pickle.load(filehandler)


# Load raw queries
raw_data = _load_pickle('/content/ProjectoRI2020/pickles/raw_data.pkl')

# Load re-written queries from method (1)
rw_test_1stTurn = _load_pickle('/content/ProjectoRI2020/pickles/conversations.pkl')

# Load re-written queries from method (2)
rw_test_enteties = _load_pickle('/content/ProjectoRI2020/pickles/conversations_entities.pkl')

# Load re-written queries from method (3)
rw_test_T5 = _load_pickle('/content/ProjectoRI2020/pickles/t5_queries.pkl')

# Load original logistic regression model for BERT
reg_orig = _load_pickle('/content/ProjectoRI2020/pickles/bert_classifier2.pkl')

#### 3. Evaluate re-written queries

### Helpers for Entities

In [None]:
import pandas as pd

# `pandas.io.json.json_normalize` is deprecated in favour of the top-level
# `pandas.json_normalize`; keep the old name bound for any cell that uses it.
json_normalize = pd.json_normalize


#Search with Entities
def search_with_boosted_entities(query_text, entities_list, boost_list, numDocs=10):
  """Search the 'msmarco' index for `query_text`, adding one boosted clause per entity.

  The request is a bool/should query: one plain match on `body` for the
  query text, plus one match on `body` for every entity, each carrying
  its own boost.

  Args:
    query_text: text of the main match clause.
    entities_list: non-empty list of entity strings.
    boost_list: boosts (int or float), one per entity, in the same order.
    numDocs: number of hits requested from Elasticsearch.

  Returns:
    DataFrame produced by flattening the returned hits.

  Raises:
    AssertionError: if the lists differ in length, are empty, or their
      first elements have the wrong type.
  """
  assert len(entities_list) == len(boost_list)
  assert len(entities_list) > 0
  assert isinstance(entities_list[0], str)
  assert isinstance(boost_list[0], (int,float))

  should_clauses = [{"match": {"body": query_text}}]
  for entity, boost in zip(entities_list, boost_list):
    # Build a brand-new clause per entity: reusing one template and
    # shallow-copying it makes every clause share the same inner dict,
    # so all of them end up with the last entity and boost.
    should_clauses.append({"match": {"body": {"query": entity, "boost": boost}}})

  entities_query = {"query": {"bool": {"should": should_clauses}}}
  result = elastic.client.search(index='msmarco', body=entities_query, size=numDocs)
  return pd.json_normalize(result["hits"]["hits"])

In [None]:
from collections import defaultdict  
import spacy

# Fresh Elasticsearch helper, evaluation test bed and spaCy pipeline
elastic = es.ESSimpleAPI()
test_bed = trec.ConvSearchEvaluation()
nlp = spacy.load('en_core_web_sm')
conversations_entities ={}
test_topics = test_bed.test_topics

# Regroup the topics field by field: res[field] holds that field's value
# from every topic, in topic order
res = defaultdict(list) 
for sub in test_topics:
  for key in sub:
    res[key].append(sub[key])
utterances = {}
queries_entities = []

# Named entities of the first turn of every conversation, as (text, label) tuples
for topic_turns in res['turn']:
  utterance = topic_turns[0]['raw_utterance']
  entities = [(ent.text, ent.label_) for ent in nlp(utterance).ents]
  queries_entities.append(entities)


### Evaluate results

In [None]:
# bert_model_name = 'bert-base-cased'
bert_model_name = 'nboost/pt-bert-base-uncased-msmarco'
CLS_token = "[CLS]"
SEP_token = "[SEP]"

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model named above (single source of truth for the checkpoint name)
model = BertModel.from_pretrained(bert_model_name, return_dict=True)
model = model.to(device)

# Plain int; a trailing comma here would make this the tuple (512,)
max_length = 512  # maximum length of a sentence
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)

In [None]:
# Plain int; a trailing comma here would make this the tuple (512,)
max_length = 512  # maximum length of a sentence
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
def convert_to_bert_input(sentences, max_seq_length, tokenizer, add_cls, return_tensors="pt"):
    """Build BERT inputs for a sequence of sentences joined by [SEP] tokens.

    Each sentence is tokenized with a trailing SEP token. The i-th sentence
    (including its SEP) gets token type id i. When `add_cls` is true a CLS
    token with type id 0 is prepended.

    Args:
        sentences: iterable of strings, e.g. [query, passage].
        max_seq_length: length budget; sequences longer than
            `max_seq_length - 3` tokens are cut to that length, keeping
            the final SEP token.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
        add_cls: whether to prepend the CLS token.
        return_tensors: "pt" returns tensors of shape (1, n) on the
            notebook-level `device`; any other value returns plain lists.

    Returns:
        dict with "input_ids", "token_type_ids" and "attention_mask",
        all of the same length.
    """
    # Tokenize every sentence together with its closing SEP token
    sentences_tokens = [tokenizer.tokenize(s + SEP_token) for s in sentences]

    # Flatten tokens and segment ids in one pass each
    tokens = [tok for seg in sentences_tokens for tok in seg]
    token_type_ids = [seg_idx for seg_idx, seg in enumerate(sentences_tokens) for _ in seg]

    if add_cls:
        tokens = [CLS_token] + tokens
        # The CLS type id is added only together with the CLS token, so the
        # two lists stay the same length when add_cls is False
        token_type_ids = [0] + token_type_ids

    # Remove tokens if the budget is exceeded. The "- 3" margin is kept
    # unchanged so features stay comparable with previously computed ones.
    if len(tokens) > max_seq_length - 3:
        keep = max_seq_length - 4
        tokens = tokens[:keep] + [tokens[-1]]  # keep SEP token
        # The kept SEP carries the segment id it originally had
        token_type_ids = token_type_ids[:keep] + [token_type_ids[-1]]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Attention mask -> every token is real, there is no padding
    attention_mask = [1]*len(tokens)

    if return_tensors == "pt":
        input_ids = torch.tensor([input_ids], dtype=torch.long, device=device)
        token_type_ids = torch.tensor([token_type_ids], dtype=torch.long, device=device)
        attention_mask = torch.tensor([attention_mask], dtype=torch.long, device=device)

    data = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask
    }
    return data

In [None]:
def evaluate_query_BERT(path, rw_test, lr_model, label):
  """Re-rank stored candidate passages with BERT features and evaluate them.

  For every topic in `rw_test` and each of its utterances, the candidate
  file `path + '<topic>_<turn>.csv'` is read and each passage in its
  `_source.body` column is paired with the utterance. The first-position
  (CLS) vector of the model's last hidden state is fed to `lr_model`, and
  the probability in column 1 becomes the ranking score.

  Per turn, the re-ranked table is written to
  `path + 'BERT-<label>_<topic>_<turn>.csv'`; per topic, the collected
  metrics are written to `path + 'stats_<topic>_BERT_<label>.csv'`.

  Args:
    path: directory prefix (with trailing separator) for input and output files.
    rw_test: mapping topic -> sequence of utterances, one per turn.
    lr_model: classifier exposing `predict_proba`.
    label: tag embedded in the output file names.

  NOTE(review): the input file name does not depend on `label`, so every
  query variant re-ranks the same candidate list — confirm this is intended.
  """
  test_bed = trec.ConvSearchEvaluation()
  stat_columns = ['p10', 'recall', 'ap', 'ndcg5']

  print('======= evaluate_query_BERT =======')
  for topic in rw_test.keys():

    # Collect one dict per evaluated turn and build the frame once at the end
    stat_rows = []
    for turn_id, utterance in enumerate(rw_test[topic]):
      try:

        # turn_id is zero-based here, so this keeps turns 1..9
        if turn_id > 8:
          continue

        print(topic)
        print(turn_id + 1, utterance)
        data = pd.read_csv(path + f'{topic}_{turn_id+1}.csv')
        if data.empty:
          # Nothing to re-rank; predict_proba would fail on zero rows
          print('no candidate passages, skipping')
          continue

        # CLS embedding of every (utterance, passage) pair. no_grad avoids
        # building autograd graphs that are never used during inference.
        cls_vectors = []
        with torch.no_grad():
          for passage in data['_source.body']:
            inputs_qa = convert_to_bert_input(
                [utterance, passage], 512, tokenizer, True, return_tensors="pt")
            hidden = model(**inputs_qa).last_hidden_state
            cls_vectors.append(hidden[:, 0, :].cpu().numpy()[0])

        # float64 matrix, one row per passage; width follows the model
        features = np.vstack(cls_vectors).astype(np.float64)

        # Predict probabilities using the logistic regression model
        predicted_probabilities = lr_model.predict_proba(features)
        data['logistic regression'] = predicted_probabilities[:,1]
        # Save the re-ranked candidates and their metrics
        data = data.sort_values(["logistic regression"], ascending = [False])
        data.to_csv(path + 'BERT-'+label+f'_{topic}_{turn_id+1}.csv')
        [p10, recall, ap, ndcg5] = test_bed.eval(data[['_id','logistic regression']], f'{topic}_{turn_id+1}')
        stat_rows.append({'p10': p10, 'recall': recall, 'ap': ap, 'ndcg5': ndcg5})

      except FileNotFoundError:
        print('a File not found!')

    stats = pd.DataFrame(stat_rows, columns=stat_columns)
    stats.to_csv(path + f'stats_{topic}_BERT_'+label+'.csv')
    

In [None]:
def evaluate_query_LMD(path, rw_test, label):
  """Run first-stage retrieval for every evaluated turn and store metrics.

  Only the conversations listed in `evaluated_conv_ids` are processed, and
  within them only turns numbered 1..8 that have at least one relevant
  judgment. Per turn, the retrieved documents are written to
  `path + 'LMD-<label>_<conv>_<turn>.csv'`; per conversation, the metrics
  are written to `path + 'stats_<conv>_LMD_<label>.csv'`.

  Args:
    path: directory prefix (with trailing separator) for output files.
    rw_test: mapping conversation id -> sequence of query strings, indexed
      by the turn's position in the topic.
    label: tag embedded in output file names. The value 'enteties' switches
      to the entity-boosted search.

  A failing turn is reported and skipped instead of aborting the whole run.
  """
  test_bed = trec.ConvSearchEvaluation()
  evaluated_conv_ids = (31, 32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61, 67, 68, 69, 75, 77, 78, 79)
  stat_columns = ['p10', 'recall', 'ap', 'ndcg5']
  judgments = test_bed.test_relevance_judgments

  print('======= evaluate_query_LMD =======')
  #if not '31_1.csv' in os.listdir(path):
  for topic_idx, topic in enumerate(test_bed.test_topics):
    conv_id = topic['number']
    if conv_id not in evaluated_conv_ids:
        continue

    # Collect one dict per evaluated turn and build the frame once at the end
    stat_rows = []
    for turn_idx, turn in enumerate(topic['turn']):
      try:
        turn_id = turn['number']
        topic_turn_id = '%d_%d'% (conv_id, turn_id)

        if turn_id > 8:
          continue

        print(topic_turn_id)
        aux = judgments.loc[judgments['topic_turn_id'] == (topic_turn_id)]
        num_rel = aux.loc[aux['rel'] != 0]['docid'].count()

        if num_rel == 0:
            continue

        # Query for this turn comes from the set passed in by the caller
        query_text = rw_test[conv_id][turn_idx]

        # IF ENTITIES, THEN WE HAVE TO USE A SPECIAL METHOD!
        if label == 'enteties':
          # queries_entities holds (text, label) tuples per topic, in
          # test_topics order; only the entity text is searched.
          # NOTE(review): assumes this test bed lists topics in the same
          # order as the one used to build queries_entities — confirm.
          entity_texts = [entity[0] for entity in queries_entities[topic_idx]]
          if entity_texts:
            boost_list = [1.0] * len(entity_texts)
            # Same depth as the plain search so metrics are comparable
            result = search_with_boosted_entities(query_text, entity_texts, boost_list, numDocs=1000)
          else:
            # No entities in the first utterance: plain search
            result = elastic.search_body(query=query_text, numDocs = 1000)
        else:
          result = elastic.search_body(query=query_text, numDocs = 1000)

        # Compute stats
        [p10, recall, ap, ndcg5] = test_bed.eval(result[['_id','_score']], topic_turn_id)
        stat_rows.append({'p10': p10, 'recall': recall, 'ap': ap, 'ndcg5': ndcg5})
        result.to_csv(path + 'LMD-'+label+f'_{topic_turn_id}.csv')
      except Exception as exc:
        # Best effort: keep going, but say which turn failed and why
        print(f'Skipping turn {turn_idx + 1} of conversation {conv_id}: {exc!r}')

    stats = pd.DataFrame(stat_rows, columns=stat_columns)
    stats.to_csv(path+f'stats_{conv_id}_LMD_'+label+'.csv')

In [None]:
# Run query evaluation
#
path = '/content/drive/MyDrive/ProjectoRI2020/results/'

# One row per query set: (queries, label for the LMD run, label for the BERT run)
query_sets = [
    (raw_data, 'raw', 'raw-regOri'),
    (rw_test_1stTurn, '1stTurn', '1stTurn-regOri'),
    (rw_test_enteties, 'enteties', 'enteties-regOri'),
    (rw_test_T5, 'T5', 'T5-regOri'),
]

# LMD: evaluate every query set first
for queries, lmd_label, bert_label in query_sets:
  evaluate_query_LMD(path, queries, lmd_label)

# BERT: then re-rank every query set with the original logistic regression model
for queries, lmd_label, bert_label in query_sets:
  evaluate_query_BERT(path, queries, reg_orig, bert_label)

31_1
31_2
31_3
31_4
31_5
31_6
31_7
31_8
32_1
32_2
32_3
32_4
32_5
32_6
32_7
32_8
33_1
33_2
33_3
33_4
33_5
33_6
33_7
33_8
34_1
34_2
34_3
34_4
34_5
