# Drive mounting and imports

In [1]:
!pip install google-colab



In [2]:
# Colab Setup
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# After downloading the shared starting point folder as a Zip
# Unzip it and re-upload it to a location on your GDrive

# This command copies the contents from the folder you uploaded to GDrive, to the colab working dir
!cp -r /content/drive/My\ Drive/ProjectoRI2020 /content

# Add working dir to the sys path, so that we can find the aux python files when running the Notebook
import sys
if not '/content/ProjectoRI2020' in sys.path:
  sys.path += ['/content/ProjectoRI2020']

# Finally install required dependencies to run the notebook
!pip install elasticsearch
!pip install bert-serving-client
!pip install transformers

Mounted at /content/drive
Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/86/74/054342aa07121f7c82e30ae63e3f257a793a69ddc11c5065449252dcd8af/elasticsearch-7.10.1-py2.py3-none-any.whl (322kB)
[K     |████████████████████████████████| 327kB 15.5MB/s 
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.10.1
Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 13.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.

In [3]:
# Imports
import TRECCASTeval as trec
import ElasticSearchSimpleAPI as es
import pandas as pd
import pickle
from collections import defaultdict  
import spacy

# Query Re-Write Method 1: Concatenate the first user utterance to each turn of the dialog.

In [4]:
# Concatenate the first user utterance to each turn of the dialog.
elastic = es.ESSimpleAPI()
test_bed = trec.ConvSearchEvaluation()
test_topics = test_bed.test_topics
utterances={}
conversations ={}
for i in test_topics:
    utterances.update(i)
    concatenating_strings = [utterances['turn'][0]['raw_utterance']]
    for i in range(len(utterances['turn'])-1):
      first_utterance = utterances['turn'][0]['raw_utterance']
      next_utterance = utterances['turn'][i+1]['raw_utterance']
      join_utterance = str(first_utterance) + ", "+ str(next_utterance)
      concatenating_strings.append(join_utterance)
    conversations[utterances['number']] = concatenating_strings

conversations

# Save to pickle
filehandler = open('/content/ProjectoRI2020/conversations.pkl', 'wb') 
pickle.dump(conversations, filehandler)
filehandler.close()

# Query Re-Write Method 2: Concatenate the entities of the first user utterance to each turn of the dialog.

In [5]:
##Entidades da primeira query e depois podemos ir acrescentando das outras como for melhor para os resultados
nlp = spacy.load('en_core_web_sm')
conversations_entities ={}
test_topics = test_bed.test_topics
res = defaultdict(list) 
#Get only the first conversational turn
{res[key].append(sub[key]) for sub in test_topics for key in sub}  
utterances = {}
queries_entities = []

#getting entities
for i in range(len(res['turn'])):
  utterance = res['turn'][i][0]['raw_utterance']
  entities = nlp(utterance).ents
  entities = [(e.text, e.start_char, e.end_char, e.label_) for e in entities]
  entities = [tuple(element for element in sub if type(element) != int) for sub in entities] 
  queries_entities.append(entities) 

w=0
#concatenation first conversational turn entities with the following 
for j in test_topics:
  utterances.update(j)
  concatenating_entities = []
  for k in range(len(utterances['turn'])):
      next_utterance = utterances['turn'][k]['raw_utterance']
      #If we want to remove the empty ones
      """if queries_entities[w] == []:
         concatenating_entities.append(next_utterance)
      else:"""
      join_utterance = next_utterance + ", " + str(queries_entities[w])
      concatenating_entities.append(join_utterance)
  conversations_entities[utterances['number']] = concatenating_entities

  w+=1

conversations_entities

# Save to pickle
filehandler = open('/content/ProjectoRI2020/pickles/conversations_entities.pkl', 'wb') 
pickle.dump(conversations_entities, filehandler)
filehandler.close()

# Query Re-Write Method 3: T5 model.

In [6]:
%tensorflow_version 2.x
!pip install t5==0.5.0
import tensorflow as tf
import tensorflow_text

Collecting t5==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/93/e25d0043ece218a94d14f710c553bbcc7974d2ff169b4aa8f8d1423dca95/t5-0.5.0-py3-none-any.whl (142kB)
[K     |██▎                             | 10kB 25.9MB/s eta 0:00:01[K     |████▋                           | 20kB 31.7MB/s eta 0:00:01[K     |██████▉                         | 30kB 36.9MB/s eta 0:00:01[K     |█████████▏                      | 40kB 31.3MB/s eta 0:00:01[K     |███████████▌                    | 51kB 32.2MB/s eta 0:00:01[K     |█████████████▊                  | 61kB 35.1MB/s eta 0:00:01[K     |████████████████                | 71kB 26.5MB/s eta 0:00:01[K     |██████████████████▍             | 81kB 24.8MB/s eta 0:00:01[K     |████████████████████▋           | 92kB 26.4MB/s eta 0:00:01[K     |███████████████████████         | 102kB 23.1MB/s eta 0:00:01[K     |█████████████████████████▏      | 112kB 23.1MB/s eta 0:00:01[K     |███████████████████████████▌    | 122kB 23.1MB/

In [10]:
!rm -rf /content/t5-canard*
!cp -r /content/ProjectoRI2020/t5-canard.zip /content/t5-canard.zip
!unzip /content/ProjectoRI2020/t5-canard.zip

Archive:  /content/ProjectoRI2020/t5-canard.zip
   creating: t5-canard/
  inflating: t5-canard/saved_model.pb  
   creating: t5-canard/variables/
 extracting: t5-canard/variables/variables.data-00000-of-00002  
  inflating: t5-canard/variables/variables.data-00001-of-00002  
  inflating: t5-canard/variables/variables.index  


In [11]:
class QueryRewriterT5:
  def __init__(self, model_path="/content/t5-canard"):
    """
      Loads T5 model for prediction
      Returns the model
    """
    if tf.executing_eagerly():
        print("Loading SavedModel in eager mode.")
        imported = tf.saved_model.load(model_path, ["serve"])
        self.t5_model = lambda x: imported.signatures['serving_default'](tf.constant(x))['outputs'].numpy()
    else:
        print("Loading SavedModel in tf 1.x graph mode.")
        tf.compat.v1.reset_default_graph()
        sess = tf.compat.v1.Session()
        meta_graph_def = tf.compat.v1.saved_model.load(sess, ["serve"], model_path)
        signature_def = meta_graph_def.signature_def["serving_default"]
        self.t5_model = lambda x: sess.run(
            fetches=signature_def.outputs["outputs"].name,
            feed_dict={signature_def.inputs["input"].name: x}
        )
  
  """
    query: str - the query string to be rewritten using T5
    ctx_list: list - A list of strings containing the turns or text to give context to T5
    Returns a string with the rewritten query
  """
  def rewrite_query_with_T5(self, _curr_query, _ctx_list):
    _t5_query = '{} [CTX] '.format(_curr_query) + ' [TURN] '.join(_ctx_list)
    print("Query and context: {}".format(_t5_query))
    return self.t5_model([_t5_query])[0].decode('utf-8')

  """
    queries_list: list - A list of strings containing the raw utterances ordered from first to last
    Returns a list of strings with the rewritten queries
  """
  def rewrite_dialog_with_T5(self, _queries_list):
    _rewritten_queries_list=[]
    for i in range(len(_queries_list)):
      _current_query = _queries_list[i]
      _rewritten_query = self.rewrite_query_with_T5(_current_query, _queries_list[:i])
      print("Rewritten query: {}\n".format(_rewritten_query))
      _rewritten_queries_list.append(_rewritten_query)
    return _rewritten_queries_list

In [15]:
# Load model
rewriter = QueryRewriterT5('/content/ProjectoRI2020/t5-canard/')

Loading SavedModel in eager mode.


OSError: ignored

In [16]:
rewriten_test_bed = {}
for i, topic in enumerate(test_bed.test_topics):
    conv_id = topic['number']
    if conv_id not in (31,32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61, 67, 68, 69, 75, 77, 78, 79):
        continue

    print(f"### Topic {conv_id} in Test Bed ###")  
    print(test_bed.test_topics[i]['turn'])
    utterances = [t['raw_utterance'].strip() for t in test_bed.test_topics[i]['turn']]

    rewriten = rewriter.rewrite_dialog_with_T5(utterances)
    rewriten_test_bed[conv_id] = rewriten

rewriten_test_bed

# Save to pickle
filehandler = open('/content/ProjectoRI2020/pickles/t5_queries.pkl', 'wb') 
pickle.dump(rewriten_test_bed, filehandler)
filehandler.close()

### Topic 31 in Test Bed ###
[{'number': 1, 'raw_utterance': 'What is throat cancer?'}, {'number': 2, 'raw_utterance': 'Is it treatable?'}, {'number': 3, 'raw_utterance': 'Tell me about lung cancer.'}, {'number': 4, 'raw_utterance': 'What are its symptoms? '}, {'number': 5, 'raw_utterance': 'Can it spread to the throat?'}, {'number': 6, 'raw_utterance': 'What causes throat cancer?'}, {'number': 7, 'raw_utterance': 'What is the first sign of it?'}, {'number': 8, 'raw_utterance': 'Is it the same as esophageal cancer?'}, {'number': 9, 'raw_utterance': "What's the difference in their symptoms?"}]


NameError: ignored