## Rewriting strategy, no classification is used, with memory

In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import csv

# Load the three types of utterances (automatic, manual, raw)

In [2]:
automatic_utt_file = "./data/automatic_conv.tsv"
manual_utt_file = "./data/manual_conv.tsv"
raw_utt_file = "./data/raw_conv.tsv"

# All three files are header-less, tab-separated tables and are parsed with
# the same options, in the same order as before: automatic, manual, raw.
auto_df, manual_df, raw_df = (
    pd.read_csv(utt_file, delimiter="\t", skip_blank_lines=True, header=None)
    for utt_file in (automatic_utt_file, manual_utt_file, raw_utt_file)
)

In [3]:
# Preview the first rows of the automatically rewritten utterances
# (column 0 holds the utterance id, column 1 the utterance text).
auto_df.head(20)

Unnamed: 0,0,1
0,132_1-1,What was Glasgow hosting COP26 about?
1,132_1-3,What are the effects of COP26?
2,132_1-5,Can you be more specific about the future prob...
3,132_1-7,"Woah. They’re not all bad, right?"
4,132_1-1,What was Glasgow hosting COP26 about?
5,132_1-3,What are the effects of COP26?
6,132_2-1,What are the future problems caused by rising ...
7,132_2-3,"Okay, but how does COP26 affect developing cou..."
8,132_2-5,How are developed countries helping with local...
9,132_2-7,Are developing countries meeting the Paris Agr...


In [4]:
# Lookup tables: utterance id (column 0) -> utterance text (column 1).
# When an id appears on several rows, the last row's text is the one kept.
auto_dict = {utt_id: text for utt_id, text in zip(auto_df[0], auto_df[1])}
manual_dict = {utt_id: text for utt_id, text in zip(manual_df[0], manual_df[1])}
raw_dict = {utt_id: text for utt_id, text in zip(raw_df[0], raw_df[1])}

# NLP utils and topic extraction

In [5]:
import spacy
# Module-level spaCy pipeline (small English model); create_doc below wraps it.
nlp = spacy.load("en_core_web_sm")

def create_doc(utt):
    """Run the module-level spaCy pipeline ``nlp`` on ``utt`` and return its result."""
    return nlp(utt)

In [6]:
third_person_prons = ["he", "she", "it", "they", "him", "her", "them", "his", "her", "its", "their"]

def _rewrite_utt(doc, first_topic="", previous_topic="", context_list=None, trailing=False):
        new_utt = ""
        for token in doc:
            if (token.tag_ == "PRP" or token.tag_ == "PRP$") \
                    and token.text in third_person_prons:
                if previous_topic != "":
                    new_utt += previous_topic + " "
                if first_topic != "":
                    new_utt += first_topic + " "
            else:
                new_utt += token.text + " "

        # TRAILING THE TOPIC
        if trailing:
            if previous_topic.lower() not in new_utt.lower():
                new_utt += previous_topic + " "
            if first_topic.lower() not in new_utt.lower():
                new_utt += first_topic

        # TRAILING THE CONTEXT
        if context_list is not None:
            new_utt += " ".join(context_list)

        return new_utt

In [7]:
def _find_cue_topic(doc):
    cue_phrases = ["tell me more about", "tell me about", "what about", "how about"]
    third_person_prons = ["he", "she", "it", "they", "him", "her", "them", "his", "her", "its", "their"]
    
    current_topic = ""
    pron = False
    for cue in cue_phrases:
        if cue in str(doc).lower():
            # check if pron:
            for token in doc:
                if (token.tag_ == "PRP" or token.tag_ == "PRP$") and \
                        token.text in third_person_prons:
                    pron = True
            if not pron:
                current_topic = str(doc).lower().replace(cue, "").replace(".", "")
    return current_topic


def _find_topic(doc):
    """Return the topic string of a parsed utterance.

    Sources are tried in order and the first non-empty one is returned:
      1. the cue-phrase topic from ``_find_cue_topic``;
      2. every noun chunk whose root is not a pronoun, each prefixed by a space;
      3. every token that is neither a verb nor punctuation, each followed by
         a space (covers inputs such as "Describe Uranus.").
    """
    cue_topic = _find_cue_topic(doc)
    if cue_topic != "":
        return cue_topic

    chunk_topic = "".join(
        " " + chunk.text
        for chunk in doc.noun_chunks
        if chunk.root.pos_ != "PRON"
    )
    if chunk_topic != "":
        return chunk_topic

    # No usable noun chunk: fall back to the non-verb, non-punctuation tokens.
    return "".join(
        token.text + " "
        for token in doc
        if token.pos_ not in ["VERB", "PUNCT"]
    )


def _find_topic_all(doc):
    """Return the topic string of a parsed text.

    Currently applies exactly the same steps as ``_find_topic``: cue-phrase
    topic first, then the non-pronoun noun chunks (each prefixed by a space),
    then, if still empty, every token that is neither a verb nor punctuation
    (each followed by a space).
    """
    # Cue phrases take precedence over any other topic source.
    cue_topic = _find_cue_topic(doc)
    if cue_topic != "":
        return cue_topic

    topic = ""
    for chunk in doc.noun_chunks:
        if chunk.root.pos_ != "PRON":
            topic += " " + chunk.text

    # No noun chunk found - trick for inputs like "Describe Uranus."
    if topic == "":
        for token in doc:
            if token.pos_ not in ["VERB", "PUNCT"]:
                topic += token.text + " "

    return topic

## RUN 1) type: automatic (only previous): use the current raw utterance and expand it with the topic of the previous raw utterance

In [8]:
# Run 1: expand every raw utterance with the topic of the previous raw utterance.
run1_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt = raw_df[1][i]
        first_utt_doc = create_doc(first_utt)
        # Extracted for reference only; the rewrite below is called with first_topic="".
        first_topic = _find_topic(first_utt_doc)
        run1_rewritten_utt_dict[utt_id] = first_utt
        continue

    current_utt = raw_df[1][i]
    current_utt_doc = create_doc(current_utt)

    # Topic source: the raw text of the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_utt_doc = create_doc(raw_dict[prev_utt_id])
    prev_topic = _find_topic(prev_utt_doc)

    new_utt = _rewrite_utt(
        current_utt_doc,
        first_topic="",
        previous_topic=prev_topic,
        context_list=None,
        trailing=True,
    )
    run1_rewritten_utt_dict[utt_id] = new_utt

In [9]:
# for k,v in run1_rewritten_utt_dict.items():
#     print(k, raw_dict[k],"====>",  v)

In [10]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run1.prevraw.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run1_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))

## 2) type: automatic (previous with history): use the current raw utterance and expand it with the topic of the previous raw utterance after it has already been enriched (this should propagate the topics)

The label is not used; topics are always propagated *with memory: topics are extracted from the previous enriched utterance*.

In [11]:
# Run 2: expand every raw utterance with the topic of the previous utterance
# as already rewritten by this same run, so topics carry over between turns.
run2_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt = raw_df[1][i]
        first_utt_doc = create_doc(first_utt)
        # Extracted for reference only; the rewrite below is called with first_topic="".
        first_topic = _find_topic(first_utt_doc)
        run2_rewritten_utt_dict[utt_id] = first_utt
        continue

    current_utt = raw_df[1][i]
    current_utt_doc = create_doc(current_utt)

    # Topic source: this run's own rewrite of the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_utt_doc = create_doc(run2_rewritten_utt_dict[prev_utt_id])
    prev_topic = _find_topic(prev_utt_doc)

    new_utt = _rewrite_utt(
        current_utt_doc,
        first_topic="",
        previous_topic=prev_topic,
        context_list=None,
        trailing=True,
    )
    run2_rewritten_utt_dict[utt_id] = new_utt

In [13]:
# for k,v in run2_rewritten_utt_dict.items():
#     print(k, raw_dict[k],"====>",  v)

In [14]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run2.prevenriched.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run2_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))

## 3) type: automatic (previous with provided auto): use the current raw utterance and expand it with the topic of the previous utterance as automatically rewritten by CAsT

In [15]:
# Run 3: expand every raw utterance with the topic of the previous utterance
# taken from the automatic rewrites (auto_dict).
run3_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt = raw_df[1][i]
        first_utt_doc = create_doc(first_utt)
        # Extracted for reference only; the rewrite below is called with first_topic="".
        first_topic = _find_topic(first_utt_doc)
        run3_rewritten_utt_dict[utt_id] = first_utt
        continue

    current_utt = raw_df[1][i]
    current_utt_doc = create_doc(current_utt)

    # Topic source: the automatic rewrite of the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_utt_doc = create_doc(auto_dict[prev_utt_id])
    prev_topic = _find_topic(prev_utt_doc)

    new_utt = _rewrite_utt(
        current_utt_doc,
        first_topic="",
        previous_topic=prev_topic,
        context_list=None,
        trailing=True,
    )
    run3_rewritten_utt_dict[utt_id] = new_utt

In [16]:
# for k,v in run3_rewritten_utt_dict.items():
#     print(k, raw_dict[k],"====>",auto_dict[k],"====>",  v)

In [17]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run3.prevauto.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run3_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))

## 4) type: manual (previous with provided manual): use the current raw utterance and expand it with the topic of the previous utterance as manually rewritten by CAsT

In [18]:
# Run 4: expand every raw utterance with the topic of the previous utterance
# taken from the manual rewrites (manual_dict).
run4_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt = raw_df[1][i]
        first_utt_doc = create_doc(first_utt)
        # Extracted for reference only; the rewrite below is called with first_topic="".
        first_topic = _find_topic(first_utt_doc)
        run4_rewritten_utt_dict[utt_id] = first_utt
        continue

    current_utt = raw_df[1][i]
    current_utt_doc = create_doc(current_utt)

    # Topic source: the manual rewrite of the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_utt_doc = create_doc(manual_dict[prev_utt_id])
    prev_topic = _find_topic(prev_utt_doc)

    new_utt = _rewrite_utt(
        current_utt_doc,
        first_topic="",
        previous_topic=prev_topic,
        context_list=None,
        trailing=True,
    )
    run4_rewritten_utt_dict[utt_id] = new_utt

In [19]:
# for k,v in run4_rewritten_utt_dict.items():
#     print(k, raw_dict[k],"====>", manual_dict[k],"====>",  v)

In [20]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run4.prevmanual.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run4_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))

## 5) raw with previous response: noun chunks from the first sentence
(alternatively, from the entire passage)

In [67]:
# The evaluation topics live outside the repository. The location can be
# overridden with the CAST2022_TOPICS_PATH environment variable so the notebook
# also runs on other machines; the default is the original absolute path.
topics_path = os.environ.get(
    "CAST2022_TOPICS_PATH",
    "/data3/muntean/TREC_CAST_2022/data/2022_evaluation_topics_flattened_duplicated_v1.0.json",
)
topics_df = pd.read_json(topics_path)
topics_df.head()

Unnamed: 0,number,turn
0,132,"[{'number': '1-1', 'utterance': 'I remember Gl..."
1,132,"[{'number': '1-1', 'utterance': 'I remember Gl..."
2,132,"[{'number': '1-1', 'utterance': 'I remember Gl..."
3,133,"[{'number': '1-1', 'utterance': 'I’d like to a..."
4,133,"[{'number': '1-1', 'utterance': 'I’d like to a..."


In [68]:
# Map "<conversation number>_<turn number>" to the response text of that turn.
response_dict = {}
for i in range(len(topics_df)):
    conv_number = topics_df["number"][i]
    conv_tree_list = topics_df["turn"][i]
    for turn in conv_tree_list:
        if "response" not in turn:
            # Turns without a "response" key get no entry.
            continue
        turn_id = "_".join((str(conv_number), str(turn["number"])))
        response_dict[turn_id] = turn["response"]

In [71]:
# Run 5: append to every raw utterance the topic found in the first sentence
# (text before the first ".") of the response to the previous utterance.
run5_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt_response = response_dict[utt_id].split(".")[0]
        first_utt_response_doc = create_doc(first_utt_response)
        # Extracted for reference only; it is not added to any utterance below.
        first_response_topic = _find_topic(first_utt_response_doc)
        run5_rewritten_utt_dict[utt_id] = raw_df[1][i]
        continue

    current_utt = raw_df[1][i]

    # Topic source: first sentence of the response to the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_utt_response_doc = create_doc(response_dict[prev_utt_id].split(".")[0])
    prev_response_topic = _find_topic(prev_utt_response_doc)

    # Alternative not in use: also insert first_response_topic before prev_response_topic.
    new_utt = " ".join((current_utt, prev_response_topic))
    run5_rewritten_utt_dict[utt_id] = new_utt
    

In [74]:
# run5_rewritten_utt_dict

In [73]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run5.rawithresponse1sent.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run5_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))

## 6) raw with previous response: top 5 most frequent nouns after stopword removal

In [129]:
from collections import Counter
def top5_nouns(response):
    """Return the five most frequent nouns of ``response``, space-separated.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Only tokens tagged ``NOUN`` that are neither stop words nor punctuation
    are counted. Fewer than five words are returned when the text has fewer
    distinct nouns.
    """
    docx = nlp(response.lower())

    noun_counts = Counter(
        str(token)
        for token in docx
        if not token.is_stop and not token.is_punct and token.pos_ == 'NOUN'
    )
    return " ".join(noun for noun, _count in noun_counts.most_common(5))

In [130]:
# Run 6: append to every raw utterance the five most frequent nouns of the
# full response to the previous utterance.
run6_rewritten_utt_dict = {}

for i in range(len(raw_df)):
    utt_id = raw_df[0][i]

    # Ids whose part after the "_" is "1-1" are first utterances: kept unchanged.
    if utt_id.split("_")[1] == "1-1":
        first_utt_response = response_dict[utt_id]
        # Extracted for reference only; it is not added to any utterance below.
        first_response_top5_topic = top5_nouns(first_utt_response)
        run6_rewritten_utt_dict[utt_id] = raw_df[1][i]
        continue

    current_utt = raw_df[1][i]

    # Topic source: the whole response to the row directly above.
    prev_utt_id = raw_df[0][i - 1]
    prev_response_top5_topic = top5_nouns(response_dict[prev_utt_id])

    new_utt = " ".join((current_utt, prev_response_top5_topic))
    run6_rewritten_utt_dict[utt_id] = new_utt

In [132]:
# run6_rewritten_utt_dict

In [133]:
# Write the run as "<utterance id>\t<rewritten utterance>", one line per entry.
# The encoding is pinned to UTF-8 because the utterances contain non-ASCII
# characters (e.g. typographic apostrophes); without it the output depends on
# the platform's locale encoding.
with open("./data/queries.CAST2022.run6.rawithresponsetop5tokens.test.tsv", "w", encoding="utf-8") as f:
    for k, v in run6_rewritten_utt_dict.items():
        f.write("{}\t{}\n".format(k, v))