## Rewriting strategy: no classification is used; no memory, only trailing

In [None]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import csv

# import sys
# sys.path.append("..")
# from conversationalir.uttclassification.conversation_features import utterance_cosine_similarity_first, utterance_cosine_similarity_previous,is_next_sentence_to_first_neural, is_next_sentence_to_previous_neural 

In [None]:
# Root folder of the CAsT 2020 data. The trailing slash matters: later cells
# build file names by plain string concatenation with this prefix.
path = "../CAST_2020/"

# Tab-separated file with the raw utterances; the cells below read its
# 'utt_id' and 'utt' columns.
utt_file = path + "data/2020_raw.tsv"
test_df = pd.read_csv(utt_file, sep="\t")
test_df.head()


In [None]:
import spacy
# Pipeline object shared by every cell that parses an utterance.
nlp = spacy.load("en_core_web_sm")


def create_doc(utt):
    """Run the module-level `nlp` pipeline on `utt` and return its result."""
    parsed = nlp(utt)
    return parsed

In [None]:
third_person_prons = ["he", "she", "it", "they", "him", "her", "them", "his", "her", "its", "their"]


def _rewrite_utt(doc, first_topic="", previous_topic="", context_list=None, trailing=False):
    """Rewrite an utterance by substituting third-person pronouns with topics.

    Args:
        doc: iterable of tokens exposing ``tag_`` and ``text`` (presumably a
            spaCy ``Doc`` as returned by ``create_doc``).
        first_topic: topic inserted after ``previous_topic`` for each pronoun.
        previous_topic: topic inserted first for each pronoun.
        context_list: unused; kept so existing callers keep working.
        trailing: when true, append each topic that does not already occur
            (case-insensitively) in the rewritten utterance.

    Returns:
        The rewritten utterance. Every token (or replacement topic) is
        followed by a single space, so the result usually ends with a space.
    """
    # Previous topic first, then first topic: the order in which they are
    # inserted in place of a pronoun. Empty topics are never inserted.
    replacements = [topic for topic in (previous_topic, first_topic) if topic != ""]

    pieces = []
    for token in doc:
        # Lower-case the surface form so sentence-initial pronouns
        # ("It", "They") are recognised as well.
        is_third_person = (
            token.tag_ in ("PRP", "PRP$")
            and token.text.lower() in third_person_prons
        )
        if is_third_person and replacements:
            pieces.extend(replacements)
        else:
            # Also reached for a pronoun when no topic is available, so the
            # pronoun is kept instead of vanishing from the utterance.
            pieces.append(token.text)
    new_utt = "".join(piece + " " for piece in pieces)

    # TRAILING THE TOPIC
    if trailing:
        # An empty topic is a substring of any string, so it is never appended.
        if previous_topic.lower() not in new_utt.lower():
            new_utt += previous_topic + " "
        if first_topic.lower() not in new_utt.lower():
            new_utt += first_topic

    return new_utt
    
def _rewrite_utt_new(utt, first_topic="", previous_topic="", context_list=None, trailing=False):
    """Append the previous and first topics to `utt` when `trailing` is set.

    The previous topic is handled before the first topic. A topic is appended
    (preceded by one space) only if it does not already occur,
    case-insensitively, in the utterance built so far; an empty topic
    therefore never gets appended. `context_list` is not used.
    """
    if not trailing:
        return utt

    enriched = utt
    for topic in (previous_topic, first_topic):
        already_present = topic.lower() in enriched.lower()
        if already_present:
            continue
        enriched = enriched + " " + topic
    return enriched

In [None]:
def _find_cue_topic(doc):
    """Return the topic introduced by a cue phrase, or "" if there is none.

    The utterance text is lower-cased; when it contains one of the cue
    phrases and no token is a third-person pronoun (tag PRP/PRP$ with an
    exact, case-sensitive match against the pronoun list), the topic is the
    lower-cased text with the cue phrase, "." and "?" removed. Surrounding
    whitespace is not stripped. If several cue phrases occur, the one listed
    last in `cue_phrases` is the one removed.
    """
    cue_phrases = ["tell me more about", "tell me about", "what about", "how about"]
    third_person_prons = ["he", "she", "it", "they", "him", "her", "them", "his", "her", "its", "their"]

    lowered = str(doc).lower()
    matched_cues = [cue for cue in cue_phrases if cue in lowered]
    if not matched_cues:
        return ""

    # A pronoun means the utterance refers back to something already known,
    # so it does not introduce a new topic.
    has_pronoun = any(
        token.tag_ in ("PRP", "PRP$") and token.text in third_person_prons
        for token in doc
    )
    if has_pronoun:
        return ""

    return lowered.replace(matched_cues[-1], "").replace(".", "").replace("?", "")


def _find_topic_new(doc):
    """Build a topic string from the content-bearing tokens of `doc`.

    Tokens whose `pos_` is PRON, AUX, ADP or PUNCT, and tokens whose
    lower-cased text is in the small stop-word list, are skipped; each kept
    token is prefixed with a space. If nothing is kept, a second pass drops
    only punctuation and stop words and suffixes each kept token with a
    space instead. Returns "" when both passes keep nothing.
    """
    sw_list = ["the", "a", "an", "how", "many", "much", "when", "where", "who", "what", "why"]
    skipped_pos = ["PRON", "AUX", "ADP", "PUNCT"]

    def _is_stop_word(token):
        return token.text.lower() in sw_list

    topic = "".join(
        " " + token.text
        for token in doc
        if token.pos_ not in skipped_pos and not _is_stop_word(token)
    )
    if topic != "":
        return topic

    # Nothing survived the strict filter: fall back to a looser one.
    return "".join(
        token.text + " "
        for token in doc
        if token.pos_ not in ["PUNCT"] and not _is_stop_word(token)
    )

def _find_topic(doc):
    """Extract the topic of an utterance.

    Order of preference:
      1. the cue-phrase topic from `_find_cue_topic`, if non-empty;
      2. the texts of all noun chunks whose root is not a pronoun, each
         prefixed with a space;
      3. every token that is neither a verb nor punctuation, each followed
         by a space (covers utterances such as "Describe Uranus.").
    """
    cue_topic = _find_cue_topic(doc)
    if cue_topic != "":
        return cue_topic

    chunk_topic = "".join(
        " " + chunk.text
        for chunk in doc.noun_chunks
        if chunk.root.pos_ != "PRON"
    )
    if chunk_topic != "":
        return chunk_topic

    return "".join(
        token.text + " "
        for token in doc
        if token.pos_ not in ["VERB", "PUNCT"]
    )


def _find_topic_all(doc):
    """Return the topic of `doc`; a cue-phrase topic takes precedence.

    Without a cue topic, the result is the concatenation of " " + text for
    every noun chunk whose root is not a pronoun. If that yields nothing,
    the result is the concatenation of text + " " for every token that is
    neither a verb nor punctuation.
    """
    shifted_topic = _find_cue_topic(doc)
    if shifted_topic != "":
        return shifted_topic

    parts = []
    for chunk in doc.noun_chunks:
        if chunk.root.pos_ == "PRON":
            continue
        parts.append(" " + chunk.text)

    # No usable noun chunk - trick for "Describe Uranus."
    if not parts:
        for token in doc:
            if token.pos_ in ["VERB", "PUNCT"]:
                continue
            parts.append(token.text + " ")

    return "".join(parts)


In [None]:
# Quick manual check of the cue-phrase topic extraction on a sample utterance.
utt_doc_tmp = create_doc("How about snowboarding ?")
topic_tmp = _find_cue_topic(utt_doc_tmp)

# Presumably prints " snowboarding " (lower-cased, with the cue phrase and "?"
# removed, whitespace kept) - verify by running the cell.
print(topic_tmp)

## Run 4 (CAsT 2020): trailing without memory. It always propagates the first/current topic (current only if there was a topic shift) and the previous topic. Topics are noun chunks + verbs.

In [None]:
# Rewritten utterance per utterance id (the output of this run).
enriched_utt_dict = {}

# current_topic[utt_id]: topic of the conversation as of that turn; it only
# changes when a turn contains a cue phrase (topic shift).
current_topic = {}
# utterance_topic[utt_id]: topic extracted from that turn's own utterance.
utterance_topic = {}

# Iterate the two columns positionally. This does not depend on the
# DataFrame having the default 0..n-1 index, unlike test_df[col][i].
# Rows are expected in conversation order: every non-first turn looks up the
# entries stored for the turn before it and raises KeyError if they are
# missing.
for utt_id, utt in zip(test_df['utt_id'], test_df['utt']):
    id_parts = utt_id.split("_")  # "<conversation>_<turn>"
    conv_id, turn = id_parts[0], id_parts[1]

    utt_doc = create_doc(utt)
    utt_topic = _find_topic(utt_doc)
    utterance_topic[utt_id] = utt_topic

    if turn == "1":  # first utterance of a conversation
        current_topic[utt_id] = utt_topic  # here first topic == current topic
        enriched_utt_dict[utt_id] = utt  # nothing to propagate yet
        continue

    prev_utt_id = conv_id + "_" + str(int(turn) - 1)

    # A cue phrase signals a topic shift and replaces the current topic;
    # otherwise the current topic is inherited from the previous turn.
    shifted_topic = _find_cue_topic(utt_doc)
    if shifted_topic != "":
        current_topic[utt_id] = shifted_topic
    else:
        current_topic[utt_id] = current_topic[prev_utt_id]

    # Propagate the previous turn's utterance topic and the current topic as
    # it stood at the previous turn.
    enriched_utt_dict[utt_id] = _rewrite_utt_new(
        utt,
        first_topic=current_topic[prev_utt_id],
        previous_topic=utterance_topic[prev_utt_id],
        context_list=None,
        trailing=True,
    )
        
   

In [None]:
# Write the rewritten utterances, one "utt_id<TAB>utterance" line per entry.
# This file was submitted to CAsT 2020 as run 4, to be more different from
# the all-context run (chosen as run 2).
with open(path+"data/CAST2020_rewritten_utterances_curr_prev_verbs.tsv", "w") as out_file:
    out_file.writelines(
        "{}\t{}\n".format(utt_key, rewritten)
        for utt_key, rewritten in enriched_utt_dict.items()
    )

## Adding the passage of the previous utterance

In [None]:
# Load the original passages (tab-separated file without a header row).
passages_file = path + "data/2020_automatic_eval_withCleanedPassage.tsv"
passages_df = pd.read_csv(passages_file, sep="\t", header=None)

print(passages_df.head())

# Map the first column to the second one; dict() requires every row to hold
# exactly two values. The next cell looks entries up by utterance id.
passages = dict(passages_df.to_numpy().tolist())


In [None]:

# Same layout as the plain run file, but every non-first utterance also gets
# the passage stored for the previous turn appended to it.
with open(path+"data/CAST2020_rewritten_utterances_curr_prev_verbs_withCleanedPassage.tsv", "w") as out_file:
    for utt_id, utt in enriched_utt_dict.items():
        id_parts = utt_id.split("_")
        if id_parts[1] == "1":  # first utterance: written unchanged
            out_line = utt
        else:
            prev_utt_id = id_parts[0] + "_" + str(int(id_parts[1]) - 1)
            # Raises KeyError when no passage is stored for the previous turn.
            out_line = utt + " " + passages[prev_utt_id]
        out_file.write("{}\t{}\n".format(utt_id, out_line))

## Always propagate *with memory: topics are extracted from the previous enriched utterance*

In [None]:
# Rewritten utterance per utterance id for the "with memory" run.
enriched_utt_dict = {}

# Iterate the two columns positionally. This does not depend on the
# DataFrame having the default 0..n-1 index, unlike test_df[col][i].
# Rows are expected in conversation order: every non-first turn reads the
# enriched utterance of the turn before it and raises KeyError if it is
# missing.
for utt_id, current_utt in zip(test_df['utt_id'], test_df['utt']):
    id_parts = utt_id.split("_")  # "<conversation>_<turn>"

    if id_parts[1] == "1":  # first utterance of a conversation
        # Kept as is: there is no earlier turn to take a topic from, so no
        # parse of the utterance is needed here.
        enriched_utt_dict[utt_id] = current_utt
        continue

    prev_utt_id = id_parts[0] + "_" + str(int(id_parts[1]) - 1)

    # Memory: the topic comes from the *enriched* previous utterance, so
    # topics appended in earlier turns keep being carried forward. Only that
    # utterance is parsed; the current one is used as plain text.
    prev_utt_doc = create_doc(enriched_utt_dict[prev_utt_id])
    prev_topic = _find_topic_new(prev_utt_doc)

    enriched_utt_dict[utt_id] = _rewrite_utt_new(
        current_utt,
        first_topic="",
        previous_topic=prev_topic,
        context_list=None,
        trailing=True,
    )

In [None]:
# Write the "with memory" rewrites, one "utt_id<TAB>utterance" line each.
with open(path+"data/CAST2020_rewritten_utterances_memory.tsv", "w") as out_file:
    for utt_key in enriched_utt_dict:
        out_file.write("{}\t{}\n".format(utt_key, enriched_utt_dict[utt_key]))

In [None]:
# Load the original passages (tab-separated file without a header row).
passages_file = path + "data/2020_automatic_eval_withCleanedPassage.tsv"
passages_df = pd.read_csv(passages_file, sep="\t", header=None)

print(passages_df.head())

# First column -> second column; every row must hold exactly two values,
# otherwise the unpacking raises ValueError.
passage_rows = passages_df.values.tolist()
passages = {row_key: row_passage for row_key, row_passage in passage_rows}


In [None]:
# Write the "with memory" rewrites; every non-first utterance additionally
# gets the passage stored for the previous turn appended to it.
with open(path+"data/CAST2020_rewritten_utterances_memory_withCleanedPassage.tsv", "w") as out_file:
    for utt_id, utt in enriched_utt_dict.items():
        conv_id = utt_id.split("_")[0]
        turn = utt_id.split("_")[1]
        if turn != "1":  # not the first utterance of the conversation
            prev_utt_id = conv_id + "_" + str(int(turn) - 1)
            # Raises KeyError when no passage is stored for the previous turn.
            utt = utt + " " + passages[prev_utt_id]
        out_file.write("{}\t{}\n".format(utt_id, utt))
            
            