In [1]:
import pandas as pd
import numpy as np
from rouge import Rouge
rouge = Rouge()
import nltk
# nltk.download('punct') 
import re
DATA_PATH = './tweet_sum_data_files/dialogs_data_with_summaries.xlsx'

### Считывание данных

In [2]:
summaries_df = pd.read_excel(DATA_PATH)
test_summaries_df = summaries_df[summaries_df['data_split'] == 'test']
dialog_texts = test_summaries_df['text'].values.tolist()
abstractive_summaries = test_summaries_df['abstractive_summaries'].values.tolist()
abstractive_summaries = list(map(eval, abstractive_summaries))

### Эвристика: взять из текста первые два предложения клиента и первые два предложения оператора поддержки. 
#### Составляем из них новый текст, взяв эти предложения в порядке, в котором они были в исходном тексте.

In [3]:
def heuristic_twofirst(dialog) -> str: 
    max_count = 2
    result = []

    replics = dialog_split(dialog)
    customer_count, agent_count = 0, 0
    for replic in replics: 
        if customer_count == max_count and agent_count == max_count: break 

        if replic.lstrip().startswith('Customer:'): 
            for sent in nltk.sent_tokenize(replic):
                if customer_count != max_count: 
                    result.append(sent)
                    customer_count += 1
        elif replic.lstrip().startswith('Agent:'): 
            for sent in nltk.sent_tokenize(replic):
                if agent_count != max_count: 
                    result.append(sent)
                    agent_count += 1
        else: 
            # raise ValueError(f"Something is wrong with replic:\n {replic}")
            # print(f"Warning: something is wrong with replic:\n '{replic}'\nin dialog \n{dialog[:200]}\n...\n")
            pass
    # print(result)        
    return ''.join(result)

In [4]:
from random import randrange

def dialog_split(dialog): 
    replics = dialog.split('\n')
    return replics 
    
def replics_of(replics, type='Customer:'):
    # Returns replics along with their index in initial replic list 
    return list(filter(
        lambda r: r[1].lstrip().startswith(type), 
        enumerate(replics)
    ))

def replics2sents(enumerated_replics):
    # Returns sentences along with (index of the replic it belongs to, index inside replic)
    res = []
    for i, replic in enumerated_replics: 
        sents = nltk.sent_tokenize(replic)
        
        sents = filter(lambda s: len(s) > 1, sents) # Таким образом уберем предложения состоящие из одного эмодзи. 
        # Возможно, неудачно, что обработка текста оказалась в двух разных местах. 
        
        res.extend(
            ((i, j), sent) for j, sent in enumerate(sents)
        )
    return res 

### Эвристика: взять из текста случайные два предложения клиента и случайные два предложения оператора поддержки. 
#### Составляем из них новый текст, взяв эти предложения в порядке, в котором они были в исходном тексте.

In [5]:
def heuristic_tworand(dialog) -> str: 
    res = []
    replics = dialog_split(dialog)
    sents1 = replics2sents(replics_of(replics, 'Customer'))
    sents2 = replics2sents(replics_of(replics, 'Agent'))
    # print(sents1)
    # print(sents2)

    # Customer
    c1 = randrange(0, len(sents1))
    c2 = randrange(0, len(sents1))
    while c2 == c1: 
        c2 = randrange(0, len(sents1))
    c1, c2 = sents1[c1], sents1[c2]
    # print(len(sents1))
    # print(c1, c2)

    # Agent
    a1 = randrange(0, len(sents2))
    a2 = randrange(0, len(sents2))
    while a2 == a1: 
        a2 = randrange(0, len(sents2))
    a1, a2 = sents2[a1], sents2[a2]
    # print(len(sents2))
    # print(a1, a2)
        
    res = sorted([c1, c2, a1, a2], key=lambda sentence: sentence[0]) # Sort in respect to position in dialog. Order is going to be lexigraphic because type(key) is tuple
    return ''.join([res[1] for res in res])

### Демонстрация работы эвристик: 

In [6]:
# Test heuristic tworand: 
d = "\tCustomer:\t@hulu_support My watchlist is not updating with new episodes (past couple days). Any idea why?\n\tAgent:\t@658975 Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually.\n\tCustomer:\t@hulu_support Tried logging out/back in, that didn’t help\n\tAgent:\t@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!\n\tCustomer:\t@hulu_support Thank you! Some shows updated overnight, but others did not...\n\tAgent:\t@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there\n\tCustomer:\t@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚\n\tAgent:\t@658975 Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚\n"
s = heuristic_tworand(d)
print(d)
print(s)

	Customer:	@hulu_support My watchlist is not updating with new episodes (past couple days). Any idea why?
	Agent:	@658975 Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually.
	Customer:	@hulu_support Tried logging out/back in, that didn’t help
	Agent:	@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!
	Customer:	@hulu_support Thank you! Some shows updated overnight, but others did not...
	Agent:	@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there
	Customer:	@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚
	Agent:	@658975 Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚

In the meantime, try 

In [7]:
# Test heuristic twofirst: 
print("Both customer sentences happen during ONE (first) replic. ")
d = "\tCustomer:\t@hulu_support My watchlist is not updating with new episodes (past couple days). Any idea why?\n\tAgent:\t@658975 Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually.\n\tCustomer:\t@hulu_support Tried logging out/back in, that didn’t help\n\tAgent:\t@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!\n\tCustomer:\t@hulu_support Thank you! Some shows updated overnight, but others did not...\n\tAgent:\t@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there\n\tCustomer:\t@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚\n\tAgent:\t@658975 Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚\n"
print(d)
s = heuristic_twofirst(d)
print('Summary: ')
print(s)


print('-'*100)
print("Two customer sentences happen during different replics. ")
d = "\tCustomer:\t@hulu_support My watchlist is not updating with new episodes (past couple days).\n\tAgent:\t@658975 Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually.\n\tCustomer:\t@hulu_support Tried logging out/back in, that didn’t help\n\tAgent:\t@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!\n\tCustomer:\t@hulu_support Thank you! Some shows updated overnight, but others did not...\n\tAgent:\t@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there\n\tCustomer:\t@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚\n\tAgent:\t@658975 Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚\n"
print(d)
# print(repr(heuristic_twofirst(d)))
print('Summary: ')
print(heuristic_twofirst(d))

print('-'*100)
print("Two customer and agent sentences happen during different replics")
d = "\tCustomer:\t@hulu_support My watchlist is not updating with new episodes (past couple days).\n\tAgent:\t@658975 Apologies for the trouble, Norlene!\n\tCustomer:\t@hulu_support Tried logging out/back in, that didn’t help\n\tAgent:\t@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!\n\tCustomer:\t@hulu_support Thank you! Some shows updated overnight, but others did not...\n\tAgent:\t@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there\n\tCustomer:\t@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚\n\tAgent:\t@658975 Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚\n"
print(d)
# print(repr(heuristic_twofirst(d)))
print('Summary: ')
print(heuristic_twofirst(d))

Both customer sentences happen during ONE (first) replic. 
	Customer:	@hulu_support My watchlist is not updating with new episodes (past couple days). Any idea why?
	Agent:	@658975 Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually.
	Customer:	@hulu_support Tried logging out/back in, that didn’t help
	Agent:	@658975 Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon!
	Customer:	@hulu_support Thank you! Some shows updated overnight, but others did not...
	Agent:	@658975 We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there
	Customer:	@hulu_support As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚
	Agent:	@658975 Awesome! That's what we love to hear. If you happen to need anythi

In [8]:
from collections import defaultdict 

def heuristic_perfomance(heuristic): 
    res = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
    for dialog, ref_summs in zip(dialog_texts, abstractive_summaries): 
        summ = heuristic(dialog)
        summ = preprocess_summary(summ)
        # print(repr(summ))
        perfomance = defaultdict(list)
        for ref_summ in ref_summs: 
            ref_summ = preprocess_summary(ref_summ)
            # print(repr(ref_summ))
            scores = rouge.get_scores(summ, ref_summ)[0]
            for rouge_type, values in scores.items(): 
                perfomance[rouge_type].append(values['f'])
    
        for rouge_type in res.keys(): 
            res[rouge_type].append(
                max(perfomance[rouge_type])
            )
        # print(perfomance)
        # print('-------')
            
    return {k: np.mean(res[k]) for k in res}

### Perfomance (rouge metric) of 'two first' heuristic:
##### Use different text preprocessing strategies

In [10]:
import string

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res1 = heuristic_perfomance(heuristic_twofirst)

def preprocess_summary(summary): 
    summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res2 = heuristic_perfomance(heuristic_twofirst)

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res3 = heuristic_perfomance(heuristic_twofirst)

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    # summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    # summary = summary.lower()

    # summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res4 = heuristic_perfomance(heuristic_twofirst)

In [11]:
print("Perfomance of 'first two' heuristic")
print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: +\nTo lowercase: +\nRemove punctuation: +")
print(res1)
print()
print("Этапы предобработки \nwhitespace remove: +\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: +\nTo lowercase: +\Remove punctuation: +")
print(res2)
print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': +\nRemove twitter nicknames: +\nTo lowercase: +\nRemove punctuation: +")
print(res3)
print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: -\nTo lowercase: -\nRemove punctuation: -")
print(res4)

Perfomance of 'first two' heuristic

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: +
To lowercase: +
Remove punctuation: +
{'rouge-1': 0.4154450264050774, 'rouge-2': 0.1923380237617137, 'rouge-l': 0.3617657300470574}

Этапы предобработки 
whitespace remove: +
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: +
To lowercase: +\Remove punctuation: +
{'rouge-1': 0.4154450264050774, 'rouge-2': 0.1923380237617137, 'rouge-l': 0.3617657300470574}

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': +
Remove twitter nicknames: +
To lowercase: +
Remove punctuation: +
{'rouge-1': 0.3647122113551269, 'rouge-2': 0.19376768406130715, 'rouge-l': 0.3179945945269569}

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: -
To lowercase: -
Remove punctuation: -
{'rouge-1': 0.3098123227884514, 'rouge-2': 0.15977632690206536, 'rouge-l': 0.28966627455651456}


### Perfomance (rouge metric) of 'two random' heuristic:
##### Use different text preprocessing strategies

In [12]:
N = 5

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 

res1 = [heuristic_perfomance(heuristic_tworand) for _ in range(N)]
res = {}
for k in res1[0]: 
    res[k] = np.mean([p[k] for p in res1])
res1 = res

def preprocess_summary(summary): 
    summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res2 = [heuristic_perfomance(heuristic_tworand) for _ in range(N)]
res = {}
for k in res2[0]: 
    res[k] = np.mean([p[k] for p in res2])
res2 = res

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    summary = summary.replace('Customer:', '').replace('Agent:', '')

    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    summary = summary.lower()

    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res3 = [heuristic_perfomance(heuristic_tworand) for _ in range(N)]
res = {}
for k in res3[0]: 
    res[k] = np.mean([p[k] for p in res3])
res3 = res

def preprocess_summary(summary): 
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    
    # summary = summary.replace('Customer:', '').replace('Agent:', '')

    # summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames 

    # summary = summary.lower()

    # summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
                                                  
    return summary 
    
res4 = [heuristic_perfomance(heuristic_tworand) for _ in range(N)]
res = {}
for k in res4[0]: 
    res[k] = np.mean([p[k] for p in res4])
res4 = res

In [13]:
print("Perfomance of 'two rand' heuristic")

print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: +\nTo lowercase: +\nRemove punctuation: +")
print(res1)
print()
print("Этапы предобработки \nwhitespace remove: +\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: +\nTo lowercase: +\Remove punctuation: +")
print(res2)
print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': +\nRemove twitter nicknames: +\nTo lowercase: +\nRemove punctuation: +")
print(res3)
print()
print("Этапы предобработки \nwhitespace remove: -\nRemove 'Customer:', 'Agent:': -\nRemove twitter nicknames: -\nTo lowercase: -\nRemove punctuation: -")
print(res4)


Perfomance of 'two rand' heuristic

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: +
To lowercase: +
Remove punctuation: +
{'rouge-1': 0.3143723202451151, 'rouge-2': 0.10313277624088446, 'rouge-l': 0.2522732573256695}

Этапы предобработки 
whitespace remove: +
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: +
To lowercase: +\Remove punctuation: +
{'rouge-1': 0.3226747628449368, 'rouge-2': 0.11183747475860724, 'rouge-l': 0.2581429554438765}

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': +
Remove twitter nicknames: +
To lowercase: +
Remove punctuation: +
{'rouge-1': 0.27192515218522517, 'rouge-2': 0.10548400191372065, 'rouge-l': 0.2247026346469933}

Этапы предобработки 
whitespace remove: -
Remove 'Customer:', 'Agent:': -
Remove twitter nicknames: -
To lowercase: -
Remove punctuation: -
{'rouge-1': 0.2478265127610358, 'rouge-2': 0.1010690634046505, 'rouge-l': 0.22741895648388355}
