In [1]:
import os
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import re
from nltk.tokenize import word_tokenize
import sklearn

import os
import gensim
from sklearn.model_selection import train_test_split
import smart_open

from gensim.test.utils import get_tmpfile

from sklearn.metrics import pairwise

## Load whole dataset to work with DA patterns

In [2]:
# df = pd.read_csv('./data_TM2/synt_annotated_data.csv', index_col=0)
df = pd.read_csv('./data_TM2/processed/processed_utterances_sentence_DA_labeling.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,task,conversation_id,instruction_id,index,speaker,text,segments,new_text,DA_rep_init,DA_greet,...,DA_receipt,DA_disconf,DA_closer,DA_comp_check,DA_hold,DA_partial_req,DA_detail_req,DA_grant,DA_answer,all_DA
0,flights,dlg-00100680-00e0-40fe-8321-6d81b21bfc4f,flight-12,0,U,Hello. I'd like to find a round trip commercia...,"[{'start_index': 26, 'end_index': 36, 'text': ...",Hello.,,U_greeting,...,,,,,,,,,,['U_greeting']
1,flights,dlg-00100680-00e0-40fe-8321-6d81b21bfc4f,flight-12,0,U,Hello. I'd like to find a round trip commercia...,"[{'start_index': 26, 'end_index': 36, 'text': ...",I'd like to find a round trip commercial airli...,,,...,,,,,,,,,,[]
2,flights,dlg-00100680-00e0-40fe-8321-6d81b21bfc4f,flight-12,1,A,"Hello, how can I help you?",,"Hello, how can I help you?",,A_greeting,...,,,,,,,A_detail_request,,,"['A_greeting', 'A_detail_request']"
3,flights,dlg-00100680-00e0-40fe-8321-6d81b21bfc4f,flight-12,2,A,"San Francisco to Denver, got it.","[{'start_index': 0, 'end_index': 13, 'text': '...","San Francisco to Denver, got it.",,,...,,,,,,,,,,['A_confirmation']
4,flights,dlg-00100680-00e0-40fe-8321-6d81b21bfc4f,flight-12,3,U,You're really on top of things. I like that.,,You're really on top of things.,,U_greeting,...,,,,,,,,,,['U_greeting']


# doc2vec working
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [4]:
train, test = train_test_split(df,train_size=393180, shuffle=False) #split taking into account full dialogue. Not splitting in middle of dialogue
train, val = train_test_split(train,train_size=0.1, shuffle=False) #só pra testar coisas 0.001 funciona bem! 0.01 20s 

Below, we define a function to:

- open the train/test file (with latin encoding)

- read the file line-by-line

- pre-process each line (tokenize text into individual words, remove punctuation, set to lowercase, etc)

The file we’re reading is a corpus. Each line of the file is a document.

In [5]:
def read_corpus(fname, tokens_only=False):
        for i, line in enumerate(fname): 
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(train['new_text']))
test_corpus = list(read_corpus(test['new_text'], tokens_only=True))

#training corpus one instance example
print(test_corpus[:2])
len(test_corpus)

[['what', 'is', 'the', 'latest', 'headlines', 'related', 'to', 'the', 'philadelphia', 'phillies'], ['series', 'preview', 'philadelphia', 'phillies', 'at', 'milwaukee', 'brewers', 'mlb', 'rumors', 'latest', 'sign', 'phillies', 'will', 'make', 'full', 'court', 'press', 'for', 'manny', 'machado']]


43703

Training the Model

Now, we’ll instantiate a Doc2Vec model with a vector size with 50 dimensions and iterating over the training corpus 40 times. We set the minimum word count to 2 in order to discard words with very few occurrences. (Without a variety of representative examples, retaining such infrequent words can often make a model worse!) Typical iteration counts in the published Paragraph Vector paper results, using 10s-of-thousands to millions of docs, are 10-20. More iterations take more time and eventually reach a point of diminishing returns.

However, this is a very very small dataset (300 documents) with shortish documents (a few hundred words). Adding training passes can sometimes help with such small datasets.

In [6]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)

#Build a vocabulary
model.build_vocab(train_corpus)

Essentially, the vocabulary is a list (accessible via model.wv.index_to_key) of all of the unique words extracted from the training corpus. 

Additional attributes for each word are available using the model.wv.get_vecattr() method 

For example, to see how many times penalty appeared in the training corpus:

In [7]:
print(f"Word 'hello' appeared {model.wv.get_vecattr('hello', 'count')} times in the training corpus.")

model.wv.get_vecattr('hello', 'count')

Word 'hello' appeared 625 times in the training corpus.


625

Next, train the model on the corpus. If optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds. If the BLAS library is not being used, this should take no more than 2 minutes, so use optimized Gensim with BLAS if you value your time.

In [8]:
%%time
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 1min 53s, sys: 26.5 s, total: 2min 19s
Wall time: 1min 52s


In [9]:
fname = get_tmpfile("my_doc2vec_model")

model.save(fname)
model = gensim.models.doc2vec.Doc2Vec.load(fname)

## Similarity window:

In [10]:
df['unique'] = [n for n in range(len(df))]

In [11]:
import collections

print(collections.Counter(df['DA_rep_init']))
repair_initiator = ['A_repair_initiator', 'U_repair_initiator' ]
temp2=df.DA_rep_init.fillna("0")
df['DA_rep_init_new'] =  np.where(temp2.str.contains('|'.join(repair_initiator), case=False), "repair_initiator", np.nan)
collections.Counter(df['DA_rep_init_new'])

Counter({nan: 434770, 'A_repair_initiator': 1562, 'U_repair_initiator': 551})


Counter({'nan': 434770, 'repair_initiator': 2113})

In [12]:
def paraphrasing(conversation):
    '''Output either similarity or paraphrase.
    paraphrase is the second sentence said in a sequence of time.
    similarity gives the pairs of similarities with its cosines.
    Each tuple contains: index_sent_1, sent1, index_sent2, sent2, cosine_similarity
    '''
    
    sentences_window = {}
    nrows = len(conversation['new_text'])
    sentences = conversation['new_text']
    rep_initiators = np.array(conversation['DA_rep_init_new'])
    window_width = 2
    dialog_id = conversation['conversation_id'].unique()[0]

    for k,sentence,rep_init in zip(range(nrows),sentences,rep_initiators):
        if rep_init== "repair_initiator":
            sentences_window[sentence] = sentences[k-window_width:k+window_width+1]


    rep=pd.DataFrame(sentences_window)
    repair_tok = list(read_corpus(rep.iloc[:,0], tokens_only=True))

    inf_vecs = []
    [inf_vecs.append(model.infer_vector(ut)) for ut in repair_tok]  


    threshold = 0.85
    similarity = []
    paraphrase = []

    for e in range(len(inf_vecs)):
        counter = e+1
        while counter < len(inf_vecs):
            pair_sim = pairwise.cosine_similarity(inf_vecs[e].reshape(1, -1), inf_vecs[counter].reshape(1, -1))
            if pair_sim >= threshold:
                similarity.append((e, rep.iloc[e,0],counter, rep.iloc[counter,0],pair_sim)) #((e,counter, pair_sim))
                paraphrase.append(rep.iloc[counter,0])
            counter +=1

    return similarity #paraphrase


In [13]:
def match_similarity_w_dialogueid(df):
    
    #select dialogue ids with repair initiators
    dial_w_rep = []
    for row in range(len(df)):
        if df['DA_rep_init_new'][row] == 'repair_initiator':
            dial_w_rep.append(df['conversation_id'][row])
    dial_w_rep = list(set(dial_w_rep))

    #call function for all dialogues with repair initiator. save in dict in which key=dialogue id and value=paraphrase
    paraphr = {}
    count =0
    for dialogue in dial_w_rep:
        try:
            conversation = df.loc[df['conversation_id'] == dialogue]
            paraphr[dialogue] = paraphrasing(conversation)
        except Exception:
            count+=1
            pass

    print(count) #number of times it could not compute the similarity function
    print(len(dial_w_rep))

    return paraphr

paraphrasing = match_similarity_w_dialogueid(df)

219
1813


In [14]:
import pickle 

with open('./src/generated_files/paraphrasing_cos_sim085.pkl', 'wb') as f:
    pickle.dump(paraphrasing, f)

In [16]:
new_par = {k:v for k,v in paraphrasing.items() if v != []}
new_par

{'dlg-d9395a12-ec4f-48cc-8906-a4cf04f152e6': [(0,
   'Hello, I want to listen to some music by Deicide.',
   3,
   'I want to listen to some music by Deicide.',
   array([[0.90131724]], dtype=float32))],
 'dlg-b61c1943-962e-4814-ac7e-7fee38e24a0e': [(0,
   'All right.',
   3,
   'All right.',
   array([[0.9892622]], dtype=float32))],
 'dlg-779e0dff-2e53-405a-8f29-b707539dac0e': [(1,
   'Okay.',
   4,
   'Okay.',
   array([[0.9390416]], dtype=float32))],
 'dlg-ee6acad7-b5b0-4db2-a418-43c6a2c2525f': [(0,
   "I'm looking for a restaurant to eat dinner in Baltimore, Maryland near John Hopkins University.",
   3,
   "I'm looking for a restaurant to eat dinner in Baltimore, Maryland, near Johns Hopkins University.",
   array([[0.8793271]], dtype=float32))],
 'dlg-2f1f2f78-3cec-4b46-900f-f18749f0b60b': [(1,
   'What type of hotels do you have?',
   3,
   'What type of rooms do you have?',
   array([[0.9677365]], dtype=float32))],
 'dlg-e69f9d96-3bb8-4af1-9e3b-ee9bbacec211': [(1,
   'Drury inn

In [17]:
counter = 0
for k, v in new_par.items():
    counter += len(new_par.get(k))
    
counter

323

In [19]:
with open('/Users/brunaguedes/Documents/GitHub/NLP_Automatic_dialog_extraction/src/generated_files/paraphrasing_cos_sim085.pkl', 'wb') as fp:
        pickle.dump(new_par, fp)

#### Now add similarity as a column in dataframe.

#### !!! This is not yet working since it generates more examples than the corresponding lines in dataframe.

In [15]:
# #create column in dataframe with rephrase

# DA_paraphrase = []

# # for key, value in paraphr.items():
# #     if len(value) != 0:
# # #         print(type(value[0]))
# # #         print(key, value, len(value))
# #         for v in value:
# # #             print(key, v)
# # #             df['DA_paraphrase'] = np.where(((df['conversation_id'] == key) & (df['new_text'] == v)), 'Paraphrase', '')                                     
# #             for row in range(len(df)):
# #                 if df['conversation_id'][row] == key & df['new_text'][row] == v:
# #                     DA_paraphrase.append(1)
# #                 else:
# #                     DA_paraphrase.append(0)

# for row in range(len(df)):
#     for key, value in paraphr.items():
#         if len(value) != 0:
#             if (df['conversation_id'][row] == key) & (df['new_text'][row] == max(value)):
#                 DA_paraphrase.append(1)
#         else:
#             DA_paraphrase.append(0)

# df['DA_paraphrase'] = DA_paraphrase

# (df['conversation_id'][0] == 'dlg-00100680-00e0-40fe-8321-6d81b21bfc4f') & (df['new_text'][0] == 'Hello.')

# len(DA_paraphrase)