In [1]:
import os
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import re
from nltk.tokenize import word_tokenize
import sklearn

## Load whole dataset to work with DA patterns

In [2]:
# Read the annotated dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='./data_TM2/synt_annotated_data.csv',
    index_col=0,
)

# doc2vec working
https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

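The tutorial linked above trains its model on the Lee Background Corpus included in gensim. That corpus contains 314 documents selected from the Australian Broadcasting Corporation's news mail service. In this notebook the same steps are applied to the `new_text` column of our own dataset instead.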

In [22]:
import os
import gensim
from sklearn.model_selection import train_test_split
import smart_open


# The first 393180 rows become the training set and the remainder the test set.
# shuffle=False keeps the rows in order; per the original note, the cut point is
# chosen so that no dialogue is split in the middle.
train, test = train_test_split(df, shuffle=False, train_size=393180)

# Only for quick experiments: keep the first 10% of the training rows as `train`
# (the rest goes to `val`). Original timing note: 0.001 works well, 0.01 takes ~20s.
train, val = train_test_split(train, shuffle=False, train_size=0.1)

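Below, we define a function to:

- iterate over the texts passed in (the tutorial's step of opening the train/test file with latin encoding is commented out, because our texts are already in memory)

- read the texts one at a time

- pre-process each text (tokenize it into individual words, remove punctuation, set to lowercase, etc.)

The input we’re reading is a corpus. Each element (here, each row of the `new_text` column) is a document.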

In [23]:
def read_corpus(fname, tokens_only=False):
    """Yield one pre-processed document per element of ``fname``.

    Despite its name, ``fname`` is not opened as a file: it is iterated
    directly and every element is passed to
    ``gensim.utils.simple_preprocess``. (The tutorial's ``smart_open`` step
    is not used here.)

    Parameters
    ----------
    fname : iterable
        The texts to process, one document per element.
    tokens_only : bool, default False
        When True, yield only the token list of each document. When False,
        wrap the tokens in a ``TaggedDocument`` whose single tag is the
        position of the document in ``fname``.

    Yields
    ------
    list or gensim.models.doc2vec.TaggedDocument
        The tokens of each document, bare or tagged depending on
        ``tokens_only``.
    """
    for doc_index, text in enumerate(fname):
        words = gensim.utils.simple_preprocess(text)
        # Training documents need a tag; inference only needs the tokens.
        yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [doc_index])

# The tutorial reads lee_train_file / lee_test_file here; we feed the
# `new_text` column of our own splits instead.

# Tagged documents for training, plain token lists for the held-out texts.
train_corpus = [doc for doc in read_corpus(train['new_text'])]
test_corpus = [tokens for tokens in read_corpus(test['new_text'], tokens_only=True)]

In [24]:
# Show the first two tokenised documents of the *test* corpus,
# then display how many documents the test corpus holds.
print(test_corpus[:2])
len(test_corpus)

[['what', 'is', 'the', 'latest', 'headlines', 'related', 'to', 'the', 'philadelphia', 'phillies'], ['series', 'preview', 'philadelphia', 'phillies', 'at', 'milwaukee', 'brewers', 'mlb', 'rumors', 'latest', 'sign', 'phillies', 'will', 'make', 'full', 'court', 'press', 'for', 'manny', 'machado']]


43703

Training the Model

Now, we’ll instantiate a Doc2Vec model with a vector size of 50 dimensions, iterating over the training corpus 80 times. We set the minimum word count to 2 in order to discard words with very few occurrences. (Without a variety of representative examples, retaining such infrequent words can often make a model worse!) Typical iteration counts in the published Paragraph Vector paper results, using 10s-of-thousands to millions of docs, are 10-20. More iterations take more time and eventually reach a point of diminishing returns.

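However, the tutorial's corpus is a very small dataset (300 documents) with shortish documents (a few hundred words). Adding training passes can sometimes help with such small datasets. Note that the corpus used in this notebook is much larger and its documents are single short texts, so this reasoning may not carry over directly.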

In [25]:
# Doc2Vec with 50-dimensional vectors, trained for 80 epochs; words that
# occur fewer than 2 times are ignored.
model = gensim.models.doc2vec.Doc2Vec(
    epochs=80,
    min_count=2,
    vector_size=50,
)

# Build the vocabulary from the tagged training documents.
model.build_vocab(train_corpus)

Essentially, the vocabulary is a list (accessible via model.wv.index_to_key) of all of the unique words extracted from the training corpus. 

Additional attributes for each word are available using the model.wv.get_vecattr() method 

For example, to see how many times `hello` appeared in the training corpus:

In [26]:
# Report how often the token 'hello' occurs in the training corpus,
# using the 'count' attribute stored for each vocabulary word.
print(f"Word 'hello' appeared {model.wv.get_vecattr('hello', 'count')} times in the training corpus.")

# Same value again, shown as the cell's output.
model.wv.get_vecattr('hello', 'count')

Word 'hello' appeared 625 times in the training corpus.


625

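Next, train the model on the corpus. On the tutorial's small corpus, if optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds. If the BLAS library is not being used, this should take no more than 2 minutes, so use optimized Gensim with BLAS if you value your time. Expect training on the corpus used in this notebook to take longer, since it is larger and the model runs more epochs.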

In [None]:
%%time
# Train on the tagged documents, reusing the document count recorded by
# build_vocab and the epoch count the model was created with.
model.train(
    train_corpus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

In [None]:
from gensim.test.utils import get_tmpfile

# Path used to persist the model (presumably inside the system temp
# directory, so it may not survive a reboot -- TODO confirm / use a project path).
fname = get_tmpfile("my_doc2vec_model")

# Save the trained model, then reload it from disk into the same name.
model.save(fname)
model = gensim.models.doc2vec.Doc2Vec.load(fname)

## Clean code for Window part:

In [None]:
from sklearn.metrics import pairwise


def paraphrasing(conversation, window_width=2, threshold=0.3):
    """Find similar sentence pairs in the window around a repair initiator.

    For every row of ``conversation`` whose ``DA_rep_init`` value is
    ``"repair_initiator"``, a window of up to ``window_width`` sentences on
    either side (taken from the ``new_text`` column) is collected. The
    sentences of the first collected window are tokenised with
    ``read_corpus``, embedded with the module-level Doc2Vec ``model`` and
    compared pairwise with cosine similarity.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Rows of one dialogue; must provide the ``new_text`` and
        ``DA_rep_init`` columns.
    window_width : int, default 2
        Number of sentences kept before and after a repair initiator.
    threshold : float, default 0.3
        Minimum cosine similarity for a pair to be reported.

    Returns
    -------
    list of tuple
        One ``(index_sent_1, sent_1, index_sent_2, sent_2, cosine_similarity)``
        tuple per pair that reaches ``threshold``. The indices are positions
        inside the window and ``cosine_similarity`` is the 1x1 array returned
        by ``pairwise.cosine_similarity``. The list is empty when the
        conversation contains no repair initiator.
    """
    sentences = conversation['new_text']
    rep_initiators = np.array(conversation['DA_rep_init'])

    sentences_window = {}
    for k, (sentence, rep_init) in enumerate(zip(sentences, rep_initiators)):
        if rep_init == "repair_initiator":
            # Clamp the start at 0: a negative start would be read as an
            # offset from the end and yield an empty or wrong window for
            # repair initiators among the first `window_width` rows.
            start = max(0, k - window_width)
            sentences_window[sentence] = sentences.iloc[start:k + window_width + 1]

    if not sentences_window:
        # No repair initiator: there is no window to analyse.
        return []

    rep = pd.DataFrame(sentences_window)
    # NOTE(review): only the first column (the first collected window) is
    # analysed. When several repair initiators yield different windows, the
    # frame is aligned on the union of their indices, so this column may
    # contain NaN, which the tokeniser presumably cannot handle -- confirm
    # whether every window should be processed instead.
    window_text = rep.iloc[:, 0]
    repair_tok = list(read_corpus(window_text, tokens_only=True))

    inf_vecs = [model.infer_vector(ut) for ut in repair_tok]

    similar_pairs = []
    for e in range(len(inf_vecs)):
        for counter in range(e + 1, len(inf_vecs)):
            pair_sim = pairwise.cosine_similarity(
                inf_vecs[e].reshape(1, -1), inf_vecs[counter].reshape(1, -1)
            )
            if pair_sim >= threshold:
                similar_pairs.append(
                    (e, window_text.iloc[e], counter, window_text.iloc[counter], pair_sim)
                )

    return similar_pairs

# Ids of all dialogues that contain at least one repair initiator.
# A boolean mask replaces the row-by-row loop: it is much faster on a frame
# of this size and does not depend on the index labels being 0..n-1 (the
# previous `df[col][row]` lookup was label-based). `unique()` also keeps the
# ids in order of first appearance, so the result is deterministic.
dial_w_rep = (
    df.loc[df['DA_rep_init'] == 'repair_initiator', 'conversation_id']
    .unique()
    .tolist()
)

# Run paraphrasing() on every dialogue that contains a repair initiator.
# `paraphr` maps dialogue id -> list of similar sentence pairs.
# Failures are still tolerated (best effort), but the reason is now recorded
# in `paraphr_errors` (dialogue id -> repr of the exception) instead of being
# silently discarded.
paraphr = {}
paraphr_errors = {}
count = 0

# Group once instead of masking the whole frame for every dialogue.
repair_rows = df[df['conversation_id'].isin(dial_w_rep)]
for dialogue, conversation in repair_rows.groupby('conversation_id', sort=False):
    try:
        paraphr[dialogue] = paraphrasing(conversation)
    except Exception as exc:
        count += 1
        paraphr_errors[dialogue] = repr(exc)

# Number of dialogues that failed, then number of dialogues considered.
print(count)
print(len(dial_w_rep))

In [None]:
# from sklearn.metrics import pairwise

# conversation1 = df.loc[df['conversation_id'] == 'dlg-00e32998-0b0f-47f1-a4f0-2ce90f1718d0']

# sentences_window = {}
# nrows = len(conversation1['new_text'])
# sentences = conversation1['new_text']
# rep_initiators = np.array(conversation1['DA_rep_init'])
# window_width = 2

# for k,sentence,rep_init in zip(range(nrows),sentences,rep_initiators):
#     if rep_init== "repair_initiator":
#         sentences_window[sentence] = sentences[k-window_width:k+window_width+1]

# rep=pd.DataFrame(sentences_window)#.transpose()
# repair_tok = list(read_corpus(rep.iloc[:,0], tokens_only=True))

# inf_vecs = []
# [inf_vecs.append(model.infer_vector(ut)) for ut in repair_tok]  

# threshold = 0.2
# paraphrasing = []

# for e in range(len(inf_vecs)):
#     counter = e+1
#     while counter < len(inf_vecs):
#         pair_sim = pairwise.cosine_similarity(inf_vecs[e].reshape(1, -1), inf_vecs[counter].reshape(1, -1))
#         print(e, counter, pair_sim)
#         if pair_sim >= threshold:
#             print('YES', str(e), str(counter), str(pair_sim))
#             paraphrasing.append((e,counter,pair_sim))
#         counter +=1

# paraphrasing