In [1]:
from gensim.models import Word2Vec
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
import contractions
from gensim.models import KeyedVectors

In [2]:
def load_glove_model(glove_file_path):
    glove_model = KeyedVectors.load_word2vec_format(glove_file_path, binary=False, no_header=True)
    return glove_model

In [3]:
tokenizer = WordPunctTokenizer()

In [4]:
train = pd.read_json('../data/raw/dialogsum/dialogsum.train.jsonl', lines = True)[['dialogue', 'summary']]

In [5]:
def tokenize(text):
    text = contractions.fix(text)
    tokens = tokenizer.tokenize(text)
    clean_tokens = []
    for token in tokens:
        clean_tokens.append(token)
    text = ' '.join(clean_tokens).lower()
    text = text.replace('# person1 #', '#person1#')
    text = text.replace('# person2 #', '#person2#')
    text = text.replace('# person3 #', '#person3#')
    text = text.replace('# person4 #', '#person4#')
    text = text.replace('# person5 #', '#person5#')
    text = text.replace('# person6 #', '#person6#')
    text = text.replace('# person7 #', '#person7#')
    text = text.replace(' ,', ',')
    text = text.replace(' .', '.')
    text = text.replace(' ?', '?')
    text = text.replace(' !', '!')
    text = text.replace(" ' ", "'")
    return text.split()

In [6]:
concat = pd.concat([train['dialogue'], train['summary']])

In [7]:
tokens = list(concat.apply(lambda x : tokenize(x)))

In [8]:
base_model = Word2Vec(vector_size=300, min_count = 20, epochs=20)
base_model.build_vocab(tokens)
total_examples = base_model.corpus_count

In [9]:
base_model.wv.most_similar('#person1#', topn=10)

[('name?', 0.20081743597984314),
 ('pale.', 0.1952628791332245),
 ('francisco', 0.1833544671535492),
 ('flu.', 0.1742691844701767),
 ('duties', 0.17293964326381683),
 ('won', 0.1703324317932129),
 ('mom.', 0.16947486996650696),
 ('university.', 0.16828463971614838),
 ('drama', 0.16771653294563293),
 ('early.', 0.16720540821552277)]

In [10]:
corpus_path = '../embeds/GloVe/glove.corpus.300d.txt'
corpus_model = load_glove_model(corpus_path)
base_model.build_vocab([list(corpus_model.key_to_index.keys())], update=True)

In [11]:
corpus_model.most_similar('#person1#', topn=10)

[('tells', 0.6852829456329346),
 ('asks', 0.6342537999153137),
 ('suggests', 0.5803223848342896),
 ('#person2#', 0.5709367394447327),
 ('recommends', 0.5490970015525818),
 ('thinks', 0.538934588432312),
 ('advises', 0.530464768409729),
 ('#person1#.', 0.5236577391624451),
 ('complains', 0.5215713977813721),
 ('wants', 0.5100038051605225)]

In [12]:
base_model.train(tokens, total_examples=total_examples, epochs=base_model.epochs)
base_model_wv = base_model.wv


In [13]:
base_model_wv.most_similar('#person1#', topn=10)

[('#person2#', 0.8051125407218933),
 ('#person3#', 0.5365425944328308),
 ('sam', 0.5109261870384216),
 ('jeff', 0.4995196461677551),
 ('#person2#.', 0.4818013310432434),
 ('amy', 0.47963079810142517),
 ('#person2#,', 0.4759983420372009),
 ('#person1#.', 0.4756500720977783),
 ('#person1#,', 0.47492074966430664),
 ('chris', 0.4728524088859558)]

In [14]:
base_model_wv.save_word2vec_format('../models/GloVe-Word2Vec/glove.bin')