# Introduction

Notebook used for training a new Word Embedding as extension of the GloVe-based Word Embedding available trained with the Common Crawln words dataset.

# Load Libraries


In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from gensim.models import Word2Vec

from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

selected_bugreports = fd.Datasets.read_selected_bugreports_df()
all_bugreports = fd.OrigDatasets.read_orig_bugreports_df()
testcases = fd.Datasets.read_testcases_df()
features = fd.Datasets.read_features_df()

SelectedBugReports.shape: (91, 18)
OrigBugReports.shape: (35336, 18)
TestCases.shape: (195, 12)
Features.shape: (19, 8)


# Load Sentences

The sentences used were the bug reports not used in the rest of the research (91 bug reports).

In [2]:
tokenizer = tok.PorterStemmerBased_Tokenizer()

all_sentences = []
for idx,br in all_bugreports.iterrows():
    if br.Bug_Number not in selected_bugreports.Bug_Number:
        all_sentences.append(tokenizer.__call__(br.br_desc))

# Training Model with Gensim Word2Vec

In [3]:
model = Word2Vec(all_sentences, 
                 min_count=3,   # Ignore words that appear less than this
                 size=300,      # Dimensionality of word embeddings
                 workers=2,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 iter=30)

In [4]:
len(model.wv.vocab)

10661

In [5]:
model.most_similar('awesom')

  """Entry point for launching an IPython kernel.


[('locat', 0.5503432154655457),
 ('adress', 0.48869913816452026),
 ('address', 0.486824095249176),
 ('search', 0.47909674048423767),
 ('url', 0.4405985474586487),
 ('nav', 0.4212028384208679),
 ('awesomebar', 0.3989861011505127),
 ('titl', 0.3644767999649048),
 ('foo', 0.36121606826782227),
 ('overlaid', 0.35000863671302795)]

# Save Trained Model

In [9]:
cust_model_path = fd.FilePath.CUST_WORD_EMBEDDING.value
model.wv.save_word2vec_format(cust_model_path)

# Execute the commands

Run the commands into __add\_model\_to\_spacy.sh__ script now

Source: https://stackoverflow.com/questions/50466643/in-spacy-how-to-use-your-own-word2vec-model-created-in-gensim

# Test Loading Model with SpaCy

In [10]:
import spacy
nlp = spacy.load(cust_model_path.replace('.txt',''))

In [11]:
nlp.vocab.length

11575