# Using premade word embeddings

Download embeddings from (half way down page): https://code.google.com/archive/p/word2vec/

In [1]:
# Import packages
import os

import gensim
import nltk
import pandas as pd


import warnings
warnings.filterwarnings('ignore')

## Load pre-trained vectors from file

In [2]:
# Data directory
data_dir = "/Users/clbi/Documents/conferences/UiO_NLP_Oct2019/data/"

# Load vectors 
gpath = os.path.join(data_dir, "GoogleNews-vectors-negative300.bin")
google_w2v = gensim.models.KeyedVectors.load_word2vec_format(gpath,
                                                             binary=True)

## Basic word vector maths

king - man + woman

In [3]:
print (google_w2v.most_similar(positive=["king","woman"], negative = ["man"])[0])

('queen', 0.7118192911148071)


A more useful example: 
    capital of Scotland?

In [4]:
print (google_w2v.most_similar(positive=["Oslo","Scotland"], negative = ["Norway"])[0])

('Edinburgh', 0.7040923833847046)


# Custom word embeddings

<img src="corpi.png" alt="alt text" width="1000"/>

In [5]:
text_filepath = os.path.join(data_dir, "tmp.txt")

In [6]:
# Read in text line by line
with open(text_filepath) as f:
    content = f.readlines()
    
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content] 

In [7]:
# take a quick look at what has been read in
content[:5]

['You’re about to spend several hours of your life reading about Git. Let’s take a minute to explain',
 'what we have in store for you. Here is a quick summary of the ten chapters and three appendices of',
 'this book.',
 'In Chapter 1, we’re going to cover Version Control Systems (VCSs) and Git basics — no technical',
 'stuff, just what Git is, why it came about in a land full of VCSs, what sets it apart, and why so many']

In [8]:
# check how many lines
len(content)

14745

### The input for training the model requires a list of tokenised sentences 

In [9]:
# Join lines together so it becomes one long line
text = " ".join(content)

# Separate out the sentences 
sentences = nltk.sent_tokenize(text)

# Seperate out each word within each sentence
tokenised_sents = [nltk.word_tokenize(sent.lower()) for sent in sentences]

### Check how large our vocabulary is

In [10]:
total_tokens = [t for sent in tokenised_sents for t in sent]

print (len(total_tokens))
print (len(set(total_tokens)))

169587
9103


In [11]:
print (sentences[0])
print (tokenised_sents[0])

You’re about to spend several hours of your life reading about Git.
['you', '’', 're', 'about', 'to', 'spend', 'several', 'hours', 'of', 'your', 'life', 'reading', 'about', 'git', '.']


### Skipgram model creation from the tokenised sentences

In [12]:
# Skip-gram model
git_sg = gensim.models.Word2Vec(tokenised_sents, sg=1, min_count=2, window=5, size=300)
git_sg.train(tokenised_sents, total_examples=len(tokenised_sents), epochs=250)

(29377357, 42396750)

### Continuous Bag of Words model creation from the tokenised sentences

In [13]:
# CBoW model
git_cbow = gensim.models.Word2Vec(tokenised_sents, sg=0, min_count=2, window=5, size=300)
git_cbow.train(tokenised_sents, total_examples=len(tokenised_sents), epochs=250)

(29375361, 42396750)

## Comparing the two models and the pretrained embeddings

In [14]:
def comparing_embeddings(word, g_emb, sg_emb, cbow_emb):
    g    = pd.DataFrame(g_emb.most_similar(positive=[word])[:5],columns=["g_name","g_score"])
    sg   = pd.DataFrame(sg_emb.most_similar(positive=[word])[:5],columns=["sg_name","sg_score"])
    cbow = pd.DataFrame(cbow_emb.most_similar(positive=[word])[:5],columns=["cbow_name","cbow_score"])
    
    df = pd.concat([g, sg, cbow],axis = 1)
    display (df)

In [15]:
word = 'issue' 

comparing_embeddings(word, google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,issues,0.707202,53,0.437099,bug,0.339078
1,thorny_issue,0.598144,ticket,0.321185,num,0.289421
2,problem,0.564546,iss91v2,0.308129,comments,0.287643
3,isssue,0.561899,concentrate,0.295148,avatar,0.28665
4,topic,0.552595,usability,0.290826,platform,0.28647


In [16]:
comparing_embeddings("branch", google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,branches,0.774664,master,0.395214,branches,0.405472
1,Branches,0.580219,9fd905e,0.332735,master,0.393785
2,braches,0.556314,branches,0.330581,work,0.356721
3,offices,0.547974,sf,0.329037,merge,0.354294
4,bank,0.54309,'origin/master,0.324644,repository,0.335536


In [17]:
comparing_embeddings("repository", google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,repositories,0.768756,copy,0.294594,project,0.462332
1,searchable_repository,0.628594,my_project,0.285671,attach,0.35324
2,centralized_repository,0.625289,clone,0.283222,branch,0.335536
3,database,0.603871,133,0.273846,server,0.315999
4,metadata_repository,0.557632,initialize,0.263398,directory,0.310902


In [18]:
comparing_embeddings("master", google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,masters,0.631445,branch,0.395214,origin/master,0.425256
1,Rolodex_Khuzami,0.571832,'origin/master,0.371425,branch,0.393785
2,Master,0.569733,slower-blink,0.368018,fetch_head,0.387405
3,suspense_Alfred_Hitchcock,0.563138,.git/objects/pack/pack-e80e,0.360461,iss53,0.38054
4,Gary_Golkiewicz_chief,0.537977,fbff5bc,0.359547,serverfix,0.377777


In [19]:
comparing_embeddings("rebase", google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,rerate,0.521535,rewinding,0.39897,amend,0.321743
1,rebasing,0.50749,cherry-pick,0.385342,rewinding,0.307802
2,Cizdyn,0.470737,force-pushed,0.381862,cherry-pick,0.280399
3,mediumterm,0.470616,rebases,0.361529,filter-branch,0.276505
4,longerterm,0.462676,work..,0.357625,-i,0.273116


In [23]:
comparing_embeddings("owner", google_w2v, git_sg, git_cbow)

Unnamed: 0,g_name,g_score,sg_name,sg_score,cbow_name,cbow_score
0,Owner,0.758827,contributor,0.400518,33,0.358958
1,proprietor,0.704992,closes,0.381352,clicking,0.348276
2,owners,0.687797,commenting,0.377992,group,0.32917
3,coowner,0.625816,suggested,0.377306,members,0.31898
4,owns,0.545936,thread,0.364047,results,0.315952
