<a href="https://colab.research.google.com/github/euler16/Exploration/blob/master/Word_Embedding_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Downloading fasttext word vectors
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip
!ls

--2019-11-18 22:03:00--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2019-11-18 22:04:38 (6.73 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   
sample_data  wiki-news-300d-1M.vec  wiki-news-300d-1M.vec.zip


In [0]:
import argparse
import gzip
import math
import numpy as np
import re
import sys
import gensim
from copy import deepcopy

#for tables in Jupyter
from IPython.display import HTML, display
import tabulate


# forgetting about visualization at the moment
from gensim import models
from sklearn.decomposition import PCA as sPCA
from sklearn import manifold #MSD, t-SNE

In [0]:

# Matches any token that contains at least one digit.
isNumber = re.compile(r'\d+.*')
def norm_word(word):
  """Collapse a token to a canonical form.

  Returns '---num---' when the token contains a digit, '---punc---' when
  nothing is left after stripping non-word characters (this includes the
  empty string), and the lower-cased token otherwise.
  """
  lowered = word.lower()
  if isNumber.search(lowered):
    return '---num---'
  # Check the raw token: if only non-word characters were present it is punctuation.
  if not re.sub(r'\W+', '', word):
    return '---punc---'
  return lowered

def read_lexicon(filename):
  """Read a lexicon file into a dict of normalised word -> normalised neighbours.

  Each non-empty line is lower-cased and split on whitespace; the first
  token is the key and the remaining tokens are its neighbours. Every token
  is passed through norm_word. If a key occurs on several lines, the last
  line wins.

  Args:
    filename: path to a whitespace-separated text file.

  Returns:
    dict mapping each normalised head word to a list of normalised words.
  """
  lexicon = {}
  # Context manager so the file handle is released even if parsing fails.
  with open(filename, 'r') as lexiconFile:
    for line in lexiconFile:
      words = line.lower().strip().split()
      # Blank lines have no head word; skip them instead of raising IndexError.
      if not words:
        continue
      lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
  return lexicon

In [3]:
# load word vectors

def read_word_vecs(filename='wiki-news-300d-1M.vec'):
  """Load word vectors from a text file and scale each one to (almost) unit length.

  The first line of the file is skipped as a header. Every following
  non-empty line is lower-cased and split on whitespace: the first token is
  the word, the rest are the vector components. If two lines lower-case to
  the same word, the later line overwrites the earlier one.

  Args:
    filename: path to the vectors file. A name ending in '.gz' is read
      through gzip.

  Returns:
    dict mapping each (str) word to a float numpy array divided by
    sqrt(sum of squares + 1e-6).
  """
  # Text mode ('rt') makes gzip yield str lines, so keys are str in both branches.
  opener = gzip.open if filename.endswith('.gz') else open
  wordVectors = {}
  with opener(filename, 'rt') as fileObject:
    fileObject.readline()  # discard the header line
    for line in fileObject:
      fields = line.strip().lower().split()
      if not fields:
        continue  # tolerate blank lines
      vector = np.array([float(vecVal) for vecVal in fields[1:]], dtype=float)
      # Normalise; the 1e-6 keeps an all-zero vector from dividing by zero.
      vector /= math.sqrt((vector**2).sum() + 1e-6)
      wordVectors[fields[0]] = vector

  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors

# Load vectors from the default file ('wiki-news-300d-1M.vec'); keys are lower-cased words.
wv = read_word_vecs()

Vectors read from: wiki-news-300d-1M.vec 


In [0]:
# retrofitting code
def retrofit(wordVecs, lexicon, numIters):
  """Pull each word vector towards the vectors of its lexicon neighbours.

  For every word that is both in wordVecs and a key of lexicon, and that has
  at least one neighbour in wordVecs, each iteration sets

    new[word] = (n * wordVecs[word] + sum(new[nb] for nb in neighbours)) / (2 * n)

  where n is the number of in-vocabulary neighbours. Updates are applied in
  place during a pass, so later words see the already-updated vectors of
  earlier ones. Words without usable neighbours keep their original vector.

  Args:
    wordVecs: dict word -> numpy vector; not modified.
    lexicon: dict word -> iterable of neighbour words.
    numIters: number of passes over the vocabulary.

  Returns:
    A new dict word -> vector containing the retrofitted vectors.
  """
  newWordVecs = deepcopy(wordVecs)
  wvVocab = set(newWordVecs.keys())
  loopVocab = wvVocab.intersection(set(lexicon.keys()))

  # The in-vocabulary neighbours of a word never change between iterations,
  # so resolve them once instead of rebuilding the set on every pass.
  # Words with no usable neighbours are dropped here (they keep the data estimate).
  neighbourhoods = []
  for word in loopVocab:
    wordNeighbours = tuple(set(lexicon[word]).intersection(wvVocab))
    if wordNeighbours:
      neighbourhoods.append((word, wordNeighbours))

  for _ in range(numIters):
    for word, wordNeighbours in neighbourhoods:
      numNeighbours = len(wordNeighbours)
      # the weight of the data estimate is the number of neighbours
      newVec = numNeighbours * wordVecs[word]
      # loop over neighbours and add to new vector (currently with weight 1)
      for ppWord in wordNeighbours:
        newVec += newWordVecs[ppWord]
      newWordVecs[word] = newVec/(2*numNeighbours)
  return newWordVecs


In [5]:
# getting the lexicon
!wget https://github.com/mfaruqui/retrofitting/raw/master/lexicons/ppdb-xl.txt
!ls

--2019-11-18 22:18:36--  https://github.com/mfaruqui/retrofitting/raw/master/lexicons/ppdb-xl.txt
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mfaruqui/retrofitting/master/lexicons/ppdb-xl.txt [following]
--2019-11-18 22:18:37--  https://raw.githubusercontent.com/mfaruqui/retrofitting/master/lexicons/ppdb-xl.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3948382 (3.8M) [text/plain]
Saving to: ‘ppdb-xl.txt.1’


2019-11-18 22:18:38 (45.1 MB/s) - ‘ppdb-xl.txt.1’ saved [3948382/3948382]

ppdb-xl.txt    sample_data	      wiki-news-300d-1M.vec.zip
ppdb-xl.txt.1  wiki-news-300d-1M.vec


In [0]:
# Read the lexicon from ppdb-xl.txt, then run 100 retrofitting iterations over wv.
lexicon = read_lexicon('ppdb-xl.txt')
wv_retro = retrofit(wv, lexicon, 100)

In [7]:
# Total of the signed component-wise differences between original and retrofitted
# vectors. NOTE(review): positive and negative shifts cancel in this sum, so it is
# not a distance; a norm of the difference may be what is wanted — confirm intent.
sum([np.sum(wv[x]-wv_retro[x]) for x in wv.keys()])

5913.370200771669

In [0]:
# calculating distortion
import itertools
from random import random
def distortion(wv, wv_retro, num_pairs=1000, seed=None):
  '''
    Estimate the distortion of the map f: wv[w] -> wv_retro[w].

      expansion   = max(d(f(x),f(y)) / d(x,y))
      contraction = max(d(x,y) / d(f(x),f(y)))
      distortion  = expansion * contraction

    d is the Euclidean distance and the maxima are taken over pairs of
    distinct words present in both embeddings. If there are at most
    num_pairs such pairs they are all evaluated; otherwise num_pairs pairs
    are drawn at random, so the result is a lower-bound estimate.

    Args:
      wv: dict word -> vector (original embedding).
      wv_retro: dict word -> vector (transformed embedding).
      num_pairs: maximum number of word pairs to evaluate (must be >= 1).
      seed: optional seed for the pair sampler, for reproducible estimates.

    Returns:
      float, expansion * contraction.

    Raises:
      ValueError: if num_pairs < 1 or fewer than two words are shared.
  '''
  # Function-scope imports: the module-level `from random import random`
  # binds the function, not the module, so it cannot provide a sampler.
  import itertools
  import random as _random

  if num_pairs < 1:
    raise ValueError("num_pairs must be at least 1")

  # Small constant in the denominators so coincident points do not divide by zero.
  delta = 0.00000001

  words = [w for w in wv if w in wv_retro]
  if len(words) < 2:
    raise ValueError("distortion needs at least two words present in both embeddings")

  # Never materialise the full cross product: it is quadratic in the vocabulary.
  total_pairs = len(words) * (len(words) - 1) // 2
  if total_pairs <= num_pairs:
    pairs = itertools.combinations(words, 2)
  else:
    rng = _random.Random(seed)
    # sample(words, 2) always returns two distinct positions, hence x != y.
    pairs = (rng.sample(words, 2) for _ in range(num_pairs))

  e = 0.0
  c = 0.0
  for x, y in pairs:
    d_retro = float(np.linalg.norm(np.subtract(wv_retro[x], wv_retro[y])))
    d_orig = float(np.linalg.norm(np.subtract(wv[x], wv[y])))

    e = max(e, d_retro / (d_orig + delta))
    c = max(c, d_orig / (d_retro + delta))

  return e*c

In [0]:
# Compare the original embedding against its retrofitted counterpart.
distortion(wv, wv_retro)