In [29]:
import pandas as pd
import sys
sys.path.append('../word-embeddings-benchmark')

from web.datasets.similarity import fetch_MTurk, fetch_WS353, fetch_RG65, fetch_RW
from web.datasets.analogy import fetch_google_analogy

#Important for converting embeddings to a manageable format
from web.embedding import Embedding
from web.embeddings import fetch_HPCA
from web.evaluate import evaluate_similarity, evaluate_analogy

## Embeddings
All embeddings are used without normalization

### GloVe:
400,000 words, 6B tokens with 50, 100, 200, 300 dimensions

### FastText
wiki-news 16B tokens with 300 dimensions

### LexVec
Note: The LexVec Common Crawl model (58B tokens, 300 dimensions) is 5GB and hence could not be loaded into memory. <br/>
The Word + Context Vectors (7B tokens, 300 dimensions) are used instead.

### ConceptNet Numberbatch
_ tokens, 300 dimensions


In [17]:
#Load time is about 5-10 mins
# Load the GloVe 6B 300d vectors. The trailing arguments (400000, 300) are
# presumably the vocabulary size and vector dimension expected by
# Embedding.from_glove -- TODO confirm against web.embedding.
# NOTE(review): the recorded output of this cell reports that line number 400000
# was ignored ("index 400000 is out of bounds for axis 0 with size 400000"), so
# the file appears to contain one more line than the declared size; verify that
# no real word vector is being dropped.
glove = Embedding.from_glove('../Data/Embeddings/GloVe/glove.6B.300d.txt', 400000, 300)

We ignored line number 400000 because of errors in parsing
index 400000 is out of bounds for axis 0 with size 400000


In [25]:
#Load time is about 10-15 mins
# Load the FastText wiki-news 300d vectors (1M entries, per the file name)
# through the same word2vec loader used for the other embeddings below.
fast_text = Embedding.from_word2vec('../Data/Embeddings/FastText/wiki-news-300d-1M.vec')

In [38]:
#Load time is about 10 mins
# Load the LexVec enwiki+newscrawl 300d "W+C" vectors; this is the smaller model
# used because the Common Crawl model did not fit in memory (see the Embeddings notes).
lex_vec = Embedding.from_word2vec('../Data/Embeddings/LexVec/lexvec.enwiki+newscrawl.300d.W+C.pos.vectors')

In [44]:
# Load the English ConceptNet Numberbatch vectors through the word2vec loader.
# `concept_net` is the embedding evaluated in the similarity and analogy cells below.
concept_net = Embedding.from_word2vec('../Data/Embeddings/ConceptNet/numberbatch-en.txt')

## Similarity Task


| Embedding  | Dim  |  MTurk |
| ---------- | ---- | ------ |
| Glove      |      |        |
| Glove      |      |        |
| FastText   | 300  | 0.7022 |
| LexVec     | 300  | 0.6480 |
| ConceptNet | 300  | 0.7188 |

In [32]:
def read_txt(file):
    """Read a headerless, single-space-delimited text file into a DataFrame.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input line; columns are labelled 0, 1, 2, ... because the
        file is read without a header row.
    """
    frame = pd.read_csv(file, sep=" ", header=None)
    return frame


In [39]:
#MTurk
# Read the local copy of the MTurk similarity dataset as a headerless,
# space-separated table.
# NOTE(review): the file name is spelled "EN-TRUK" on purpose here -- presumably
# that is how the data file is named on disk; confirm before "correcting" it.
mturk = read_txt('../Data/Similarity/EN-TRUK.txt')

In [40]:
# Split the MTurk table into model inputs and targets in one step:
# X = first two columns (presumably the word pair), y = last column
# (presumably the human similarity score), both as NumPy arrays.
X, y = mturk.iloc[:, :2].values, mturk.iloc[:, -1].values

In [45]:
# Score the ConceptNet embedding on the MTurk pairs (X) against the reference
# scores (y). The recorded result, 0.7188, is the value entered in the
# Similarity Task table. Presumably a rank correlation -- confirm in web.evaluate.
evaluate_similarity(concept_net, X, y)

0.7188101234342916

## Analogy

### Google Analogy

Experiments (In terms of overall accuracy on all categories):<br/>
Per-configuration results are presented as tuples of (overall accuracy %, number of correct answers, total number of questions) <br/>
Results are computed with the additive method (`add`) rather than the multiplicative one (`mul`) <br/>
<br/>

<b>GloVe:</b> <br/> 
<i>(Evaluation time: 15 mins on average)</i> <br/>
* 300d : 65%, 12723, 19544
* 200d : 60.8%, 11894, 19544
* 100d : 49.7% , 9730, 19544
* 50d  : 20.4%, 3997, 19544 

<b>FastText:</b> <br/>
<i>(Evaluation time: 20 mins)</i> <br/>
* 300d : 9.2%, 1815, 19544

<b>LexVec </b>
* 300d : 60.4%, 11805, 19544

<b> ConceptNet </b>
* 300d : 31.9%, 6242, 19544

In [3]:
# Load the Google analogy dataset. The result is indexed below with the keys
# 'X', 'y' and 'category'.
# NOTE(review): a local file path is passed to fetch_google_analogy here; confirm
# that the version of the `web` package on sys.path accepts a path argument.
google_analogy = fetch_google_analogy('../Data/Analogy/EN-GOOGLE.txt')

In [46]:
# Evaluate the ConceptNet embedding on the Google analogy questions; passing
# `category` yields the per-category breakdown shown when `results` is displayed.
# To reproduce the numbers listed for the other embeddings, swap `concept_net`
# for `glove`, `fast_text` or `lex_vec`.
results = evaluate_analogy(concept_net, google_analogy['X'], google_analogy['y'], category=google_analogy['category'])

In [47]:
# Display the analogy results table (columns in the recorded output: accuracy, correct, count).
results

Unnamed: 0,accuracy,correct,count
all,0.319382,6242,19544
gram1-adjective-to-adverb,0.116935,116,992
gram8-plural,0.162162,216,1332
city-in-state,0.158087,390,2467
gram3-comparative,0.554054,738,1332
gram7-past-tense,0.291026,454,1560
gram2-opposite,0.096059,78,812
gram4-superlative,0.680036,763,1122
gram5-present-participle,0.242424,256,1056
gram6-nationality-adjective,0.537211,859,1599


## List of similarity datasets to be evaluated:
MTurk, MEN, WS353, Rubenstein and Goodenough, Rare Words, SimLex999, TR9856 

In [15]:
#Just downloading the files to compare and ensure I have the right datasets.
#Exactly one fetch is left active so that `dataset`, which the following cell
#displays, is defined after Restart Kernel -> Run All (previously every line was
#commented out and `dataset` only existed as leftover kernel state).
#Swap which line is uncommented to inspect a different dataset.

# dataset = fetch_MTurk()  #Downloaded. Yet to be understood
# dataset = fetch_MEN()   #Not downloaded (fetch_MEN is also not in the import cell at the top)
# dataset = fetch_WS353("relatedness") #Downloaded. Difference between relatedness and attributional
# dataset = fetch_RG65() #Downloaded. To be evaluated
dataset = fetch_RW() #Downloaded. Rare Words; appears to match the recorded output (pairs such as 'squishing'/'squirt')

In [16]:
# Display the fetched dataset to inspect its structure (the recorded output has
# keys 'X', 'y' and 'sd'). Requires `dataset` to have been assigned by one of
# the fetch_* calls in the preceding cell.
dataset

{'X': array([['squishing', 'squirt'],
        ['undated', 'undatable'],
        ['circumvents', 'beat'],
        ...,
        ['irredeemable', 'wicked'],
        ['irredeemable', 'inconvertible'],
        ['snickering', 'laugh']], dtype=object),
 'y': array([5.88, 5.83, 5.33, ..., 6.43, 6.57, 7.71]),
 'sd': nan}