In [1]:
import pandas as pd
import sys
sys.path.append('../word-embeddings-benchmark')

from web.datasets.similarity import fetch_MTurk, fetch_MEN, fetch_WS353, fetch_RG65, fetch_RW, fetch_SimLex999, fetch_TR9856
from web.datasets.analogy import fetch_google_analogy

# Important for converting embeddings to a manageable format
from web.embedding import Embedding
from web.embeddings import fetch_HPCA
from web.evaluate import evaluate_similarity, evaluate_analogy

## Embeddings
All embeddings are used without normalization

### GloVe:
400,000 words, 6B tokens with 50, 100, 200, 300 dimensions

### Fast Text
wiki-news 16B tokens with 300 dimensions

### LexVec
Note: the LexVec Common Crawl model (58B tokens, 300 dimensions) is 5 GB and could not be loaded into memory, so the following model is used instead:
Word + Context Vectors, 7B tokens, 300 dimensions

### ConceptNet Number batch
_ tokens, 300 dimensions


In [10]:
# Load time is about 5-10 mins.
# The numeric arguments are presumably the vocabulary size (400000) and the
# vector dimension (300) of the file — TODO confirm against Embedding.from_glove.
# NOTE(review): the recorded output of this cell reports that line 400000 was
# ignored with an out-of-bounds index error; check that the vocabulary size
# passed here matches the number of lines in the file.
glove = Embedding.from_glove('../Data/Embeddings/GloVe/glove.6B.300d.txt', 400000, 300)

We ignored line number 400000 because of errors in parsing
index 400000 is out of bounds for axis 0 with size 400000


In [11]:
# Load time is about 10-15 mins.
# fastText wiki-news vectors (300d, .vec file) read with the word2vec-format loader.
fast_text = Embedding.from_word2vec('../Data/Embeddings/FastText/wiki-news-300d-1M.vec')

In [12]:
# Load time is about 10 mins.
# LexVec enwiki+newscrawl vectors (300d, W+C) read with the word2vec-format loader.
lex_vec = Embedding.from_word2vec('../Data/Embeddings/LexVec/lexvec.enwiki+newscrawl.300d.W+C.pos.vectors')

In [13]:
# ConceptNet Numberbatch (English) vectors read with the word2vec-format loader.
concept_net = Embedding.from_word2vec('../Data/Embeddings/ConceptNet/numberbatch-en.txt')

## Similarity Task

MTurk <br/>
Evaluated by 10 people on a scale of 1 - 5. Scores are multiplied by 2

MEN <br/>
Pairs: 3000
Scores range : 0 - 10 (after rescaling)
Word forms: natural, not lemmatized

WS353 <br/>
Pairs: 353
Scores range : 0 - 10

RG65 <br/>
Pairs: 65
Scores range : 0 - 4. Scaled by factor 10/4

Rare Words <br/>
Pairs: 2034
Scores range : 0 - 10

SimLex 999 <br/>
Pairs: 999
Scores range : 0 - 10

TR9856 <br/>
Pairs: 9856
Scores range : 0 - 1


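| Embedding  | Dim  |  MTurk |  MEN  | WS353 | RG65 |  RW  | Sim999 |  TR9856  |
| ---------- | ---- | ------ | ----- | ----- | ---- | ---- | ------ | -------- |
| Glove      |      |        |       |       |      |      |        |          |
| Glove      |      |        |       |       |      |      |        |          |
| FastText   | 300  | 0.7022 |       |       |      |      |        |          |
| LexVec     | 300  | 0.6480 |       |       |      |      |        |          |
| ConceptNet | 300  | 0.7188 |       |       |      |      |        |          |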

In [15]:
# All similarity datasets, read from the local copies under ../Data/Similarity.
# NOTE(review): the MEN file name ends in "-LEM" while the notebook's notes
# describe the natural (non-lemmatized) form — confirm which variant is intended.
mturk = fetch_MTurk('../Data/Similarity/EN-TRUK.txt')
men = fetch_MEN('../Data/Similarity/EN-MEN-LEM.txt')
ws353 = fetch_WS353('../Data/Similarity/WS353_combined.csv')
rg65 = fetch_RG65('../Data/Similarity/EN-RG-65.txt')
rw = fetch_RW('../Data/Similarity/RW.txt')
simlex = fetch_SimLex999('../Data/Similarity/EN-SIM999.txt')
tr9856 = fetch_TR9856('../Data/Similarity/TR9856.csv')

# Short name -> loaded dataset, kept in the order the datasets are evaluated.
datasets = dict(zip(
    ("mturk", "men", "ws353", "rg65", "rw", "simlex", "tr9856"),
    (mturk, men, ws353, rg65, rw, simlex, tr9856),
))

In [21]:
def compute_similarity(embedding):
    """Score one embedding on every dataset registered in ``datasets``.

    For each entry of the module-level ``datasets`` mapping, the dataset's
    ``'X'`` and ``'y'`` items are passed to ``evaluate_similarity`` together
    with ``embedding``.

    Parameters
    ----------
    embedding
        Embedding object accepted by ``evaluate_similarity``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, in ``datasets`` order, with the columns
        ``"Dataset"`` and ``"Spearman Correlation value"``.
    """
    columns = ["Dataset", "Spearman Correlation value"]

    # Collect all rows first and build the frame once. Enlarging an empty
    # DataFrame through ``.loc`` re-allocates on every insert and leaves the
    # column dtypes to per-row inference.
    rows = [
        (name, evaluate_similarity(embedding, data['X'], data['y']))
        for name, data in datasets.items()
    ]
    return pd.DataFrame(rows, columns=columns)

In [24]:
# GloVe (glove.6B.300d): correlation on each similarity dataset
compute_similarity(glove)

Missing 24 words. Will replace them with mean vector
Missing 260 words. Will replace them with mean vector
Missing 12079 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.633182
1,men,0.737465
2,ws353,0.543524
3,rg65,0.769525
4,rw,0.367045
5,simlex,0.3705
6,tr9856,0.09948


In [23]:
# fastText (wiki-news-300d-1M): correlation on each similarity dataset
compute_similarity(fast_text)

Missing 2 words. Will replace them with mean vector
Missing 68 words. Will replace them with mean vector
Missing 12191 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.702289
1,men,0.790633
2,ws353,0.733276
3,rg65,0.846259
4,rw,0.513497
5,simlex,0.449965
6,tr9856,0.164246


In [25]:
# LexVec (enwiki+newscrawl, 300d, W+C): correlation on each similarity dataset
compute_similarity(lex_vec)

Missing 1 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector
Missing 288 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 12087 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.648025
1,men,0.765485
2,ws353,0.63812
3,rg65,0.79326
4,rw,0.446721
5,simlex,0.36267
6,tr9856,0.131299


In [26]:
# ConceptNet Numberbatch (numberbatch-en): correlation on each similarity dataset
compute_similarity(concept_net)

Missing 24 words. Will replace them with mean vector
Missing 204 words. Will replace them with mean vector
Missing 12142 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.71881
1,men,0.86613
2,ws353,0.76822
3,rg65,0.924829
4,rw,0.568671
5,simlex,0.626787
6,tr9856,0.132052


## Analogy

### Google Analogy

Experiments (In terms of overall accuracy on all categories):<br/>
Per-parameter results are presented as tuples of (overall accuracy %, number of correct answers, total number of questions) <br/>
Results are provided with add rather than mul <br/>
<br/>

<b>GloVe:</b> <br/> 
<i>(Evaluation time: 15 mins on average)</i> <br/>
* 300d : 65%, 12723, 19544
* 200d : 60.8%, 11894, 19544
* 100d : 49.7% , 9730, 19544
* 50d  : 20.4%, 3997, 19544 

<b>FastText:</b> <br/>
<i>(Evaluation time: 20 mins)</i> <br/>
* 300d : 9.2%, 1815, 19544

<b>LexVec </b>
* 300d : 60.4%, 11805, 19544

<b> ConceptNet </b>
* 300d : 31.9%, 6242, 19544

In [3]:
# Google analogy dataset, read from the local copy under ../Data/Analogy.
google_analogy = fetch_google_analogy('../Data/Analogy/EN-GOOGLE.txt')

In [46]:
# Analogy evaluation for ConceptNet Numberbatch, broken down by category.
# No method argument is passed; the notebook's notes say results use "add",
# presumably the library default — TODO confirm.
results = evaluate_analogy(concept_net, google_analogy['X'], google_analogy['y'], category=google_analogy['category'])

In [47]:
# Show the accuracy / correct / count table; the "all" row is the overall score.
results

Unnamed: 0,accuracy,correct,count
all,0.319382,6242,19544
gram1-adjective-to-adverb,0.116935,116,992
gram8-plural,0.162162,216,1332
city-in-state,0.158087,390,2467
gram3-comparative,0.554054,738,1332
gram7-past-tense,0.291026,454,1560
gram2-opposite,0.096059,78,812
gram4-superlative,0.680036,763,1122
gram5-present-participle,0.242424,256,1056
gram6-nationality-adjective,0.537211,859,1599


## List of similarity datasets to be evaluated:
MTurk, MEN, WS353, Rubenstein and Goodenough, Rare Words, SimLex999, TR9856 

In [2]:
# Scratch check: fetch one dataset with the library's default arguments to
# compare it with the local files and make sure the right datasets are in use.
# Uncomment one line at a time.


# dataset = fetch_MTurk()  # Downloaded. Yet to be understood
dataset = fetch_MEN()   # Not downloaded
# dataset = fetch_WS353("relatedness")  # Downloaded. TODO: difference between relatedness and attributional
# dataset = fetch_RG65()  # Downloaded. TODO: evaluate
# dataset = fetch_RW()  # Downloaded

In [3]:
# Inspect the fetched dataset: 'X' holds the word pairs and 'y' one number per pair.
dataset

{'X': array([['sun', 'sunlight'],
        ['automobile', 'car'],
        ['river', 'water'],
        ...,
        ['muscle', 'tulip'],
        ['bikini', 'pizza'],
        ['bakery', 'zebra']], dtype=object), 'y': array([[10. ],
        [10. ],
        [ 9.8],
        ...,
        [ 0.2],
        [ 0.2],
        [ 0. ]])}