In [1]:
import pandas as pd
import sys
sys.path.append('../word-embeddings-benchmark')

from web.datasets.similarity import fetch_MTurk, fetch_MEN, fetch_WS353, fetch_RG65, fetch_RW, fetch_SimLex999, fetch_TR9856
from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy

#Important for converting embeddings to a manageable format
from web.embedding import Embedding
from web.embeddings import fetch_HPCA
from web.evaluate import evaluate_similarity, evaluate_analogy

## Embeddings
All embeddings are used without normalization

### GloVe:
400,000 words, 6B tokens with 50, 100, 200, 300 dimensions

### Fast Text
wiki-news 16B tokens with 300 dimensions

### LexVec
Note: LexVec Common Crawl (58B tokens, 300 dimensions) was 5GB, hence unable to load into memory
Word + Context Vectors, 7B tokens, 300 dimensions

### ConceptNet Numberbatch
_ tokens, 300 dimensions


In [3]:
#Load time is about 5-10 mins
# The two numeric arguments match the GloVe notes in this notebook
# (400,000 words, 300 dimensions) -- presumably vocabulary size and vector
# dimension; TODO confirm against Embedding.from_glove.
# NOTE(review): the recorded output of this cell reports that line 400000 was
# ignored with an out-of-bounds error, so the file may contain more rows than
# the size passed here -- verify before relying on the last entry.
glove = Embedding.from_glove('../Data/Embeddings/GloVe/glove.6B.300d.txt', 400000, 300)

We ignored line number 400000 because of errors in parsing
index 400000 is out of bounds for axis 0 with size 400000


In [4]:
#Load time is about 10-15 mins
fast_text = Embedding.from_word2vec('../Data/Embeddings/FastText/wiki-news-300d-1M.vec')

In [5]:
#Load time is about 10 mins
lex_vec = Embedding.from_word2vec('../Data/Embeddings/LexVec/lexvec.enwiki+newscrawl.300d.W+C.pos.vectors')

In [6]:
#load time is about 10 mins
concept_net = Embedding.from_word2vec('../Data/Embeddings/ConceptNet/numberbatch-en.txt')

In [7]:
#Google news
google_news = Embedding.from_word2vec('../Data/Embeddings/GoogleNews/GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
#PDC
pdc = Embedding.from_word2vec('../Data/Embeddings/PDC/wikicorp.201004-pdc-iter-20-alpha-0.05-window-10-dim-300-neg-10-subsample-0.0001.txt')

In [9]:
#HDC
hdc = Embedding.from_word2vec('../Data/Embeddings/HDC/wikicorp.201004-hdc-iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt')

## Similarity Task

MTurk <br/>
Evaluated by 10 people on a scale of 1 - 5. Scores are multiplied by 2

MEN <br/>
Pairs: 3000
Scores range : 0 - 10 (after rescaling)
Natural form, not lemmatized form. Scores were rescaled to stay within the 0 - 10 range

WS353 <br/>
Pairs: 353
Scores range : 0 - 10

RG65 <br/>
Pairs: 65
Scores range : 0 - 4. Scaled by factor 10/4

Rare Words <br/>
Pairs: 2034
Scores range : 0 - 10

SimLex 999 <br/>
Pairs: 999
Scores range : 0 - 10

TR9856 <br/>
Pairs: 9856
Scores range : 0 - 1


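| Embedding  | Dim  |  MTurk |  MEN  | WS353 | RG65 |  RW  | Sim999 |  TR9856  |
| ---------- | ---- | ------ | ----- | ----- | ---- | ---- | ------ | -------- |
| Glove      |      |        |       |       |      |      |        |          |
| Glove      |      |        |       |       |      |      |        |          |
| FastText   | 300  | 0.7022 |       |       |      |      |        |          |
| LexVec     | 300  | 0.6480 |       |       |      |      |        |          |
| ConceptNet | 300  | 0.7188 |       |       |      |      |        |          |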

In [15]:
# Load every word-similarity benchmark from the local Data/Similarity folder.
# NOTE(review): MEN is read from the "-LEM" file, whose name suggests the
# lemmatized variant, while the notes in this notebook describe the natural
# form -- confirm which one is intended.
mturk = fetch_MTurk('../Data/Similarity/EN-TRUK.txt')
men = fetch_MEN('../Data/Similarity/EN-MEN-LEM.txt')
ws353 = fetch_WS353('../Data/Similarity/WS353_combined.csv')
rg65 = fetch_RG65('../Data/Similarity/EN-RG-65.txt')
rw = fetch_RW('../Data/Similarity/RW.txt')
simlex = fetch_SimLex999('../Data/Similarity/EN-SIM999.txt')
tr9856 = fetch_TR9856('../Data/Similarity/TR9856.csv')

# Registry iterated by compute_similarity: short name -> loaded dataset.
datasets = dict(
    mturk=mturk,
    men=men,
    ws353=ws353,
    rg65=rg65,
    rw=rw,
    simlex=simlex,
    tr9856=tr9856,
)

In [21]:
def compute_similarity(embedding):
    """Evaluate ``embedding`` on every similarity dataset in ``datasets``.

    Parameters
    ----------
    embedding
        Embedding object handed unchanged to ``evaluate_similarity``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the module-level ``datasets`` dict, in its
        iteration order, with the columns "Dataset" (the dict key) and
        "Spearman Correlation value" (whatever ``evaluate_similarity``
        returned for that dataset's 'X' and 'y').
    """
    columns = ["Dataset", "Spearman Correlation value"]

    # Gather all rows first and build the frame once, instead of enlarging
    # the DataFrame with ``.loc[len(df)]`` on every loop iteration.
    rows = [
        (name, evaluate_similarity(embedding, data['X'], data['y']))
        for name, data in datasets.items()
    ]

    return pd.DataFrame(rows, columns=columns)

In [24]:
#GloVe
compute_similarity(glove)

Missing 24 words. Will replace them with mean vector
Missing 260 words. Will replace them with mean vector
Missing 12079 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.633182
1,men,0.737465
2,ws353,0.543524
3,rg65,0.769525
4,rw,0.367045
5,simlex,0.3705
6,tr9856,0.09948


In [23]:
#Fast Text
compute_similarity(fast_text)

Missing 2 words. Will replace them with mean vector
Missing 68 words. Will replace them with mean vector
Missing 12191 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.702289
1,men,0.790633
2,ws353,0.733276
3,rg65,0.846259
4,rw,0.513497
5,simlex,0.449965
6,tr9856,0.164246


In [25]:
#Lex Vec
compute_similarity(lex_vec)

Missing 1 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector
Missing 288 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 12087 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.648025
1,men,0.765485
2,ws353,0.63812
3,rg65,0.79326
4,rw,0.446721
5,simlex,0.36267
6,tr9856,0.131299


In [26]:
#ConceptNet Number batch
compute_similarity(concept_net)

Missing 24 words. Will replace them with mean vector
Missing 204 words. Will replace them with mean vector
Missing 12142 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.71881
1,men,0.86613
2,ws353,0.76822
3,rg65,0.924829
4,rw,0.568671
5,simlex,0.626787
6,tr9856,0.132052


In [39]:
#Google News
compute_similarity(google_news)

Missing 12 words. Will replace them with mean vector
Missing 55 words. Will replace them with mean vector
Missing 216 words. Will replace them with mean vector
Missing 12282 words. Will replace them with mean vector


Unnamed: 0,Dataset,Spearman Correlation value
0,mturk,0.680567
1,men,0.758586
2,ws353,0.700017
3,rg65,0.760783
4,rw,0.49772
5,simlex,0.441966
6,tr9856,0.181381


## Analogy

### Google Analogy

Experiments (In terms of overall accuracy on all categories):<br/>
Per-parameter results are presented as tuples of (overall accuracy %, number of correct answers, total number of questions) <br/>
Results are computed with the additive (add) method rather than the multiplicative (mul) one <br/>
<br/>

<b>GloVe:</b> <br/> 
<i>(Evaluation time: 15 mins on average)</i> <br/>
* 300d : 65%, 12723, 19544
* 200d : 60.8%, 11894, 19544
* 100d : 49.7% , 9730, 19544
* 50d  : 20.4%, 3997, 19544 

<b>FastText:</b> <br/>
<i>(Evaluation time: 20 mins)</i> <br/>
* 300d : 9.2%, 1815, 19544

<b>LexVec </b>
* 300d : 60.4%, 11805, 19544

<b> ConceptNet </b>
* 300d : 31.9%, 6242, 19544

In [12]:
def compute_analogy(embedding, dataset):
    """Run ``evaluate_analogy`` for ``embedding`` on a single analogy dataset.

    ``dataset`` must provide the keys 'X', 'y' and 'category'. The first two
    are forwarded as the second and third positional arguments and the last
    one as the ``category`` keyword. Whatever ``evaluate_analogy`` returns is
    passed back unchanged.
    """
    inputs = dataset['X']
    targets = dataset['y']
    categories = dataset['category']
    return evaluate_analogy(embedding, inputs, targets, category=categories)

In [27]:
google_analogy = fetch_google_analogy('../Data/Analogy/EN-GOOGLE.txt')

In [31]:
#GloVe
results = compute_analogy(glove, google_analogy)
results

Unnamed: 0,accuracy,correct,count
all,0.650993,12723,19544
gram8-plural,0.706456,941,1332
gram5-present-participle,0.528409,558,1056
currency,0.189376,164,866
capital-world,0.912246,4127,4524
gram4-superlative,0.628342,705,1122
city-in-state,0.602351,1486,2467
gram3-comparative,0.804805,1072,1332
gram9-plural-verbs,0.498851,434,870
gram2-opposite,0.20936,170,812


In [32]:
#Fast Text
results = compute_analogy(fast_text, google_analogy)
results

Missing 4667 words. Will replace them with mean vector


Unnamed: 0,accuracy,correct,count
all,0.092867,1815,19544
gram8-plural,0.0,0,1332
gram5-present-participle,0.171402,181,1056
currency,0.0,0,866
capital-world,0.0,0,4524
gram4-superlative,0.291444,327,1122
city-in-state,0.0,0,2467
gram3-comparative,0.444444,592,1332
gram9-plural-verbs,0.150575,131,870
gram2-opposite,0.1133,92,812


In [33]:
#Lex Vec
results = compute_analogy(lex_vec, google_analogy)
results

Unnamed: 0,accuracy,correct,count
all,0.604022,11805,19544
gram8-plural,0.560811,747,1332
gram5-present-participle,0.366477,387,1056
currency,0.271363,235,866
capital-world,0.862069,3900,4524
gram4-superlative,0.491979,552,1122
city-in-state,0.630726,1556,2467
gram3-comparative,0.755255,1006,1332
gram9-plural-verbs,0.391954,341,870
gram2-opposite,0.200739,163,812


In [34]:
#ConceptNet Number batch
results = compute_analogy(concept_net, google_analogy)
results

Unnamed: 0,accuracy,correct,count
all,0.319382,6242,19544
gram8-plural,0.162162,216,1332
gram5-present-participle,0.242424,256,1056
currency,0.128176,111,866
capital-world,0.388815,1759,4524
gram4-superlative,0.680036,763,1122
city-in-state,0.158087,390,2467
gram3-comparative,0.554054,738,1332
gram9-plural-verbs,0.243678,212,870
gram2-opposite,0.096059,78,812


In [40]:
#Google News
results = compute_analogy(google_news, google_analogy)
results

Missing 9402 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


Unnamed: 0,accuracy,correct,count
all,0.012331,241,19544
gram8-plural,0.04955,66,1332
gram5-present-participle,0.024621,26,1056
currency,0.006928,6,866
capital-world,0.001326,6,4524
gram4-superlative,0.001783,2,1122
city-in-state,0.021484,53,2467
gram3-comparative,0.008258,11,1332
gram9-plural-verbs,0.045977,40,870
gram2-opposite,0.0,0,812


In [42]:
#PDC
results = compute_analogy(pdc, google_analogy)
results

Unnamed: 0,accuracy,correct,count
all,0.380833,7443,19544
gram8-plural,0.491742,655,1332
gram5-present-participle,0.263258,278,1056
currency,0.027714,24,866
capital-world,0.478338,2164,4524
gram4-superlative,0.395722,444,1122
city-in-state,0.475882,1174,2467
gram3-comparative,0.53003,706,1332
gram9-plural-verbs,0.52069,453,870
gram2-opposite,0.221675,180,812


In [44]:
#HDC
results = compute_analogy(hdc, google_analogy)
results

  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


Unnamed: 0,accuracy,correct,count
all,0.288682,5642,19544
gram8-plural,0.205706,274,1332
gram5-present-participle,0.155303,164,1056
currency,0.124711,108,866
capital-world,0.403846,1827,4524
gram4-superlative,0.205882,231,1122
city-in-state,0.407377,1005,2467
gram3-comparative,0.292793,390,1332
gram9-plural-verbs,0.414943,361,870
gram2-opposite,0.158867,129,812


In [10]:
msr = fetch_msr_analogy('../Data/Analogy/EN-MSR.txt')

In [13]:
#GloVe
results = compute_analogy(glove, msr)
results

Missing 164 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


Unnamed: 0,accuracy,correct,count
all,0.514125,4113,8000
jjs_jj,0.35,175,500
jj_jjs,0.478,239,500
nnpos_nn,0.456,228,500
jjs_jjr,0.574,287,500
jj_jjr,0.596,298,500
nns_nn,0.518,259,500
vb_vbz,0.656,328,500
vbz_vb,0.78,390,500
nn_nns,0.68,340,500


In [14]:
#Fast Text
results = compute_analogy(fast_text, msr)
results

Unnamed: 0,accuracy,correct,count
all,0.1785,1428,8000
jjs_jj,0.09,45,500
jj_jjs,0.186,93,500
nnpos_nn,0.0,0,500
jjs_jjr,0.25,125,500
jj_jjr,0.312,156,500
nns_nn,0.0,0,500
vb_vbz,0.216,108,500
vbz_vb,0.418,209,500
nn_nns,0.0,0,500


In [15]:
#Lex Vec
results = compute_analogy(lex_vec, msr)
results

Missing 131 words. Will replace them with mean vector


Unnamed: 0,accuracy,correct,count
all,0.37075,2966,8000
jjs_jj,0.222,111,500
jj_jjs,0.45,225,500
nnpos_nn,0.276,138,500
jjs_jjr,0.472,236,500
jj_jjr,0.54,270,500
nns_nn,0.322,161,500
vb_vbz,0.408,204,500
vbz_vb,0.504,252,500
nn_nns,0.51,255,500


In [16]:
#Concept Net
results = compute_analogy(concept_net, msr)
results

Missing 302 words. Will replace them with mean vector


Unnamed: 0,accuracy,correct,count
all,0.4395,3516,8000
jjs_jj,0.602,301,500
jj_jjs,0.522,261,500
nnpos_nn,0.336,168,500
jjs_jjr,0.79,395,500
jj_jjr,0.43,215,500
nns_nn,0.39,195,500
vb_vbz,0.306,153,500
vbz_vb,0.704,352,500
nn_nns,0.048,24,500


In [17]:
#Google News
results = compute_analogy(google_news, msr)
results

Unnamed: 0,accuracy,correct,count
all,0.006625,53,8000
jjs_jj,0.004,2,500
jj_jjs,0.01,5,500
nnpos_nn,0.004,2,500
jjs_jjr,0.012,6,500
jj_jjr,0.022,11,500
nns_nn,0.008,4,500
vb_vbz,0.006,3,500
vbz_vb,0.0,0,500
nn_nns,0.004,2,500


In [18]:
#PDC
results = compute_analogy(pdc, msr)
results

Missing 241 words. Will replace them with mean vector


Unnamed: 0,accuracy,correct,count
all,0.26525,2122,8000
jjs_jj,0.068,34,500
jj_jjs,0.264,132,500
nnpos_nn,0.18,90,500
jjs_jjr,0.254,127,500
jj_jjr,0.28,140,500
nns_nn,0.196,98,500
vb_vbz,0.652,326,500
vbz_vb,0.39,195,500
nn_nns,0.298,149,500


In [19]:
#HDC
results = compute_analogy(hdc, msr)
results

Missing 241 words. Will replace them with mean vector


Unnamed: 0,accuracy,correct,count
all,0.1395,1116,8000
jjs_jj,0.026,13,500
jj_jjs,0.222,111,500
nnpos_nn,0.1,50,500
jjs_jjr,0.14,70,500
jj_jjr,0.192,96,500
nns_nn,0.092,46,500
vb_vbz,0.366,183,500
vbz_vb,0.144,72,500
nn_nns,0.146,73,500


In [46]:
#ConceptNet
# Go through the shared compute_analogy helper, like the other analogy cells,
# instead of spelling out the evaluate_analogy call by hand.
results = compute_analogy(concept_net, google_analogy)
results

## List of similarity datasets to be evaluated:
MTurk, MEN, WS353, Rubenstein and Goodenough, Rare Words, SimLex999, TR9856 

In [2]:
#Just downloading the files to compare and ensure I have the right datasets


# dataset = fetch_MTurk()  #Downloaded. Yet to be understood
# dataset = fetch_MEN()   #Not Downloaded
# dataset = fetch_WS353("relatedness") #Downloaded. Difference between relatedness and attributional
# dataset = fetch_RG65() #Downloaded : Evaluate
# dataset = fetch_RW() #Downloaded : Downloaded

In [46]:
msr = fetch_msr_analogy()


Dataset created in C:\Users\johan/web_data\analogy/EN-MSR

