# Purpose
### The GoogleNews word2vec model is too large to fit in memory and it takes a long time to compute similarity scores. This notebook precomputes all the query terms.

There are 1657 query terms

The mean similarity score is 0.5746 and standard deviation is 0.0950

In [1]:
import gensim

In [2]:
%%time
# Load top 1 million words from Google's pre-trained Word2Vec model (3 billion words total)
model = gensim.models.KeyedVectors.load_word2vec_format('./languageModels/GoogleNews-vectors-negative300.bin', limit=1000000, binary=True)  

CPU times: user 24.3 s, sys: 1.98 s, total: 26.3 s
Wall time: 27.4 s


In [33]:
# Gather unique query terms
import re
terms = set()
for path in ['../datasets/cran/cran.qry', '../datasets/adi/ADI.QRY', 
             '../datasets/med/MED.QRY', '../datasets/time/TIME_clean.QUE']:
    with open(path, 'r') as qfile:
        for line in qfile:
            if '.I' not in line or '.W' not in line:
                terms |= set(re.split('[^a-zA-Z]+', line))
# Remove stop words
stop_words = {"a", "about", "above", "all", "along","also", "although", "am", "an", "and", "any", "are", "aren't", "as", "at","be", "because", "been", "but", "by", "can", "cannot", "could", "couldn't","did", "didn't", "do", "does", "doesn't", "e.g.", "either", "etc", "etc.","even", "ever", "enough", "for", "from", "further", "get", "gets", "got", "had", "have","hardly", "has", "hasn't", "having", "he", "hence", "her", "here","hereby", "herein", "hereof", "hereon", "hereto", "herewith", "him","his", "how", "however", "i", "i.e.", "if", "in", "into", "it", "it's", "its","me", "more", "most", "mr", "my", "near", "nor", "now", "no", "not", "or", "on", "of", "onto","other", "our", "out", "over", "really", "said", "same", "she","should", "shouldn't", "since", "so", "some", "such","than", "that", "the", "their", "them", "then", "there", "thereby","therefore", "therefrom", "therein", "thereof", "thereon", "thereto","therewith", "these", "they", "this", "those", "through", "thus", "to","too", "under", "until", "unto", "upon", "us", "very", "was", "wasn't","we", "were", "what", "when", "where", "whereby", "wherein", "whether","which", "while", "who", "whom", "whose", "why", "with", "without","would", "you", "your", "yours", "yes"}
terms -= stop_words
# Remove small words
min_word_len = 2
terms = {x.lower() for x in terms if len(x) >= min_word_len}

In [46]:
# Gather and stem all similar words
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
table = {}
count = 1
total = len(terms)
print('Total number of terms = ' + str(total))
for term in terms:
    try:
        sims = model.most_similar(term)
        stemmed = []
        for sim, score in sims:
            stem_word = stemmer.stem(sim)
            if stem_word not in [x for x,_ in stemmed] and '_' not in stem_word and stem_word != stemmer.stem(term):
                stemmed.append((stem_word, score))
        table[stemmer.stem(term)] = stemmed
    except:        
        table[stemmer.stem(term)] = []
    print(count, end=', ')
    count += 1

Total number of terms = 1657


1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222

{'monkey': [('gorilla', 0.6353992819786072),
  ('ape', 0.6170221567153931),
  ('snake', 0.6056618094444275),
  ('chimp', 0.591574490070343),
  ('rabbit', 0.5765780210494995),
  ('lizard', 0.5755525827407837),
  ('frog', 0.5715548992156982),
  ('baboon', 0.567928671836853)],
 'good': [('great', 0.7291509509086609),
  ('bad', 0.7190051078796387),
  ('terrif', 0.6889115571975708),
  ('decent', 0.6837348341941833),
  ('nice', 0.6836091876029968),
  ('excel', 0.6442928910255432),
  ('fantast', 0.6407778859138489),
  ('better', 0.6120729446411133),
  ('solid', 0.5806034803390503),
  ('lousi', 0.5764203071594238)],
 'blood': [('urin', 0.5653170943260193),
  ('transfus', 0.5178364515304565),
  ('platelet', 0.5156949758529663),
  ('marrow', 0.4987516701221466),
  ('saliva', 0.49781787395477295)],
 'correspond': [('preced', 0.5118293762207031),
  ('approxim', 0.49637675285339355),
  ('compar', 0.4875316619873047),
  ('sameperiod', 0.48180532455444336)],
 'function': [('mechan', 0.436637520790100

In [None]:
import json
file_name = 'substitutions_word2vec.json'
with open(file_name, 'w') as f:
    f.write(json.dumps(table))