In [49]:
import pandas as pd
import json
import gensim.downloader
import os
import random
# Seed Python's global RNG; the dataset-building cell below draws from it via
# random.sample and random.shuffle.
random.seed(42)

In [2]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# `vectors` is used below for vocabulary checks (has_index_for) and for the
# cosine similarities (similarity) written out in the final cell.
vectors = gensim.downloader.load('word2vec-google-news-300')

In [17]:
# Read the 20 Questions dev split (JSON Lines) and collect its distinct subjects,
# printing the row count and then the number of distinct subjects.
dev_20q = pd.read_json('../data/twentyquestions/twentyquestions-dev.jsonl', lines=True)
print(dev_20q.shape[0])
subjects = dev_20q["subject"]
words_20q = subjects.unique().tolist()
print(len(words_20q))

15403
4655


In [31]:
# Keep only the subjects that have an entry in the pretrained word2vec vocabulary
words_20q = list(filter(vectors.has_index_for, words_20q))
len(words_20q)

4514

In [32]:
# Every remaining subject must be exactly one whitespace-delimited token
assert not any(len(w.split()) != 1 for w in words_20q)

In [33]:
# Target words: each one is added to every sampled dataset and gets its own
# similarity-ranked CSV in the final cell.
test_words = ['computer', 'papa', 'trees', 'child', 'sax', 'crane', 'meatloaf', 'birthstone', 'polyethylene', 'cement']

In [34]:
# All hand-picked test words must be present in the filtered subject list
assert set(test_words).issubset(words_20q)

In [50]:
# Build one word list per target size. Each list holds all of `test_words` plus a
# random draw from the remaining vocabulary, shuffled together.
#
# The sampling pool is an order-preserving list rather than
# `list(set(words_20q) - set(test_words))`: the iteration order of a set of strings
# depends on per-process string-hash randomization, so sampling from it produced
# different datasets on every kernel restart even though the RNG is seeded.
# dict.fromkeys de-duplicates (as the set did) while keeping first-seen order.
dataset_sizes = [2000, 3000, 4000]
held_out = set(test_words)
candidate_pool = [w for w in dict.fromkeys(words_20q) if w not in held_out]
datatsets = {}  # (sic) spelling kept: the export cell below reads `datatsets`
for sz in dataset_sizes:
    # Reserve room for the test words, which are always included.
    words_sz = random.sample(candidate_pool, sz - len(test_words))
    words_sz += test_words
    # Shuffle so the test words are not clustered at the end of the list.
    random.shuffle(words_sz)
    assert len(words_sz) == sz
    datatsets[sz] = words_sz

In [56]:
# For every dataset size and every test word, rank the dataset's words by their
# word2vec similarity to the test word and write the ranking to
# ../data/twentyquestions/datasets/word2vec-<size>/<test word>.csv
for sz, dataset in datatsets.items():
    out_dir = f'../data/twentyquestions/datasets/word2vec-{sz}'
    os.makedirs(out_dir, exist_ok=True)
    for test in test_words:
        # Compute cosine sims
        similarities = [vectors.similarity(test, w) for w in dataset]
        # 0-1 normalization. The extremes are computed once up front; evaluating
        # min()/max() inside the comprehension rescans the whole list for every
        # element, which is quadratic in the dataset size.
        lowest = min(similarities)
        span = max(similarities) - lowest
        similarities = [(sim - lowest) / span for sim in similarities]
        # Sort in descending order of similarity (stable: ties keep dataset order)
        tups = sorted(zip(dataset, similarities), key=lambda pair: pair[1], reverse=True)
        word_dataset = [{'Words': word, 'Similarity': score} for word, score in tups]
        # Save dataset as a csv
        pd.DataFrame(word_dataset).to_csv(f'{out_dir}/{test}.csv', index=False)