In [17]:
import numpy as np
import random
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from nltk import FreqDist, download
from nltk.corpus import brown, stopwords
import nltk
import pandas

In [14]:
# Fetch the NLTK resources used below: the Brown corpus (word frequencies)
# and the stopword lists. The downloader skips packages that are already current.
nltk.download('brown')
nltk.download('stopwords')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/francisfurnelli/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/francisfurnelli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Generate word list of most common English words
Load embedding model (Word2Vec)

In [126]:
# Count every lowercase token in the Brown corpus.
frequency_list = FreqDist(filter(str.islower, brown.words()))

# Rank once by frequency; the shorter lists are prefixes of the longest ranking.
top_20000 = [entry[0] for entry in frequency_list.most_common(20000)]
top_10000 = top_20000[:10000]
top_5000 = top_20000[:5000]

# Load the pretrained Word2Vec vectors and keep only the frequent words
# that the embedding model actually has a vector for.
embedding_model = api.load('word2vec-google-news-300')
word_list = [candidate for candidate in top_20000 if candidate in embedding_model]

In [18]:
# Import the corpus reader under a private alias inside this cell. Binding the
# resulting set to the name `stopwords` would otherwise overwrite the imported
# reader, and re-running the cell would fail with AttributeError ('set' has no
# attribute 'words'). With the aliased import the cell can be re-run safely,
# and later cells still see `stopwords` as the set of English stopwords.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = set(_stopwords_corpus.words("english"))
print(len(stopwords))

198


Create list of Semantle target words (from archive games)

In [32]:
# Read the archive of past Semantle games and pull out the 'answer' column
# as a plain Python list of target words.
df = pandas.read_csv("semantle.csv")
target_words = df["answer"].tolist()
print(target_words)

['forever', 'executive', 'elevator', 'patience', 'overnight', 'belt', 'eager', 'boil', 'favorite', 'candle', 'crush', 'vital', 'patent', 'display', 'most', 'tail', 'latter', 'surgeon', 'monthly', 'consume', 'plea', 'miracle', 'level', 'seek', 'world', 'costly', 'historian', 'appreciation', 'ideological', 'piano', 'spy', 'suitable', 'liver', 'bold', 'value', 'bat', 'selection', 'field', 'surrounding', 'generous', 'speaker', 'manager', 'pork', 'greet', 'track', 'circle', 'area', 'objective', 'assemble', 'pot', 'resist', 'curriculum', 'craft', 'shot', 'rocket', 'cooperate', 'nod', 'ugly', 'newspaper', 'soar', 'principle', 'democratic', 'present', 'vendor', 'direct', 'cooperation', 'athletic', 'absolute', 'surprise', 'asset', 'button', 'bar', 'prevail', 'apple', 'title', 'gathering', 'recruit', 'grateful', 'philosophical', 'horse', 'mother', 'note', 'contribution', 'lend', 'roll', 'gather', 'cliff', 'spill', 'alcohol', 'equality', 'headphones', 'offensive', 'together', 'poem', 'booth', 'gr

Compute average similarity of each word in words, relative to target words
i.e. how similar on average is a given word to the target words?

In [30]:
def compute_avg_sim(words, targets=None, model=None):
    """Compute each word's average cosine similarity to a set of target words.

    The mean of the cosine similarities between a word and every target equals
    the dot product of the word's unit vector with the mean of the targets'
    unit vectors, so the whole computation is done with two array operations
    instead of one model call per (word, target) pair.

    Args:
        words: Iterable of words to score. Each word must be a key the model
            can look up.
        targets: Iterable of words to compare against. Defaults to the
            notebook-level ``target_words``.
        model: Embedding lookup where ``model[list_of_words]`` yields one
            vector per word (as ``embedding_model`` does). Defaults to the
            notebook-level ``embedding_model``.

    Returns:
        dict mapping each word to its mean cosine similarity (a Python float)
        over all targets, in the order the words were given.

    Raises:
        ValueError: If there are words to score but no target words.
    """
    words = list(words)
    if not words:
        return {}

    # Fall back to the notebook globals only when they are actually needed.
    targets = list(target_words if targets is None else targets)
    model = embedding_model if model is None else model
    if not targets:
        raise ValueError("compute_avg_sim needs at least one target word")

    def _unit_rows(keys):
        """Return the vectors for `keys` as float64 rows scaled to unit length."""
        vectors = np.asarray(model[keys], dtype=np.float64)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Leave all-zero vectors untouched instead of dividing by zero.
        norms[norms == 0.0] = 1.0
        return vectors / norms

    mean_target_direction = _unit_rows(targets).mean(axis=0)
    scores = _unit_rows(words) @ mean_target_direction
    return dict(zip(words, scores.tolist()))

Run experiment to find 50 non-stopword words that are most similar to target words on average
Note: takes about 6 minutes to run on ~20,000 words and ~1,000 target words

In [88]:
def starting_words_experiment():
    """Print the 50 candidate words with the highest average similarity to the target words.

    Candidates are the entries of ``word_list`` that are not stopwords and
    contain no apostrophe. The result is printed as a list of
    ``(word, average_similarity)`` pairs, best first; nothing is returned.
    """
    candidates = []
    for word in word_list:
        if word in stopwords or "'" in word:
            continue
        candidates.append(word)

    scores = compute_avg_sim(candidates)
    # Sort by the similarity value, highest first, then keep the leading 50.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    top_50 = ranked[:50]
    print(top_50)

In [None]:
starting_words_experiment()

[('anyway', 0.14740422730095282), ('kind', 0.1458793758603586), ('even', 0.14556949784057424), ('something', 0.14321604931706483), ('sort', 0.14140738599105399), ('actually', 0.1411353638493512), ('really', 0.14093228634735402), ('think', 0.1389849579056979), ('simply', 0.13816469241727197), ('know', 0.13698726146689402), ('certainly', 0.1364448555985316), ('probably', 0.13564050060978053), ('guess', 0.1350873400889817), ('always', 0.1339445571073743), ('whatever', 0.13251194475021166), ('obviously', 0.1313012669583895), ('want', 0.13081291222376357), ('suppose', 0.1306406047859043), ('however', 0.13059699088658525), ('subjective', 0.1301808673811567), ('maybe', 0.13008795968003992), ('never', 0.1290132349607262), ('little', 0.12585410348370438), ('somebody', 0.12526854654524666), ('pretend', 0.1248132956947486), ('get', 0.1244708638114701), ('anything', 0.12443192541580005), ('definitely', 0.12424929543846809), ('like', 0.12408517486341222), ('indeed', 0.12398073668501322), ('though',

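Experiment 2: K-means++ seeding and Nearest Neighbors
K-means++ seeding works better here because it picks centers that are spread more evenly across the data.
The idea is to find word clusters that are sufficiently diverse: each seed center is expanded into a cluster by taking its nearest neighbors.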

In [131]:
from sklearn.cluster import kmeans_plusplus
import numpy as np
from sklearn.neighbors import NearestNeighbors


In [None]:
def cluster(n_clusters=16, n_neighbors=50, random_state=0):
    """Print groups of nearby words around k-means++ seed points.

    Candidate words are the entries of ``word_list`` that are not stopwords
    and contain no apostrophe. Seed points are chosen among their embeddings
    with ``kmeans_plusplus``; for each seed, the nearest candidate words are
    printed one per line, with a blank line after each group.

    Args:
        n_clusters: Number of seed points (word groups) to produce.
        n_neighbors: Number of words printed per group; capped at the number
            of candidate words.
        random_state: Seed passed to ``kmeans_plusplus`` so runs are repeatable.
    """
    candidates = [word for word in word_list if word not in stopwords and "'" not in word]
    embeddings = embedding_model[candidates]

    # Only the seeding step is run (no k-means iterations); the second return
    # value (positions of the chosen seeds) is not needed here.
    centers, _ = kmeans_plusplus(embeddings, n_clusters=n_clusters, random_state=random_state)

    knn = NearestNeighbors(n_neighbors=min(n_neighbors, len(candidates)), algorithm='auto')
    knn.fit(embeddings)

    # Query all centers in one call; each row holds the neighbor positions for one center.
    neighbor_ids = knn.kneighbors(centers, return_distance=False)
    for row in neighbor_ids:
        for word_id in row:
            print(candidates[word_id])
        print()

cluster()


captive
rhinotracheitis
unwarrantable
cu.
freed

clothed
dressed
organdy
naked
chinning

flock
flocked
rhinotracheitis
compulsives
ordo

essays
essay
poems
writings
poetry

statement
letter
report
saying
said

alloy
alloys
aluminum
steel
metallic

stretched
stretching
extended
squeezed
stretches

sing
singing
sang
sings
sung

hopelessly
horribly
woefully
utterly
perpetually

meet
met
meets
meeting
fulfill

weird
strange
crazy
odd
scary

learning
teaching
learn
learned
teach

tube
tubes
tappet
oxygens
floc

periodic
periodically
frequent
subsequent
continual

condemn
denounce
condemning
condemned
criticize

coordinated
coordinate
coordinating
organized
concerted

