In [1]:
# import relevant packages
import numpy as np
import nltk
from nltk.corpus import brown, wordnet as wn, stopwords
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.linalg import svds
from collections import defaultdict
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
import numpy as np
import string

We are using the brown dataset (which contains around 1M tokens) and the wordnet knowledge base to create embeddings

In [2]:
# download datasets

nltk.download("brown")
nltk.download("wordnet")

[nltk_data] Downloading package brown to
[nltk_data]     /Users/encrypted_soul/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/encrypted_soul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Loading human annotated datasets which will be later used for evaluation

In [3]:
simlex_path = 'human_annoted_scores/SimLex-999/SimLex-999.txt'
simlex_df = pd.read_csv(simlex_path, sep='\t')

In [4]:
file_path = 'human_annoted_scores/wordsim353crowd.csv'
wordsim_df = pd.read_csv(file_path)

In [5]:
simlex_df.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


In [6]:
wordsim_df.head()

Unnamed: 0,Word 1,Word 2,Human (Mean)
0,admission,ticket,5.536
1,alcohol,chemistry,4.125
2,aluminum,metal,6.625
3,announcement,effort,2.0625
4,announcement,news,7.1875


Utility functions used for enriching with wordnet data

In [7]:
def synset_depth(word):
    depths = [synset.min_depth() for synset in wn.synsets(word)]
    return np.mean(depths) if depths else 0

def hypernym_count(word):
    return sum(len(synset.hypernyms()) for synset in wn.synsets(word))

def hyponym_count(word):
    return sum(len(synset.hyponyms()) for synset in wn.synsets(word))

def meronym_holonym_count(word):
    meronyms = sum(len(synset.part_meronyms()) + len(synset.substance_meronyms()) for synset in wn.synsets(word))
    holonyms = sum(len(synset.part_holonyms()) + len(synset.substance_holonyms()) for synset in wn.synsets(word))
    return meronyms, holonyms

def has_antonym(word):
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            if lemma.antonyms():
                return 1
    return 0

Function to enrich embeddings with wordnet features

In [8]:
def enrich_wordnet_features(word_vecs):
    enriched_vecs = defaultdict(list)
    for word, vec in word_vecs.items():
        features = [
            len(wn.synsets(word)),
            synset_depth(word),
            hypernym_count(word),
            hyponym_count(word),
            *meronym_holonym_count(word),
            has_antonym(word)
        ]
        enriched_vecs[word] = np.concatenate([vec, features])
    return enriched_vecs


In [9]:
def get_cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

In [10]:
# Load the Brown Corpus and count the tokens to verify that the corpus has around 1M tokens
brown_tokens = brown.words()
num_tokens = len(brown_tokens)

print(f"The Brown Corpus contains {num_tokens} tokens.")

The Brown Corpus contains 1161192 tokens.


In [11]:
# Checking the structure of sentences
brown_corpus = brown.sents()
brown_corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [12]:
sentences = [" ".join(sent) for sent in brown_corpus]
vectorizer = CountVectorizer(analyzer="word", token_pattern=r"\b\w+\b", min_df=1)
X = vectorizer.fit_transform(sentences)
sentences[100]

'Daniel personally led the fight for the measure , which he had watered down considerably since its rejection by two previous Legislatures , in a public hearing before the House Committee on Revenue and Taxation .'

In [13]:
dense_X = X.todense()

print("Shape of the co-occurrence matrix (A, B) where A is the number of unique sentences and B is the number of unique tokens:", dense_X.shape)

Shape of the co-occurrence matrix (A, B) where A is the number of unique sentences and B is the number of unique tokens: (57340, 42432)


In [14]:
X_float = csr_matrix(X, dtype=np.float64)
# reducing the dimension of the matrix
u, s, vt = svds(X_float, k=100)

In [15]:
words = vectorizer.get_feature_names_out()

In [18]:
# Map words to their vector representations
word_vecs = {word: vt[:, i] for i, word in enumerate(vectorizer.get_feature_names_out())}

In [19]:
enriched_word_vecs = enrich_wordnet_features(word_vecs)

# Example: Access enriched vector for 'dog'
print(enriched_word_vecs['dog'])
print(len(enriched_word_vecs['dog']))

[-7.28757506e-04 -3.54495918e-03 -3.84851971e-04  4.33727754e-03
 -1.24188563e-03 -1.92492773e-03  1.59946444e-03  1.87134374e-03
 -2.51345921e-04 -8.14635967e-04  8.63220753e-04 -1.67919458e-04
  1.35632261e-03  4.30029795e-05 -1.61946393e-03 -2.20584179e-03
 -1.26405196e-03 -5.56515921e-04  2.67273927e-03  9.67846535e-04
 -9.14451485e-04  1.40411393e-03  1.05480906e-03  1.06223881e-03
 -4.38454834e-04  2.59497214e-04 -1.17849114e-03 -5.15815387e-04
  1.38160853e-03  9.01050822e-04  2.31521988e-04  7.25737582e-04
 -4.92206423e-04  1.06031907e-04 -1.82074803e-03  1.51335466e-03
  4.64471263e-04 -3.55045689e-04  7.91528813e-04 -5.52454731e-04
 -1.37833998e-03 -2.05704898e-03  5.74992479e-04 -3.74322914e-04
 -3.57905685e-04 -9.07370164e-05 -5.88274640e-04  3.05916056e-04
  8.99795412e-04 -2.44268757e-04 -7.85604194e-04  1.55800712e-03
  3.28604257e-04  1.07089846e-05  7.46523256e-04 -1.60337490e-03
  3.81408137e-04 -2.23378820e-04 -1.78856156e-04  5.76030321e-04
  3.28439876e-04 -1.59798

In [20]:
computed_similarities = []
for _, row in simlex_df.iterrows():
    word1, word2 = row['word1'], row['word2']
    if word1 in enriched_word_vecs and word2 in enriched_word_vecs:
        vec1 = enriched_word_vecs[word1]
        vec2 = enriched_word_vecs[word2]
        similarity = get_cosine_similarity(vec1, vec2)
        computed_similarities.append(similarity)
    else:
        computed_similarities.append(np.nan)  # If one of the words is not in the embeddings, treat as missing

In [21]:
simlex_df['computed_similarity'] = computed_similarities

In [22]:
simlex_df.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),computed_similarity
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41,0.988567
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67,0.730278
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19,0.913758
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18,0.976183
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93,0.999776


In [23]:
filtered_simlex_df = simlex_df.dropna(subset=['computed_similarity'])
spearman_corr, _ = spearmanr(filtered_simlex_df['SimLex999'], filtered_simlex_df['computed_similarity'])
print(f"Spearman Correlation between computed similarities and human judgments: {spearman_corr}")

Spearman Correlation between computed similarities and human judgments: 0.060776462864572396


Seems like we got an extremely bad score with the embeddings we just created with some simple feature engineering of wordnet knowledge base. Lets build another embedding using a simple correlation matrix 

In [24]:
corpus = brown.words()
corpus = [word.lower() for word in corpus]
stop_words = set(stopwords.words('english'))
corpus = [word for word in corpus if word not in stop_words and word not in string.punctuation]
vocabulary = sorted(set(corpus))
word2idx = {word: idx for idx, word in enumerate(vocabulary)}

# Create a co-occurence matrix
window_size = 3
co_occurrence_matrix = np.zeros((len(vocabulary), len(vocabulary)))

for i in range(len(corpus) - window_size + 1):
    window = corpus[i:i+window_size]
    for j in range(window_size):
        for k in range(window_size):
            if j != k:
                word1 = window[j]
                word2 = window[k]
                idx1 = word2idx[word1]
                idx2 = word2idx[word2]
                co_occurrence_matrix[idx1, idx2] += 1

# Calculate similarity scores for word pairs in SimLex-999
computed_similarities = []
for _, row in simlex_df.iterrows():
    word1, word2 = row['word1'], row['word2']
    if word1 in word2idx and word2 in word2idx:
        idx1 = word2idx[word1]
        idx2 = word2idx[word2]
        vec1 = co_occurrence_matrix[idx1]
        vec2 = co_occurrence_matrix[idx2]
        similarity = get_cosine_similarity(vec1, vec2)
        computed_similarities.append(similarity)
    else:
        computed_similarities.append(np.nan)

simlex_df['computed_similarity'] = computed_similarities
filtered_simlex_df = simlex_df.dropna(subset=['computed_similarity'])
spearman_corr, _ = spearmanr(filtered_simlex_df['SimLex999'], filtered_simlex_df['computed_similarity'])
print(f"Spearman's rank correlation coefficient: {spearman_corr}")

Spearman's rank correlation coefficient: -0.06221871077974628


In [31]:
class Word2Vec:
    def __init__(self, sentences, vector_size, window_size, learning_rate, epochs):
        self.sentences = sentences
        self.vector_size = vector_size
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.word_counts = defaultdict(int)
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_vectors = None

    def preprocess(self):
        vocab = set()
        for sentence in self.sentences:
            for word in sentence:
                self.word_counts[word] += 1
                vocab.add(word)

        self.word_to_index = {word: index for index, word in enumerate(vocab)}
        self.index_to_word = {index: word for word, index in self.word_to_index.items()}
        self.vocab_size = len(vocab)
        self.word_vectors = np.random.uniform(-0.5, 0.5, (self.vocab_size, self.vector_size))

    def train(self):
        for epoch in range(self.epochs):
            for sentence in self.sentences:
                for center_word_pos, center_word in enumerate(sentence):
                    center_word_index = self.word_to_index[center_word]
                    context_words = self.get_context_words(sentence, center_word_pos)

                    for context_word in context_words:
                        context_word_index = self.word_to_index[context_word]
                        self.update_weights(center_word_index, context_word_index)

    def train_with_wordnet(self):
        for epoch in range(self.epochs):
            for sentence in self.sentences:
                for center_word_pos, center_word in enumerate(sentence):
                    center_word_index = self.word_to_index[center_word]
                    context_words = self.get_context_words(sentence, center_word_pos)

                    for context_word in context_words:
                        context_word_index = self.word_to_index[context_word]
                        self.update_weights(center_word_index, context_word_index)
                    synsets = wn.synsets(center_word)
                    for synset in synsets:
                        for lemma in synset.lemmas():
                            if lemma.name() in self.word_to_index:
                                related_word_index = self.word_to_index[lemma.name()]
                                self.update_weights(center_word_index, related_word_index)
                                self.update_weights(related_word_index, center_word_index)


    def get_context_words(self, sentence, center_word_pos):
        start = max(0, center_word_pos - self.window_size)
        end = min(len(sentence), center_word_pos + self.window_size + 1)
        return [word for word in sentence[start:end] if word != sentence[center_word_pos]]

    def update_weights(self, center_word_index, context_word_index):
        center_vector = self.word_vectors[center_word_index]
        context_vector = self.word_vectors[context_word_index]

        error = 1 - np.dot(center_vector, context_vector)
        gradient = -error * context_vector
        context_gradient = -error * center_vector

        self.word_vectors[center_word_index] -= self.learning_rate * gradient
        self.word_vectors[context_word_index] -= self.learning_rate * context_gradient

    def most_similar(self, word, top_n=5):
        word_index = self.word_to_index[word]
        word_vector = self.word_vectors[word_index]

        similarities = np.dot(self.word_vectors, word_vector)
        top_indices = np.argsort(similarities)[-top_n-1:-1][::-1]

        similar_words = [(self.index_to_word[index], similarities[index]) for index in top_indices]
        return similar_words

In [45]:
sentences = [list(sent) for sent in brown.sents()]
vector_size = 100
window_size = 10
learning_rate = 0.01
epochs = 10

model = Word2Vec(sentences, vector_size, window_size, learning_rate, epochs)
model.preprocess()
model.train_with_wordnet()

simlex_path = 'human_annoted_scores/SimLex-999/SimLex-999.txt'
simlex_df = pd.read_csv(simlex_path, sep='\t')

computed_similarities = []
for _, row in simlex_df.iterrows():
    word1, word2 = row['word1'], row['word2']
    if word1 in model.word_to_index and word2 in model.word_to_index:
        idx1 = model.word_to_index[word1]
        idx2 = model.word_to_index[word2]
        vec1 = model.word_vectors[idx1]
        vec2 = model.word_vectors[idx2]
        similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        computed_similarities.append(similarity)
    else:
        computed_similarities.append(np.nan)

simlex_df['computed_similarity'] = computed_similarities

filtered_simlex_data = simlex_df.dropna(subset=['computed_similarity'])
spearman_corr, _ = spearmanr(filtered_simlex_data['SimLex999'], filtered_simlex_data['computed_similarity'])
print(f"Spearman's rank correlation coefficient: {spearman_corr:.4f}")

Spearman's rank correlation coefficient: -0.0869


##### Word Similarity Scores: Unconstrained

We will be using the spearman's coefficient to evaluate the model with Wordsim-354 and SimLex-999 human annotated datasets. We will be using the glove model with 6B tokens and 50, 100, 200 and 300 vector dimension sizes for numeric representation of words

In [22]:
def load_glove_embeddings(path):
    """
    Load GloVe embeddings from a file into a dictionary.

    This function reads a GloVe embeddings file where each line corresponds to a word
    and its vector representation. The function parses each line to extract the word and its
    corresponding vector, and then stores these in a dictionary.

    Parameters:
    - path (str): The file path to the GloVe embeddings file. The file is expected to be in the
                  format where each line contains a word followed by its embedding vector,
                  with spaces as separators.

    Returns:
    - dict: A dictionary where keys are words (str) and values are their corresponding vector
            embeddings (numpy arrays of dtype 'float32').
    """
    with open(path, 'r', encoding='utf-8') as f:
        embeddings = {}
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

In [23]:
def calculate_spearman_correlation_with_simlex999(simlex_df, glove_embeddings):
    """
    Calculate Spearman correlation between human-labeled similarity scores and computed cosine similarity scores.

    Parameters:
    - simlex_df: DataFrame with columns 'word1', 'word2', and 'SimLex999' containing similarity scores.
    - glove_embeddings: Dictionary mapping words to their vector embeddings.

    Returns:
    - spearman_corr: Spearman correlation coefficient.
    """
    computed_similarities = []
    for _, row in simlex_df.iterrows():
        word1, word2 = row['word1'], row['word2'].lower()
        if word1 in glove_embeddings and word2 in glove_embeddings:
            vec1 = glove_embeddings[word1]
            vec2 = glove_embeddings[word2]
            similarity = get_cosine_similarity(vec1, vec2)  # Using the same cosine_similarity function as before
            computed_similarities.append(similarity)
        else:
            computed_similarities.append(np.nan)

    simlex_df['computed_similarity'] = computed_similarities
    filtered_simlex_df = simlex_df.dropna(subset=['computed_similarity'])
    spearman_corr, _ = spearmanr(filtered_simlex_df['SimLex999'], filtered_simlex_df['computed_similarity'])
    return spearman_corr

In [24]:
def calculate_spearman_correlation_with_wordsim353(wordsim_df, glove_embeddings):
    """
    Calculate Spearman correlation between human-labeled similarity scores and computed cosine similarity scores.

    Parameters:
    - wordsim_df: DataFrame with columns 'Word 1', 'Word 2', and 'Human (Mean)' containing similarity scores.
    - glove_embeddings: Dictionary mapping words to their vector embeddings.

    Returns:
    - spearman_corr: Spearman correlation coefficient.
    """
    computed_similarities = []
    for _, row in wordsim_df.iterrows():
        word1, word2 = row['Word 1'].lower(), row['Word 2'].lower()
        if word1 in glove_embeddings and word2 in glove_embeddings:
            vec1 = glove_embeddings[word1]
            vec2 = glove_embeddings[word2]
            similarity = get_cosine_similarity(vec1, vec2)
            computed_similarities.append(similarity)
        else:
            computed_similarities.append(np.nan)

    wordsim_df['computed_similarity'] = computed_similarities
    filtered_wordsim_df = wordsim_df.dropna(subset=['computed_similarity'])
    spearman_corr, _ = spearmanr(filtered_wordsim_df['Human (Mean)'], filtered_wordsim_df['computed_similarity'])
    return spearman_corr

In [25]:
glove_paths = [
    'glove.6B/glove.6B.50d.txt',
    'glove.6B/glove.6B.100d.txt',
    'glove.6B/glove.6B.200d.txt',
    'glove.6B/glove.6B.300d.txt'
]

for path in glove_paths:
    glove_embeddings = load_glove_embeddings(path=path)
    print(f'The Spearman Correlation Value with SimLex-999 and {path}: {calculate_spearman_correlation_with_simlex999(simlex_df, glove_embeddings)}')

The Spearman Correlation Value with SimLex-999 and glove.6B/glove.6B.50d.txt: 0.2645792192990813
The Spearman Correlation Value with SimLex-999 and glove.6B/glove.6B.100d.txt: 0.29755501657418554
The Spearman Correlation Value with SimLex-999 and glove.6B/glove.6B.200d.txt: 0.3402590362173856
The Spearman Correlation Value with SimLex-999 and glove.6B/glove.6B.300d.txt: 0.37050035710869067


In [26]:
glove_paths = [
    'glove.6B/glove.6B.50d.txt',
    'glove.6B/glove.6B.100d.txt',
    'glove.6B/glove.6B.200d.txt',
    'glove.6B/glove.6B.300d.txt'
]

for path in glove_paths:
    glove_embeddings = load_glove_embeddings(path=path)
    print(f'The Spearman Correlation Value with Wordsim-353 and {path}: {calculate_spearman_correlation_with_wordsim353(wordsim_df, glove_embeddings)}')

The Spearman Correlation Value with Wordsim-353 and glove.6B/glove.6B.50d.txt: 0.44779770696777327
The Spearman Correlation Value with Wordsim-353 and glove.6B/glove.6B.100d.txt: 0.4783764811477666
The Spearman Correlation Value with Wordsim-353 and glove.6B/glove.6B.200d.txt: 0.5160682763664167
The Spearman Correlation Value with Wordsim-353 and glove.6B/glove.6B.300d.txt: 0.5433294686723303
