In [1]:
# install requirements
%pip install wget
%pip install gensim
%pip install tqdm
%pip install pandas
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnoti

In [2]:
# import required packages
import gensim
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd



In [3]:
# load models word2vec and glove
# download and extract the files mentioned to the root of the directory
# downloaded from https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
word2vec = KeyedVectors.load_word2vec_format("wiki-news-300d-1M.vec", binary=False)
# downloaded from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM
glove = KeyedVectors.load_word2vec_format("glove.6B.100d.txt", no_header=True, binary=False)

In [4]:
# list of all relations we need to test.
relations_to_test = [
    'capital-world', 'currency', 'city-in-state', 'family',
    'gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative', 'gram6-nationality-adjective'
]

# we store the processed analogy data we read from the text file in this.
analogy_data = {}

relation = None
# read analogy data from the file
with open('word-test.v1.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith(':'):        
            relation = line.strip()
            relation = relation[2:].strip()
            if relation not in relations_to_test:
                relation = None
            else:
                analogy_data[relation] = []

        if relation in analogy_data:
            words = line.lower().split()
            if len(words) == 4:
                a, b, c, d_true = words
                analogy_data[relation].append((a, b, c, d_true))


In [5]:
# Analogy Prediction
def predict_analogy(embeddings, a, b, c):
    if a not in embeddings.key_to_index or b not in embeddings.key_to_index or c not in embeddings.key_to_index:
        return None

    diff = embeddings[b] - embeddings[a] + embeddings[c]
    diff_norm = diff / np.linalg.norm(diff) # pre-normalize diff

    # Get all word embeddings and normalize them (optional, but can be beneficial)
    all_embeddings = embeddings.vectors
    all_embeddings_norm = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)

    # Vectorized cosine similarity calculation
    cosine_similarities = cosine_similarity(all_embeddings_norm, diff_norm.reshape(1, -1)).flatten()

    max_similarity_index = -1
    max_cosine_similairty = -1
    predicted_d = None

    exclude_indices = {embeddings.key_to_index[a], embeddings.key_to_index[b], embeddings.key_to_index[c]}

    # loop through the whole vocab to find the max similarity word.
    for index, similarity in enumerate(cosine_similarities):
        if index not in exclude_indices: # Exclude a, b, c
            if max_cosine_similairty < similarity:
                max_cosine_similairty = similarity
                max_similarity_index = index

    if max_similarity_index != -1:
        predicted_d = embeddings.index_to_key[max_similarity_index]

    return predicted_d

# Load Analogy Dataset and Run Tests

def run_analogy_test(embeddings):
    relation_accuracies = {}
    overall_correct_count = 0
    overall_total_count = 0

    # loop throguth analogy_data and run analogy test on all of them
    for relation in analogy_data:
        relation_accuracies[relation] = {'correct': 0, 'total': 0}
        print(f"running analogy test for {relation}")
        # running on only 1000 max as this is a large dataset and 1000 will caputure accuracy efficiently.
        for word_tuple in tqdm(analogy_data[relation][:min(1000, len(analogy_data[relation]))]):
            a, b, c, d_true = word_tuple
            predicted_d = predict_analogy(embeddings, a, b, c)

            if predicted_d == d_true: # Correct prediction
                relation_accuracies[relation]['correct'] += 1
                overall_correct_count += 1
            relation_accuracies[relation]['total'] += 1
            overall_total_count += 1

    print("\n--- Accuracy per Relation ---")
    for relation, counts in relation_accuracies.items():
        accuracy = (counts['correct'] / counts['total']) * 100 if counts['total'] > 0 else 0.0
        print(f"{relation}: Accuracy = {accuracy:.2f}% ({counts['correct']}/{counts['total']})")

    overall_accuracy = (overall_correct_count / overall_total_count) * 100 if overall_total_count > 0 else 0.0
    print(f"\nOverall Accuracy: {overall_accuracy:.2f}% ({overall_correct_count}/{overall_total_count})")
    return relation_accuracies, overall_accuracy

print("Running analogy test with GloVe...")
glove_relation_accuracies, glove_overall_accuracy = run_analogy_test(glove)

print("Running analogy test with Word2Vec...")
word2vec_relation_accuracies, word2vec_overall_accuracy = run_analogy_test(word2vec)

print("\n--- Accuracy Comparison Table ---")
print("{:<30} {:<15} {:<15}".format('Relation', 'GloVe Accuracy', 'Word2Vec Accuracy'))
print("-" * 60)
for relation in relations_to_test:
    glove_acc = (glove_relation_accuracies[relation]['correct'] / glove_relation_accuracies[relation]['total']) * 100 if glove_relation_accuracies[relation]['total'] > 0 else 0.0
    w2v_acc = (word2vec_relation_accuracies[relation]['correct'] / word2vec_relation_accuracies[relation]['total']) * 100 if word2vec_relation_accuracies[relation]['total'] > 0 else 0.0
    print("{:<30} {:<15.2f}% {:<15.2f}%".format(relation, glove_acc, w2v_acc))
print("-" * 60)
print("{:<30} {:<15.2f}% {:<15.2f}%".format('Overall', glove_overall_accuracy, word2vec_overall_accuracy))

Running analogy test with GloVe...
running analogy test for capital-world


100%|██████████| 1000/1000 [02:57<00:00,  5.63it/s]


running analogy test for currency


100%|██████████| 866/866 [02:52<00:00,  5.01it/s]


running analogy test for city-in-state


100%|██████████| 1000/1000 [03:08<00:00,  5.30it/s]


running analogy test for family


100%|██████████| 506/506 [01:37<00:00,  5.18it/s]


running analogy test for gram1-adjective-to-adverb


100%|██████████| 992/992 [03:06<00:00,  5.31it/s]


running analogy test for gram2-opposite


100%|██████████| 812/812 [02:43<00:00,  4.97it/s]


running analogy test for gram3-comparative


100%|██████████| 1000/1000 [03:24<00:00,  4.88it/s]


running analogy test for gram6-nationality-adjective


100%|██████████| 1000/1000 [03:19<00:00,  5.01it/s]



--- Accuracy per Relation ---
capital-world: Accuracy = 86.50% (865/1000)
currency: Accuracy = 14.55% (126/866)
city-in-state: Accuracy = 28.00% (280/1000)
family: Accuracy = 80.04% (405/506)
gram1-adjective-to-adverb: Accuracy = 23.59% (234/992)
gram2-opposite: Accuracy = 20.20% (164/812)
gram3-comparative: Accuracy = 78.70% (787/1000)
gram6-nationality-adjective: Accuracy = 87.40% (874/1000)

Overall Accuracy: 52.05% (3735/7176)
Running analogy test with Word2Vec...
running analogy test for capital-world


100%|██████████| 1000/1000 [05:52<00:00,  2.84it/s]


running analogy test for currency


100%|██████████| 866/866 [11:52<00:00,  1.21it/s]


running analogy test for city-in-state


100%|██████████| 1000/1000 [12:07<00:00,  1.37it/s]


running analogy test for family


100%|██████████| 506/506 [06:57<00:00,  1.21it/s]


running analogy test for gram1-adjective-to-adverb


100%|██████████| 992/992 [13:37<00:00,  1.21it/s]


running analogy test for gram2-opposite


100%|██████████| 812/812 [11:09<00:00,  1.21it/s]


running analogy test for gram3-comparative


100%|██████████| 1000/1000 [14:01<00:00,  1.19it/s]


running analogy test for gram6-nationality-adjective


100%|██████████| 1000/1000 [15:40<00:00,  1.06it/s] 


--- Accuracy per Relation ---
capital-world: Accuracy = 18.20% (182/1000)
currency: Accuracy = 36.72% (318/866)
city-in-state: Accuracy = 23.60% (236/1000)
family: Accuracy = 91.11% (461/506)
gram1-adjective-to-adverb: Accuracy = 41.03% (407/992)
gram2-opposite: Accuracy = 56.40% (458/812)
gram3-comparative: Accuracy = 91.60% (916/1000)
gram6-nationality-adjective: Accuracy = 79.80% (798/1000)

Overall Accuracy: 52.62% (3776/7176)

--- Accuracy Comparison Table ---
Relation                       GloVe Accuracy  Word2Vec Accuracy
------------------------------------------------------------
capital-world                  86.50          % 18.20          %
currency                       14.55          % 36.72          %
city-in-state                  28.00          % 23.60          %
family                         80.04          % 91.11          %
gram1-adjective-to-adverb      23.59          % 41.03          %
gram2-opposite                 20.20          % 56.40          %
gram3-compara




In [6]:
from scipy.spatial.distance import cosine

# function to get top 10 similar words to the given word
def get_top_similar_words(embeddings, target_word, top_n=10):
    if target_word not in embeddings:
        return []

    target_vector = embeddings[target_word]  # Get the actual word vector
    similarities = []
    
    for word in embeddings.key_to_index:  # Iterate over vocabulary
        if word != target_word:
            similarity = 1 - cosine(target_vector, embeddings[word])
            similarities.append((word, similarity))

    similarities.sort(key=lambda item: item[1], reverse=True)  # Sort by similarity descending
    return [word for word, similarity in similarities[:top_n]]

# check top simial for all of these words
verbs_to_test = ["increase", "enter", "start", "build"]
for verb in verbs_to_test:
    top_similar = get_top_similar_words(glove, verb)
    print(f"\nTop 10 words similar to '{verb}' (glove): {', '.join(top_similar)}")
    top_similar = get_top_similar_words(word2vec, verb)
    print(f"\nTop 10 words similar to '{verb}' (word2vec): {', '.join(top_similar)}")


Top 10 words similar to 'increase' (glove): increased, increases, decrease, increasing, reduce, rise, reduced, growth, reduction, boost

Top 10 words similar to 'increase' (word2vec): decrease, reduce, increases, increased, increasing, diminish, enhance, lessen, decreasing, decreases

Top 10 words similar to 'enter' (glove): entering, leave, join, go, able, return, entered, allow, take, participate

Top 10 words similar to 'enter' (word2vec): re-enter, reenter, entering, entered, join, enters, leave, obtain, reach, submit

Top 10 words similar to 'start' (glove): starting, coming, next, begin, going, end, started, before, time, beginning

Top 10 words similar to 'start' (word2vec): begin, starting, beginning, finish, end, starts, go, stop, commence, started

Top 10 words similar to 'build' (glove): develop, construct, expand, create, establish, building, maintain, provide, built, plans

Top 10 words similar to 'build' (word2vec): develop, construct, create, rebuild, maintain, establis

In [7]:
# create and test 2 new analogies
def test_on_new_analogies(embeddings, new_analogy_questions):
    correct_count = 0
    total_count = 0
    for question_type, questions in new_analogy_questions.items():
        print(f"\n--- Testing on New Analogy Type: {question_type} ---")
        for a, b, c, d_true in questions:
            predicted_d = predict_analogy(embeddings, a, b, c)
            print(f"Question: {a} is to {b} as {c} is to ?  True answer: {d_true}, Predicted: {predicted_d}")
            if predicted_d == d_true:
                correct_count += 1
            total_count += 1

    accuracy = (correct_count / total_count) * 100 if total_count > 0 else 0.0
    print(f"\nAccuracy on New Analogies: {accuracy:.2f}% ({correct_count}/{total_count})")
    return accuracy

new_analogy_questions = {
    # analogy to test what happens as an outcome of something
    "reason-effect": [
        ("rain", "flood", "fire", "burn"),
        ("study", "success", "exercise", "fitness"),
        ("virus", "illness", "bacteria", "infection"),
    ],
    # analogy to test animal to baby name mapping
    "animal-baby": [
        ("lion", "cub", "dog", "puppy"),
        ("cat", "kitten", "elephant", "calf"),
        ("walrus", "calf", "horse", "foal"),
    ]
}

print("\n--- Testing GloVe on New Analogies ---")
glove_new_analogy_accuracy = test_on_new_analogies(glove, new_analogy_questions)
print("\n--- Testing Word2Vec on New Analogies ---")
word2vec_new_analogy_accuracy = test_on_new_analogies(word2vec, new_analogy_questions)


--- Testing GloVe on New Analogies ---

--- Testing on New Analogy Type: reason-effect ---
Question: rain is to flood as fire is to ?  True answer: burn, Predicted: building
Question: study is to success as exercise is to ?  True answer: fitness, Predicted: strength
Question: virus is to illness as bacteria is to ?  True answer: infection, Predicted: illnesses

--- Testing on New Analogy Type: animal-baby ---
Question: lion is to cub as dog is to ?  True answer: puppy, Predicted: puppy
Question: cat is to kitten as elephant is to ?  True answer: calf, Predicted: rhinoceros
Question: walrus is to calf as horse is to ?  True answer: foal, Predicted: foot

Accuracy on New Analogies: 16.67% (1/6)

--- Testing Word2Vec on New Analogies ---

--- Testing on New Analogy Type: reason-effect ---
Question: rain is to flood as fire is to ?  True answer: burn, Predicted: fires
Question: study is to success as exercise is to ?  True answer: fitness, Predicted: exercising
Question: virus is to illne