In [2]:
import numpy as np

# ===== Load GloVe =====
def load_glove_embeddings(file_path):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components, e.g. ``king 0.1 0.2 0.3``.
    If a token appears more than once, the last occurrence wins.

    Args:
        file_path: Path to a UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each word (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # str.split() with no separator already discards leading and
            # trailing whitespace, so no separate strip() is required.
            values = line.split()
            # Blank or whitespace-only lines carry no token; skip them
            # instead of failing on values[0] with an IndexError.
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.array(components, dtype='float32')
    return embeddings

# Location of the GloVe vector file; point this at your local copy.
glove_path = "glove.6B.100d.txt"  # <-- change path
# Read the whole file into an in-memory dict (word -> vector) once, up front.
embeddings = load_glove_embeddings(glove_path)

# ===== Cosine Similarity =====
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a float, nominally in ``[-1, 1]``.
        If either vector has zero norm the direction is undefined, and
        ``0.0`` is returned rather than NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against division by zero: a zero-length vector would otherwise
    # produce NaN (plus a RuntimeWarning), and NaN scores cannot be ranked
    # reliably by callers that sort on the result.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# ===== Find Most Similar Words =====
def most_similar(query_vector, embeddings, top_n=10, exclude=None):
    """Rank vocabulary words by cosine similarity to a query vector.

    Args:
        query_vector: Vector to compare every embedding against.
        embeddings: Mapping of word -> vector.
        top_n: Maximum number of results to return.
        exclude: Optional iterable of words to leave out of the ranking
            (for example the words the query was built from). ``None``
            means nothing is excluded.

    Returns:
        A list of at most ``top_n`` ``(word, similarity)`` tuples, ordered
        from most to least similar.
    """
    # A None default avoids the shared-mutable-default pitfall, and building
    # a set once makes each membership test O(1) instead of scanning a list
    # for every word in the vocabulary.
    excluded = frozenset(exclude) if exclude is not None else frozenset()

    similarities = {
        word: cosine_similarity(query_vector, vector)
        for word, vector in embeddings.items()
        if word not in excluded
    }
    # Sort by similarity score, best match first.
    sorted_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return sorted_words[:top_n]

# ===== Analogy: Queen - Woman + Man =====
def analogy(word_a, word_b, word_c, embeddings, top_n=10):
    """Find words closest to ``vec(word_a) - vec(word_b) + vec(word_c)``.

    Args:
        word_a: Word whose vector is the starting point.
        word_b: Word whose vector is subtracted.
        word_c: Word whose vector is added.
        embeddings: Mapping of word -> vector.
        top_n: Maximum number of results to return.

    Returns:
        Whatever ``most_similar`` returns for the combined vector, with the
        three input words excluded from the candidates.

    Raises:
        ValueError: If any of the three words is missing from ``embeddings``.
    """
    query_words = [word_a, word_b, word_c]
    if not all(term in embeddings for term in query_words):
        raise ValueError("One of the words is not in the vocabulary.")

    base, minus, plus = (embeddings[term] for term in query_words)

    # The input words are excluded so they cannot appear in their own results.
    return most_similar(
        base - minus + plus,
        embeddings,
        top_n=top_n,
        exclude=query_words,
    )

# ===== Run the Analogy =====
# Evaluate vec("queen") - vec("woman") + vec("man") and fetch the 10 closest words.
results = analogy("queen", "woman", "man", embeddings, top_n=10)
print("Top results for 'Queen - Woman + Man':")
# Each result is a (word, cosine similarity) pair, highest similarity first.
for word, score in results:
    print(f"{word}: {score:.4f}")


Top results for 'Queen - Woman + Man':
king: 0.7941
royal: 0.6839
prince: 0.6763
crown: 0.6331
vi: 0.6275
majesty: 0.6232
princess: 0.6215
lord: 0.6148
palace: 0.6117
great: 0.6030
