Importing our libraries

In [None]:
import numpy as np

# --- load data ---

In [None]:
# Number of float values every embedding row must carry to be accepted.
EMB_DIM = 50

# Token -> embedding vector (1-D float array of length EMB_DIM).
EMB = {}
with open('glove.6B.50d.txt', "r", encoding="utf8") as f:
    for line in f:
        # Each line is expected to be: <token> <v1> <v2> ... separated by
        # single spaces.
        token, *raw_vals = line.rstrip().split(" ")
        # Check the value count before converting, so rows of the wrong
        # length (including blank lines) are skipped without parsing them.
        if len(raw_vals) != EMB_DIM:
            continue
        EMB[token] = np.array([float(x) for x in raw_vals], dtype=float)


In [None]:
def simple_tokenize(text):
    """Lowercase ``text``, drop periods and commas, and split on whitespace.

    Only the characters ``.`` and ``,`` are removed; any other punctuation
    stays attached to its token.

    Returns:
        A list of token strings (empty when ``text`` has no tokens).
    """
    # A translation table that deletes "." and "," in a single pass.
    strip_punct = str.maketrans("", "", ".,")
    cleaned = text.lower().translate(strip_punct)
    return cleaned.split()


In [None]:

def sentence_to_vector(sentence, emb_index, dim=4):
    """Average the embeddings of the known words in ``sentence``.

    Args:
        sentence: Raw text; it is tokenized with ``simple_tokenize``.
        emb_index: Mapping from token to embedding vector.
        dim: Fallback length for the zero vector. It is used only when no
            token of the sentence is known AND the vector length cannot be
            read from ``emb_index`` (for example because it is empty).

    Returns:
        A tuple ``(sent_vec, tokens, unknown)`` where ``sent_vec`` is the
        element-wise mean of the embeddings of the known tokens (or a zero
        vector when none is known), ``tokens`` is the full token list and
        ``unknown`` lists the tokens that are missing from ``emb_index``.
    """
    tokens = simple_tokenize(sentence)
    vecs = []
    unknown = []
    for t in tokens:
        if t in emb_index:
            vecs.append(emb_index[t])
        else:
            unknown.append(t)
    if not vecs:
        # All words are unknown: return a zero vector. Take its length from
        # the embeddings themselves so it is comparable with vectors built
        # from known words; fall back to ``dim`` if that is not possible.
        try:
            dim = len(next(iter(emb_index.values())))
        except (AttributeError, StopIteration, TypeError):
            pass
        return np.zeros(dim), tokens, unknown
    sent_vec = np.mean(vecs, axis=0)
    return sent_vec, tokens, unknown


In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    If either vector has zero Euclidean norm the similarity is defined as
    ``0.0`` here, which avoids a division by zero.

    Returns:
        The similarity as a built-in ``float``.
    """
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    # Guard: a zero-length vector has no direction to compare.
    if any(n == 0 for n in (norm_a, norm_b)):
        return 0.0
    similarity = np.dot(a, b) / (norm_a * norm_b)
    return float(similarity)

In [None]:
# Sentence pairs used to demonstrate the similarity measure.
pairs = [
    ("i love cats", "i like dogs"),
    ("i love cats", "i drive a car"),
    ("he is a king", "she is a queen"),
    ("python is a snake", "python is a programming language"),
    ("bananas are yellow", "i love cats"),
]

# For every pair: embed both sentences, compare them, and print a report.
for s1, s2 in pairs:
    # Only the sentence vector (first element of the returned tuple) is
    # needed here; the token lists are not used by the demo.
    v1 = sentence_to_vector(s1, EMB)[0]
    v2 = sentence_to_vector(s2, EMB)[0]
    sim = cosine_similarity(v1, v2)
    print(f"Sentence 1: {s1!r}")
    print(f"Sentence 2: {s2!r}")
    print(f"Cosine similarity: {sim:.4f}\n")

print("=== End of demo ===\n")

