In [1]:
pip install numpy


Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np

def load_glove(file_path):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``cat 0.45 -0.50 ...``.

    Args:
        file_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Blank lines (such as an empty line at the end of the file) carry
            # no token; indexing parts[0] on them would raise IndexError.
            if not parts:
                continue
            glove[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return glove

# Load GloVe vectors (make sure the file is in the same folder as the notebook)
glove_vectors = load_glove("glove.6B.50d.txt")  # swap in a different GloVe text file here if desired


In [5]:
# Show the vector for a word (bare last expression, so the notebook displays the array)
glove_vectors["cat"]


array([ 0.45281 , -0.50108 , -0.53714 , -0.015697,  0.22191 ,  0.54602 ,
       -0.67301 , -0.6891  ,  0.63493 , -0.19726 ,  0.33685 ,  0.7735  ,
        0.90094 ,  0.38488 ,  0.38367 ,  0.2657  , -0.08057 ,  0.61089 ,
       -1.2894  , -0.22313 , -0.61578 ,  0.21697 ,  0.35614 ,  0.44499 ,
        0.60885 , -1.1633  , -1.1579  ,  0.36118 ,  0.10466 , -0.78325 ,
        1.4352  ,  0.18629 , -0.26112 ,  0.83275 , -0.23123 ,  0.32481 ,
        0.14485 , -0.44552 ,  0.33497 , -0.95946 , -0.097479,  0.48138 ,
       -0.43352 ,  0.69455 ,  0.91043 , -0.28173 ,  0.41637 , -1.2609  ,
        0.71278 ,  0.23782 ], dtype=float32)

In [7]:
import numpy as np
from numpy.linalg import norm

In [None]:
# Read the entire GloVe file into memory and break it into raw text lines.
# Splitting on "\n" leaves an empty final entry when the text ends with a newline.
with open("glove.6B.50d.txt", "r", encoding="utf-8") as f:
    vectors = f.read().split("\n")

In [None]:
# Build a word -> row-index lookup plus a dense embedding matrix from the raw
# text lines held in `vectors`.
word2idx = dict()
# Preallocate one row per raw line; lines that carry no vector are skipped
# below, so some of these rows may never be filled.
matrix = np.zeros((len(vectors), 50))

idx = 0
for v in vectors:
    data : list = v.split(" ")
    word : str = data[0]
    vector : list = [float(i) for i in data[1:]]
    if len(vector) > 1:
        word2idx[word] = idx
        matrix[idx, 0:] = vector
        idx += 1

# Drop the unused preallocated rows (e.g. the one reserved for the empty entry
# that split("\n") yields when the text ends with a newline) so that the
# matrix has exactly one row per entry in word2idx.
matrix = matrix[:idx]

In [9]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm and the ratio is undefined.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector has no direction; dividing would produce NaN (and a
    # RuntimeWarning), and NaN scores break any later sorting by similarity.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def most_similar(A: str, top_n: int = 10):
    """Return the words whose vectors are most similar to that of ``A``.

    Reads the module-level ``glove_vectors`` mapping and scores every other
    word against ``A`` with ``cosine_similarity``.

    Args:
        A: Query word.
        top_n: Maximum number of neighbours to return (defaults to 10).

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. If ``A`` is not in the vocabulary, a message is printed and
        an empty list is returned.
    """
    import heapq  # local import: only this function needs it

    if A not in glove_vectors:
        print(f"'{A}' not in vocabulary.")
        return []

    A_vec = glove_vectors[A]
    scored = (
        (word, cosine_similarity(A_vec, vec))
        for word, vec in glove_vectors.items()
        if word != A
    )
    # nlargest tracks only the best top_n candidates instead of sorting the
    # whole vocabulary; the result matches sorted(..., reverse=True)[:top_n].
    return heapq.nlargest(top_n, scored, key=lambda pair: pair[1])


In [11]:
import numpy as np

def load_glove(file_path):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``cat 0.45 -0.50 ...``.

    Args:
        file_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Blank lines (such as an empty line at the end of the file) carry
            # no token; indexing parts[0] on them would raise IndexError.
            if not parts:
                continue
            glove[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return glove

# Load 50-dimensional GloVe vectors
# (re-reads the same file as the earlier load cell and rebinds glove_vectors)
glove_vectors = load_glove("glove.6B.50d.txt")


In [13]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm and the ratio is undefined.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector has no direction; dividing would produce NaN (and a
    # RuntimeWarning), and NaN scores break any later sorting by similarity.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def most_similar(A: str, top_n: int = 10):
    """Return the words whose vectors are most similar to that of ``A``.

    Reads the module-level ``glove_vectors`` mapping and scores every other
    word against ``A`` with ``cosine_similarity``.

    Args:
        A: Query word.
        top_n: Maximum number of neighbours to return (defaults to 10).

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. If ``A`` is not in the vocabulary, a message is printed and
        an empty list is returned.
    """
    import heapq  # local import: only this function needs it

    if A not in glove_vectors:
        print(f"'{A}' not in vocabulary.")
        return []

    A_vec = glove_vectors[A]
    scored = (
        (word, cosine_similarity(A_vec, vec))
        for word, vec in glove_vectors.items()
        if word != A
    )
    # nlargest tracks only the best top_n candidates instead of sorting the
    # whole vocabulary; the result matches sorted(..., reverse=True)[:top_n].
    return heapq.nlargest(top_n, scored, key=lambda pair: pair[1])


In [15]:
# Print each neighbour of "terrible" with its similarity formatted to 4 decimal places
for word, score in most_similar("terrible"):
    print(f"{word}: {score:.4f}")


horrible: 0.9373
awful: 0.8873
tragic: 0.8340
dreadful: 0.8280
tragedy: 0.8180
horrific: 0.8109
nightmare: 0.7995
unfortunate: 0.7938
horrendous: 0.7857
unfortunately: 0.7838


In [17]:
import numpy as np

def cossim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm and the ratio is undefined.
    """
    denominator = np.linalg.norm(A) * np.linalg.norm(B)
    # Guard the division: a zero-norm vector would otherwise give NaN together
    # with a RuntimeWarning.
    if denominator == 0:
        return 0.0
    cosine = np.dot(A, B) / denominator
    return cosine


In [23]:
# Look up the vectors for two related words; a word missing from
# glove_vectors raises KeyError here.
vec1 = glove_vectors["prague"]
vec2 = glove_vectors["czech"]

# Report their cosine similarity formatted to 4 decimal places.
similarity = cossim(vec1, vec2)
print(f"Cosine similarity: {similarity:.4f}")


Cosine similarity: 0.6934
