In [None]:
# Imports: pretrained embeddings (gensim), numerics, scikit-learn utilities, and plotting libraries
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
import plotly.express as px
import plotly, tqdm

# Demo vocabulary, laid out four words per line.
words_full = [
    "king", "queen", "man", "woman",
    "happy", "sad", "joyful", "nice",
    "apple", "banana", "cherry", "strawberry",
    "fork", "spoon", "knife", "cutlery",
    "moon", "sun", "planet", "star",
]

# Pretrained vectors fetched through the gensim downloader.
model = api.load("word2vec-google-news-300")

# word -> embedding vector, in the same order as `words_full`.
word_embeddings_full = dict((word, model[word]) for word in words_full)

In [None]:
# Inspect the shape of a single embedding vector (here: the one stored for "king").
word_embeddings_full["king"].shape

In [None]:
# Cosine similarity illustrated on four hand-made 3-d vectors.
# Each vector is created directly as a 1 x 3 row (a 2-D array), which is the
# layout passed to cosine_similarity below.
v_100 = np.array([[1, 0, 0]])
v_m100 = np.array([[-1, 0, 0]])
v_010 = np.array([[0, 1, 0]])
v_200 = np.array([[2, 0, 0]])

# Compare the reference vector (1, 0, 0) against itself and the three others.
sim_identical = cosine_similarity(v_100, v_100)
sim_orthogonal = cosine_similarity(v_100, v_010)
sim_scaled = cosine_similarity(v_100, v_200)
sim_opposite = cosine_similarity(v_100, v_m100)

print(f"SIMILARITY BETWEEN 1 0 0 AND 1 0 0: {sim_identical}")
print(f"SIMILARITY BETWEEN 1 0 0 AND 0 1 0: {sim_orthogonal}")
print(f"SIMILARITY BETWEEN 1 0 0 AND 2 0 0: {sim_scaled}")
print(f"SIMILARITY BETWEEN 1 0 0 AND -1 0 0: {sim_opposite}")

In [None]:
# Example: cosine similarity between word embeddings.
# Every vector is reshaped to a single row (1 x d) before being compared.
v_q, v_k, v_m, v_w, v_s, v_a = (
    word_embeddings_full[word].reshape(1, -1)
    for word in ("queen", "king", "man", "woman", "strawberry", "apple")
)

# Vector arithmetic for the analogy: king - man + woman.
v_analogy = v_k - v_m + v_w

print(f"SIMILARITY BETWEEN MAN AND WOMAN: {cosine_similarity(v_m, v_w)}")
print(f"SIMILARITY BETWEEN KING AND QUEEN: {cosine_similarity(v_k, v_q)}")
print(f"SIMILARITY BETWEEN (KING - MAN + WOMAN) AND QUEEN: {cosine_similarity(v_analogy, v_q)}")
print(f"SIMILARITY BETWEEN KING AND STRAWBERRY: {cosine_similarity(v_k, v_s)}")
print(f"SIMILARITY BETWEEN APPLE AND STRAWBERRY: {cosine_similarity(v_a, v_s)}")

In [None]:
# Visualize the pairwise cosine similarity of the embeddings as a heatmap.

# Drop the first four entries of `words_full`; the remaining words are plotted.
words = words_full[4:]

# Build the mapping by iterating `words` so that its order -- and therefore the
# row order of the matrix below -- is guaranteed to match the tick labels.
word_embeddings = {word: word_embeddings_full[word] for word in words}

# One row per word, in exactly the order of `words`.
embedding_matrix = np.array([word_embeddings[word] for word in words])

# Called with a single matrix, cosine_similarity compares every row with every row.
similarity_matrix = cosine_similarity(embedding_matrix)

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap='viridis', cbar=True
            , xticklabels=words, yticklabels=words)
# The ticks show the words themselves, so the axes are labelled accordingly.
plt.xlabel('Word')
plt.ylabel('Word')
plt.title('Similarity Matrix with Color Coding')
plt.show()



In [None]:
# Visualize embeddings in 2 dimensions using PCA
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(embedding_matrix)

# One color per run of four consecutive rows, trimmed to the number of points.
# `colors` is reused by the 3D plot further down.
colors = [shade for shade in ('blue', 'red', 'green', 'cyan') for _ in range(4)]
colors = colors[:len(reduced_data)]

# Each row of the transpose holds the coordinates along one principal component.
x_values, y_values = (list(component) for component in reduced_data.T)

# 2D scatter plot; tick marks are hidden on both axes.
plt.figure(figsize=(8, 6))
plt.scatter(x_values, y_values, color=colors, s=150)
plt.xticks([])
plt.yticks([])
plt.show()


In [None]:
# Visualize embeddings in 3 dimensions using PCA
pca = PCA(n_components=3)
reduced_data = pca.fit_transform(embedding_matrix)

# Each row of the transpose holds the coordinates along one principal component.
x_values, y_values, z_values = (list(component) for component in reduced_data.T)

# 3D scatter plot with tick marks removed on all three axes.
# `colors` is not assigned in this cell; it must already exist in the namespace.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x_values, y_values, z_values, color=colors, s=100)
ax.set_xticks([])
ax.set_yticks([])
ax.set_zticks([])

plt.show()


In [None]:
# Sweep the number of K-means clusters and score every clustering with the
# silhouette score (computed with the cosine metric).

# silhouette_score accepts between 2 and n_samples - 1 distinct labels, so the
# sweep is capped by the number of rows in `embedding_matrix`.
k_values = range(2, min(12, len(embedding_matrix)))

silhouette_scores = []  # one {'k': ..., 'silhouette_score': ...} record per k
kmeans_labels = {}      # k -> cluster label of each row of embedding_matrix
for k in tqdm.tqdm(k_values):
    kmeans = KMeans(n_clusters=k,
                    random_state=42,  # fixed seed so re-runs give the same clusters
                    n_init='auto').fit(embedding_matrix)
    kmeans_labels[k] = kmeans.labels_
    silhouette_scores.append(
        {
            'k': k,
            'silhouette_score': silhouette_score(embedding_matrix,
                kmeans_labels[k], metric='cosine')
        }
    )

# Print (cluster label, word) pairs for the k = 4 clustering.
for e in zip(kmeans_labels[4], words):
    print(e)

# Line chart of silhouette score against k; the y-axis label spelling is fixed.
fig = px.line(pd.DataFrame(silhouette_scores).set_index('k'),
       title = '<b>Silhouette scores for K-means clustering</b>',
       labels = {'value': 'silhouette score'},
       color_discrete_sequence = plotly.colors.qualitative.Alphabet)
# update_layout returns the figure, so as the last expression it renders the chart.
fig.update_layout(showlegend = False)

