In [None]:
from gensim.models import KeyedVectors
import pandas as pd
import pickle
from collections import Counter

# Run configuration: which search corpus and which embedding algorithm to load.
search_type = "masters"
algo = "Word2Vec"

# Keyed vectors previously saved for this algorithm / search type.
kv = KeyedVectors.load(f"./embedding_data/{algo}/embedding_{search_type}.kv")

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only point this at artifacts produced by this project.
with open(f"./embedding_data/max_genre_counter_{search_type}.pkl", "rb") as pkl_file:
    max_genre_counter = pickle.load(pkl_file)

# For every style keep only its top-ranked genre. The values of
# max_genre_counter are presumably collections.Counter objects (they must
# expose .most_common) -- confirm against the code that writes the pickle.
style_to_main_genre = {}
for style_name, genre_counts in max_genre_counter.items():
    style_to_main_genre[style_name] = genre_counts.most_common(1)[0][0]

# Number of styles assigned to each main genre.
main_genre_cnt = Counter(style_to_main_genre.values())


### 2D UMAP projection of the style embeddings

In [5]:
# Project the style embeddings to 2D with UMAP and export an interactive
# Plotly scatter plot coloured by each style's main genre.
import plotly.express as px
import umap.umap_ as umap

# One row per style: a "style" label column followed by the embedding dimensions.
emb_df = (
    pd.DataFrame(kv.vectors, index=kv.index_to_key)
    .reset_index()
    .rename(columns={"index": "style"})
)

# Resolve the genre labels *before* the UMAP fit: a style missing from
# style_to_main_genre then raises KeyError immediately instead of after the
# slow projection has already been computed.
main_genre_labels = [
    style_to_main_genre[style].replace("_", " ") for style in emb_df["style"]
]

reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric="cosine",
    random_state=42,  # fixed seed; UMAP warns that this disables parallelism
)

# Fit on the embedding columns only (everything except the label column).
coords = reducer.fit_transform(emb_df.drop(columns=["style"]))

emb_df["x"] = coords[:, 0]
emb_df["y"] = coords[:, 1]

# Optional alternative (disabled): colour by HDBSCAN cluster instead of genre.
# import hdbscan
#
# X = emb_df[["x", "y"]]
#
# clusterer = hdbscan.HDBSCAN(
#     min_cluster_size=15,
#     min_samples=5,
#     metric="euclidean"
# )
#
# emb_df["cluster"] = clusterer.fit_predict(X)
#
# emb_df[['style', 'x','y', 'cluster']]

emb_df["Main Genre"] = main_genre_labels

fig = px.scatter(
    emb_df,
    x="x",
    y="y",
    hover_name="style",
    # color="cluster"
    color="Main Genre",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

fig.update_traces(marker=dict(size=6, opacity=0.8))

# Standalone HTML; plotly.js is referenced from the CDN rather than embedded.
fig.write_html(
    f"./docs/style_{algo}_{search_type}_umap.html", include_plotlyjs="cdn"
)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



### 3D UMAP projection of the style embeddings

In [6]:
# Project the style embeddings to 3D with UMAP and export an interactive
# Plotly 3D scatter plot coloured by each style's main genre.
import plotly.express as px
import umap.umap_ as umap

# One row per style: a "style" label column followed by the embedding dimensions.
emb_df = (
    pd.DataFrame(kv.vectors, index=kv.index_to_key)
    .reset_index()
    .rename(columns={"index": "style"})
)

# Resolve the genre labels *before* the UMAP fit: a style missing from
# style_to_main_genre then raises KeyError immediately instead of after the
# slow projection has already been computed.
main_genre_labels = [
    style_to_main_genre[style].replace("_", " ") for style in emb_df["style"]
]

reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    metric="cosine",
    random_state=42,  # fixed seed; UMAP warns that this disables parallelism
)

# Fit on the embedding columns only (everything except the label column).
coords = reducer.fit_transform(emb_df.drop(columns=["style"]))

emb_df["x"] = coords[:, 0]
emb_df["y"] = coords[:, 1]
emb_df["z"] = coords[:, 2]

# Optional alternative (disabled): colour by HDBSCAN cluster instead of genre.
# import hdbscan
#
# X = emb_df[["x", "y", "z"]]
#
# clusterer = hdbscan.HDBSCAN(
#     min_cluster_size=30,
#     min_samples=5,
#     metric="euclidean"
# )
#
# emb_df["cluster"] = clusterer.fit_predict(X)

emb_df["Main Genre"] = main_genre_labels

fig = px.scatter_3d(
    emb_df,
    x="x",
    y="y",
    z="z",
    hover_name="style",
    # color="cluster"
    color="Main Genre",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

fig.update_traces(marker=dict(size=6, opacity=0.8))

# Standalone HTML; plotly.js is referenced from the CDN rather than embedded.
fig.write_html(
    f"./docs/style_{algo}_{search_type}_umap_3d.html", include_plotlyjs="cdn"
)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

