In [66]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
from pyspark.sql import functions as F, Window
from manga_recsys.spark import get_spark

# Spark entry point obtained from the project's `get_spark` helper (presumably a
# SparkSession — confirm in manga_recsys.spark). Later cells use it to read
# parquet data and to turn pandas frames into Spark DataFrames.
spark = get_spark()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [67]:
# Load the processed manga metadata listing; the `tags` column of this frame
# is what the tag statistics below are built from.
manga_info = spark.read.format("parquet").load(
    "../data/processed/2022-12-17-metadata-listing/manga_info.parquet"
)

In [68]:
# One row per (manga, tag) occurrence, with the tag struct flattened into
# top-level columns.
tags = manga_info.select(F.explode("tags").alias("tag")).select("tag.*")

# Total number of tag occurrences across all manga. Note that this -- not the
# number of manga -- is the denominator of "support" below.
n = tags.count()

# Share of all tag occurrences held by each (group, name) pair, largest first.
tags = (
    tags.groupBy("group", "name")
    .agg((F.count("*") / n).alias("support"))
    .orderBy(F.desc("support"))
)
tags.show(3)

+-----+-------+-------------------+
|group|   name|            support|
+-----+-------+-------------------+
|genre|Romance|0.08855750734636077|
|genre| Comedy|0.07681018786796096|
|genre|  Drama|0.06728134933016963|
+-----+-------+-------------------+
only showing top 3 rows



In [75]:
from umap import UMAP
import numpy as np
from pathlib import Path
import pandas as pd
import json


def process(tags, w2v, output_metric="euclidean", random_state=None):
    """Project tag embeddings onto one UMAP dimension and attach them to ``tags``.

    Relies on the notebook-level ``spark`` session to convert the pandas frame
    into a Spark DataFrame.

    Parameters
    ----------
    tags : pyspark.sql.DataFrame
        Tag table; it must contain a ``name`` column, which is the join key.
    w2v : pandas.DataFrame
        Frame with an ``emb`` column holding one vector per row. Its index is
        moved into a column that becomes the ``name`` join key (assumes the
        index is unnamed so that ``reset_index`` produces a column called
        ``index`` -- TODO confirm for other inputs). The caller's frame is
        not modified.
    output_metric : str, default "euclidean"
        Forwarded to ``UMAP(output_metric=...)``.
    random_state : int or None, default None
        Forwarded to ``UMAP(random_state=...)``. UMAP is stochastic, so pass
        an integer to make the projection repeatable between runs. ``None``
        keeps the previous, unseeded behaviour.

    Returns
    -------
    pyspark.sql.DataFrame
        ``tags`` left-joined with the one-dimensional coordinate in ``emb``
        and ordered by ``emb``. Tags without a matching embedding keep a
        null ``emb``.
    """
    # reset_index returns a new frame, so the assignment to "emb" below does
    # not touch the frame owned by the caller.
    w2v = w2v.reset_index()
    reducer = UMAP(
        n_components=1,
        metric="cosine",
        output_metric=output_metric,
        random_state=random_state,
        verbose=True,
    )
    emb = reducer.fit_transform(np.stack(w2v.emb.tolist()))
    # Replace each embedding vector by its single projected coordinate.
    w2v["emb"] = emb[:, 0].tolist()
    # No pandas-side sort: the final orderBy("emb") determines the row order,
    # so sorting before the join would be redundant.
    tags_emb = tags.join(
        spark.createDataFrame(w2v).withColumnRenamed("index", "name"),
        on="name",
        how="left",
    ).orderBy("emb")
    return tags_emb

In [76]:
# Destination of the export: tag table plus its 1-D euclidean UMAP coordinate.
path = Path(
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/tags-euclidean.json"
)
# 16-dimensional word2vec tag embeddings produced by an earlier pipeline step.
w2v = pd.read_json(
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/word2vec-16d.json"
)
tags_emb = process(tags=tags, w2v=w2v, output_metric="euclidean")
# Collect to pandas and write one JSON object per tag row.
tags_emb.toPandas().to_json(path_or_buf=path, orient="records")

UMAP(angular_rp_forest=True, metric='cosine', n_components=1, verbose=True)
Fri Jan  6 20:36:21 2023 Construct fuzzy simplicial set
Fri Jan  6 20:36:21 2023 Finding Nearest Neighbors
Fri Jan  6 20:36:21 2023 Finished Nearest Neighbor Search
Fri Jan  6 20:36:21 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Jan  6 20:36:23 2023 Finished embedding


  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [77]:
# Repeat the 1-D projection with UMAP's "hyperboloid" output metric, reusing
# the `w2v` frame loaded earlier, and save the result as JSON records.
path = Path(
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/tags-hyperboloid.json"
)
tags_emb = process(tags=tags, w2v=w2v, output_metric="hyperboloid")
tags_emb.toPandas().to_json(path_or_buf=path, orient="records")

UMAP(angular_rp_forest=True, metric='cosine', n_components=1, output_metric='hyperboloid', verbose=True)
Fri Jan  6 20:36:26 2023 Construct fuzzy simplicial set
Fri Jan  6 20:36:26 2023 Finding Nearest Neighbors
Fri Jan  6 20:36:26 2023 Finished Nearest Neighbor Search
Fri Jan  6 20:36:26 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Jan  6 20:36:27 2023 Finished embedding


  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
