In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
from pyspark.sql import functions as F, Window
from manga_recsys.spark import get_spark

# Obtain the Spark entry point from the project helper. Presumably a
# SparkSession (it is used below via `spark.read` and
# `spark.createDataFrame`) — TODO confirm against `manga_recsys.spark`.
spark = get_spark()

In [2]:
# Read the processed manga metadata listing (parquet) into a Spark DataFrame.
manga_info_path = "../data/processed/2022-12-17-metadata-listing/manga_info.parquet"
manga_info = spark.read.parquet(manga_info_path)

In [19]:
# One row per entry of each manga's `tags` column, de-duplicated, with the
# fields of each tag struct expanded into top-level columns.
exploded_tags = manga_info.select(F.explode("tags").alias("tags"))
unique_tags = exploded_tags.distinct()
tags = unique_tags.select("tags.*")
tags.show(3)

+------+----------------+
| group|            name|
+------+----------------+
| theme|         Samurai|
|format|       Anthology|
|format|Official Colored|
+------+----------------+
only showing top 3 rows



In [29]:
import pandas as pd

# Read the word2vec-1d embedding file. `reset_index()` moves the frame's index
# into a regular `index` column so it can later be used as a join key.
w2v_1d_path = (
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/word2vec-1d.json"
)
w2v = pd.read_json(w2v_1d_path).reset_index()
# Every `emb` entry is indexable; keep only its first element as the value.
w2v["emb"] = w2v["emb"].map(lambda vec: vec[0])
w2v.head()

Unnamed: 0,index,emb
0,4-Koma,-2.064125
1,Action,-2.740936
2,Adaptation,-2.548935
3,Adventure,-2.760301
4,Aliens,-2.783622


In [32]:
# Convert the pandas embedding table to Spark and rename its `index` column to
# `name`, the key shared with `tags`.
w2v_sdf = spark.createDataFrame(w2v).withColumnRenamed("index", "name")
# Left join keeps every row of `tags`; the result is ordered by `emb`.
tags_emb = tags.join(w2v_sdf, on="name", how="left").orderBy("emb")
tags_emb.show()

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


+----------------+-------+-------------------+
|            name|  group|                emb|
+----------------+-------+-------------------+
|        Military|  theme|  -3.32401180267334|
|        Survival|  theme| -3.247955799102783|
|           Crime|  genre| -3.142381191253662|
|            Gore|content| -3.104047536849975|
|        Thriller|  genre| -3.062736988067627|
|          Police|  theme| -3.021719217300415|
|        Monsters|  theme| -3.021246910095215|
|           Mafia|  theme| -3.018636465072632|
|   Philosophical|  genre|  -2.98616623878479|
|Post-Apocalyptic|  theme| -2.985933303833008|
|         Samurai|  theme| -2.943522691726684|
|          Sci-Fi|  genre| -2.930526494979858|
|          Aliens|  theme| -2.783621788024902|
|       Adventure|  genre| -2.760301351547241|
|          Demons|  theme|  -2.74478793144226|
|         Zombies|  theme| -2.741482019424438|
|          Action|  genre| -2.740936040878296|
|     Delinquents|  theme| -2.731619834899902|
|   Reincarna

In [34]:
from pathlib import Path
import json

# Collect the joined tag/embedding table to pandas and write it out as a JSON
# list of row records.
path = Path(
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/tags.json"
)
tags_emb_pdf = tags_emb.toPandas()
tags_emb_pdf.to_json(path, orient="records")

In [35]:
# Instead of the 1-d word2vec value, embed the tags into hyperbolic space so that they look good when laid out on a circle.

In [37]:
# Read the word2vec-16d embedding file; `reset_index()` turns the frame's index
# into a regular `index` column and leaves a fresh 0..n-1 row index.
w2v_16d_path = (
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/word2vec-16d.json"
)
w2v = pd.read_json(w2v_16d_path).reset_index()
w2v.head()

Unnamed: 0,index,emb
0,4-Koma,"[0.043435163795948, 0.246976390480995, -0.5715..."
1,Action,"[0.917387902736663, 1.068451046943664, 2.03778..."
2,Adaptation,"[0.9329831004142761, -1.450610756874084, -0.79..."
3,Adventure,"[2.34206485748291, -0.047535207122564004, 0.96..."
4,Aliens,"[2.270046234130859, 0.06459613144397701, 1.549..."


In [40]:
from umap import UMAP
import numpy as np

# Reduce the word2vec vectors to a single coordinate with UMAP, using the
# hyperboloid output metric.
# UMAP is stochastic: without a fixed `random_state` every run yields different
# coordinates, so results derived from `emb` could not be reproduced. Pin the
# seed so that Restart & Run All gives the same embedding.
reducer = UMAP(
    n_components=1,
    output_metric="hyperboloid",
    random_state=42,
    verbose=True,
)
# Stack the per-row `emb` lists into one 2-D array; row i of the result
# corresponds to row i of `w2v` in its current order.
emb = reducer.fit_transform(np.stack(w2v.emb.tolist()))

UMAP(n_components=1, output_metric='hyperboloid', verbose=True)
Fri Jan  6 20:03:40 2023 Construct fuzzy simplicial set
Fri Jan  6 20:03:40 2023 Finding Nearest Neighbors
Fri Jan  6 20:03:41 2023 Finished Nearest Neighbor Search
Fri Jan  6 20:03:43 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Jan  6 20:03:44 2023 Finished embedding


In [44]:
# `emb` was fitted on `w2v` while it still carried the fresh 0..n-1 index left
# by `reset_index()`, so row i of `emb` belongs to index label i.
# Assigning a plain list is positional: because this cell also sorts `w2v`,
# running it a second time would attach coordinates to the wrong tags. Wrapping
# the values in a Series labelled 0..n-1 makes pandas align on the index labels
# (which `sort_values` preserves), so the cell is safe to re-run.
# Assumes `w2v` has not been re-indexed since `emb` was computed.
hyperbolic_coord = pd.Series(emb[:, 0].tolist(), index=pd.RangeIndex(len(emb)))
w2v["emb"] = hyperbolic_coord
w2v = w2v.sort_values("emb")
w2v.head()

Unnamed: 0,index,emb
35,Martial Arts,-88.506248
0,4-Koma,-76.551628
61,Superhero,-69.547234
47,Philosophical,-63.449074
17,Fan Colored,-57.06543


In [45]:
# Convert the pandas table (whose `emb` now holds the UMAP coordinate) to Spark
# and rename `index` to `name`, the key shared with `tags`.
w2v_sdf = spark.createDataFrame(w2v).withColumnRenamed("index", "name")
# Left join keeps every row of `tags`; the result is ordered by `emb`.
tags_emb = tags.join(w2v_sdf, on="name", how="left").orderBy("emb")
tags_emb.show()

# Write the joined table out as a JSON list of row records.
path = Path(
    "../data/processed/2022-12-20-recommendation-manga-tags-word2vec/tags-hyperboloid.json"
)
tags_emb_pdf = tags_emb.toPandas()
tags_emb_pdf.to_json(path, orient="records")

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


+---------------+-------+-------------------+
|           name|  group|                emb|
+---------------+-------+-------------------+
|   Martial Arts|  theme|  -88.5062484741211|
|         4-Koma| format| -76.55162811279297|
|      Superhero|  genre| -69.54723358154297|
|  Philosophical|  genre|-63.449073791503906|
|    Fan Colored| format|     -57.0654296875|
|          Music|  theme| -52.26902389526367|
|           Gore|content| -47.96864318847656|
|         Ghosts|  theme| -44.20344543457031|
|        Mystery|  genre| -39.89293670654297|
|        Cooking|  theme|-36.308815002441406|
|         Demons|  theme|-32.586727142333984|
|         Aliens|  theme|-29.141666412353516|
|          Mecha|  genre|-26.937532424926758|
|  Crossdressing|  theme|-24.301036834716797|
|  Slice of Life|  genre|-22.155498504638672|
|    Delinquents|  theme|-18.757444381713867|
|          Harem|  theme|-17.142885208129883|
|          Mafia|  theme|-15.103346824645996|
|Virtual Reality|  theme| -13.0152