In [2]:
# Notebook setup.
# autoreload mode 2 re-imports all modules before each cell executes, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Loads the lab_black extension (presumably Black auto-formatting of cells —
# confirm it is installed in this environment).
%load_ext lab_black
from pyspark.sql import functions as F, Window
from manga_recsys.spark import get_spark

# Project helper that provides the `spark` handle used by every cell below.
# NOTE(review): memory="2g" is presumably the Spark memory setting — confirm
# its exact meaning in manga_recsys.spark.
spark = get_spark(memory="2g")

In [2]:
# Load the processed MangaDex manga dump from parquet into a DataFrame.
manga = spark.read.format("parquet").load(
    "../data/processed/2022-12-10-mangadex-manga.parquet"
)

In [3]:
# Flatten the per-language fields of each manga into long form: one output row
# per (manga, language) entry. The two-name alias relies on `explode` emitting
# a key column and a value column, i.e. the exploded fields are map-typed.

# Titles: manga_id, lang, manga_name
manga_name_lang = manga.select(
    manga["id"].alias("manga_id"),
    F.explode(F.col("attributes.title")).alias("lang", "manga_name"),
)

# Descriptions: manga_id, lang, manga_description
manga_description_lang = manga.select(
    manga["id"].alias("manga_id"),
    F.explode(F.col("attributes.description")).alias("lang", "manga_description"),
)

# Rank languages by how many manga have a title in them (rank 1 = most common).
#
# `lang` is a secondary sort key so that languages with equal counts always get
# the same rank; ordering by count alone lets row_number() break ties
# arbitrarily, which could change the language picked for a manga between runs.
#
# The window has no partitionBy, so Spark moves every row into one partition to
# number them. That is acceptable here: there is one row per language.
lang_rank_window = Window.orderBy(F.desc("count"), F.asc("lang"))

lang_ordered = (
    manga_name_lang.groupBy("lang")
    .count()
    .withColumn("rank", F.row_number().over(lang_rank_window))
    # Sort last: an ordering applied before the window is not guaranteed to
    # survive it, so this is what actually fixes the row order when displayed.
    .orderBy("rank")
)

# For every manga keep only the title/description pair written in its most
# common language, i.e. the candidate row with the smallest language rank.
# NOTE(review): both joins are inner joins, so a manga with no description in
# any of its title languages is dropped from the result — confirm intended.
best_lang_first = Window.partitionBy("manga_id").orderBy(F.asc("rank"))

# One candidate row per (manga, language) that has both a title and a
# description, annotated with that language's global rank.
name_description_candidates = manga_name_lang.join(
    manga_description_lang, on=["manga_id", "lang"], how="inner"
).join(lang_ordered, on="lang", how="inner")

manga_name_description = (
    name_description_candidates.withColumn(
        "manga_lang_rank", F.row_number().over(best_lang_first)
    )
    .where(F.col("manga_lang_rank") == 1)
    .select("manga_id", "manga_name", "manga_description", "lang")
)

# Preview ten rows without truncating long titles.
manga_name_description.select("manga_id", "manga_name").show(
    n=10, truncate=False
)

+------------------------------------+-----------------------------------------------+
|manga_id                            |manga_name                                     |
+------------------------------------+-----------------------------------------------+
|0003d7e8-31da-463b-97e9-e87045f8f5ee|Persona 3 - AraHamu Manga (Doujinshi)          |
|0011545f-6560-481e-a53a-972690fb3695|Wagging the Tail                               |
|00148825-e802-456c-8cfd-e10ab05d58c6|Pretty Guardian Sailor Moon Short Stories      |
|0018c989-4361-477d-89e8-29e269be93b7|Arknights: A1 Operations Preparation Detachment|
|001c5f74-c18f-4ab5-9cd6-a46455932934|Anjuu no Chi                                   |
|001f4bb7-d544-4e5a-8efc-2abc012f4e09|Futago, Futagokoro.                            |
|00204d60-d5c8-486d-9dbc-868a91d27612|Furimuku Dake de Anata Tsumi na Otoko          |
|0020845f-63e2-45bd-b69d-5e958aa37dff|Nonbeing                                       |
|002a8f9e-b2e1-4841-8745-bd3d66f252e6|Mamot

In [4]:
# Sanity check: number of rows whose selected title is missing.
manga_name_description.filter(F.col("manga_name").isNull()).count()

0

In [21]:
# Load the slim per-manga metadata listing and preview it using show()'s
# default settings (20 rows, long values truncated).
manga_info_slim = spark.read.format("parquet").load(
    "../data/processed/2022-12-17-metadata-listing/manga_info_slim.parquet"
)
manga_info_slim.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+-------------+----------+
|                  id|                name|           createdAt|chapter_count|page_count|
+--------------------+--------------------+--------------------+-------------+----------+
|005b434a-f5ba-49c...|Uwate na Otoko no...|2018-07-13T13:17:...|            8|       185|
|0088c8ad-eb0f-44f...|Harry Potter - Pi...|2019-10-07T18:55:...|            1|        39|
|009b6788-48f3-4e7...|Urami Koi, Koi, U...|2018-09-27T23:31:...|           63|      2823|
|00e3ccc4-0ec3-4fd...|                null|2021-02-15T19:43:...|         null|      null|
|0103c05a-187a-45d...|       Dana's Garden|2018-11-07T04:07:...|            1|        36|
|010a2252-bdc9-4cd...|     Kare no Shouzou|2020-01-20T22:50:...|            5|       187|
|010abcb5-c705-4dc...|Gyaru Gamer ni Ho...|2021-01-02T09:48:...|           33|       329|
|0111ec91-d174-43e...|         Neung Neung|2018-03-28T05:40:...|            2|        36|
|014cc30b-

In [22]:
# How many manga in the slim listing have no name?
manga_info_slim.filter(F.col("name").isNull()).count()

25415

In [3]:
# Re-read the same listing after the name fix (the fix itself is not part of
# this cell) and count the manga that are still left without a name.
manga_info_slim = spark.read.format("parquet").load(
    "../data/processed/2022-12-17-metadata-listing/manga_info_slim.parquet"
)
manga_info_slim.filter(F.col("name").isNull()).count()

0