In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
from pyspark.sql import functions as F, Window
from manga_recsys.spark import get_spark

# Spark entry point for this notebook, obtained from the project helper;
# the parquet reads below go through it.
# NOTE(review): presumably a SparkSession -- its configuration lives in
# manga_recsys.spark.get_spark, which is not visible here.
spark = get_spark()

In [2]:
# Processed MangaDex snapshots; the date of each dump is part of its file name.
GROUP_PARQUET = "../data/processed/2022-12-17-mangadex-group.parquet"
MANGA_PARQUET = "../data/processed/2022-12-10-mangadex-manga.parquet"

group = spark.read.parquet(GROUP_PARQUET)
manga = spark.read.parquet(MANGA_PARQUET)

In [3]:
# Pull the group id plus a handful of nested `attributes` fields up to
# top-level columns (each nested field keeps its leaf name).
group_columns = [
    "id",
    "attributes.name",
    "attributes.createdAt",
    "attributes.updatedAt",
    "attributes.description",
    "attributes.inactive",
]
group_info = group.select(*group_columns)
group_info.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------+
|                  id|                name|           createdAt|           updatedAt|         description|inactive|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------+
|c6931ee7-b4cd-44d...|       LaSecteDuScan|2021-04-19T21:45:...|2021-04-19T21:45:...|Bonjours,LaSecteD...|   false|
|3eef1981-4ab5-434...|         Alive Scans|2021-04-19T21:45:...|2021-04-19T21:45:...|                null|   false|
|145f9110-0a6c-4b7...|             Unknown|2021-04-19T21:45:...|2022-09-18T16:16:...|This group serves...|   false|
|7f4ea5d0-6af4-48a...| Effortposting Scans|2021-04-19T21:45:...|2021-04-19T21:45:...|              *Ahem*|   false|
|c1a3aadb-8b80-445...|         Saikai Scan|2021-04-19T21:45:...|2021-04-19T21:45:...|                null|   false|
|1d1bf070-9e6d-490...|          SH-Project|2021-04-19T21:45:...|2022-05-

In [4]:
def explode_localized(df, map_column, value_name):
    """Turn a language-keyed map column into one row per (manga, language).

    The result has three columns: `manga_id` (the frame's `id`), `lang`
    (the map key) and `value_name` (the map value).
    """
    return df.select(
        F.col("id").alias("manga_id"),
        F.explode(map_column).alias("lang", value_name),
    )


manga_name_lang = explode_localized(manga, "attributes.title", "manga_name")

manga_description_lang = explode_localized(
    manga, "attributes.description", "manga_description"
)

# Rank languages by how many manga titles use them (rank 1 = most common).
# `lang` is a secondary sort key so that languages with the same count always
# receive the same rank: row_number() over a window ordered by `count` alone
# breaks ties arbitrarily, which could change the language picked per manga
# from one run to the next.
# The window already orders the rows, so no separate orderBy() is needed.
# A window without partitionBy pulls all rows into one partition; that is
# acceptable here because there is only one row per language.
lang_ordered = (
    manga_name_lang.groupBy("lang")
    .count()
    .withColumn(
        "rank",
        F.row_number().over(Window.orderBy(F.desc("count"), F.asc("lang"))),
    )
)

# Keep exactly one (name, description) pair per manga: the one written in the
# best-ranked language (smallest `rank` in `lang_ordered`).
# join() defaults to an inner join, so a title only survives when a
# description exists for the same manga in the same language.
name_with_description = manga_name_lang.join(
    manga_description_lang, on=["manga_id", "lang"]
)

# Number each manga's candidate languages, best-ranked language first.
best_lang_first = Window.partitionBy("manga_id").orderBy("rank")
ranked_candidates = name_with_description.join(lang_ordered, on="lang").withColumn(
    "manga_lang_rank", F.row_number().over(best_lang_first)
)

manga_name = ranked_candidates.where(F.col("manga_lang_rank") == 1).select(
    "manga_id", "manga_name", "manga_description", "lang"
)

manga_name.show()

+--------------------+--------------------+--------------------+----+
|            manga_id|          manga_name|   manga_description|lang|
+--------------------+--------------------+--------------------+----+
|002a8f9e-b2e1-484...|Mamotte Shugogetten!|The story is abou...|  en|
|004f244e-03d0-4ee...|  Nether Cram School|Because the real ...|  en|
|005b434a-f5ba-49c...|Uwate na Otoko no...|In the rain Yuuic...|  en|
|0061c5ed-c160-4d3...|Shuumatsu no Izet...|Pairing: Fine x I...|  en|
|00635b26-aa06-4b3...|Raise A Young Master|The Female doctor...|  en|
|006fa844-b8d0-4fc...|My Girlfriend wil...|The main characte...|  en|
|00714216-e23d-448...|Reincarnated Esco...|My dream is to be...|  en|
|0088c8ad-eb0f-44f...|Harry Potter - Pi...|     Strip Ping Pong|  en|
|008c1aec-fbd9-41b...|Sweet Pain (Minat...|Yuuto and I are c...|  en|
|009b6788-48f3-4e7...|Urami Koi, Koi, U...|The Twelve City, ...|  en|
|00a32677-20ff-4f4...|About a Webtoon W...|One day, writer P...|  en|
|00a9a2ba-89a8-479..

In [5]:
# Collect every tag of a manga into a single `tags` array column.
# Each tag is reduced to its group and its English name; the tag id is left
# out on purpose (original author's note: all tags are unique anyway).
tag_struct = F.struct(
    F.col("tag.attributes.group"),
    F.col("tag.attributes.name.en").alias("name"),
).alias("tag")

# One row per (manga, tag) before regrouping by manga.
exploded_tags = manga.select("id", F.explode("attributes.tags").alias("tag"))

manga_tags = (
    exploded_tags.select("id", tag_struct)
    .groupBy("id")
    .agg(F.collect_list("tag").alias("tags"))
)
manga_tags.show(truncate=60)

+------------------------------------+------------------------------------------------------------+
|                                  id|                                                        tags|
+------------------------------------+------------------------------------------------------------+
|001bc08b-21a9-47fa-bfc0-82b0ad77a58b|[{format, Oneshot}, {genre, Boys' Love}, {theme, Crossdre...|
|00204d60-d5c8-486d-9dbc-868a91d27612|      [{genre, Comedy}, {genre, Boys' Love}, {theme, Shota}]|
|0020845f-63e2-45bd-b69d-5e958aa37dff|[{genre, Thriller}, {genre, Sci-Fi}, {genre, Action}, {ge...|
|002a8f9e-b2e1-4841-8745-bd3d66f252e6|[{genre, Romance}, {genre, Comedy}, {genre, Fantasy}, {th...|
|003e1cfd-b4f9-4d66-9080-34455ac19a96|[{genre, Historical}, {genre, Romance}, {genre, Drama}, {...|
|003e7fbf-f047-4783-a7df-1533a2a653d4|[{format, Award Winning}, {genre, Psychological}, {genre,...|
|004f244e-03d0-4ee0-ba0e-d2d2f8e6cdbf|[{genre, Thriller}, {format, Long Strip}, {theme, School ...|


In [6]:
# Scalar metadata taken straight from the raw manga frame.
manga_core = manga.select(
    "id",
    "attributes.createdAt",
    "attributes.updatedAt",
    "attributes.availableTranslatedLanguages",
    "attributes.originalLanguage",
    "attributes.publicationDemographic",
    "attributes.status",
    "attributes.year",
    "attributes.contentRating",
)

# Only the chosen display name is needed from `manga_name`; rename its
# columns so that it can be joined on `id`.
manga_display_name = manga_name.select(
    F.col("manga_id").alias("id"),
    F.col("manga_name").alias("name"),
)

# Both joins use the default join type (inner), so a manga that is absent
# from `manga_name` or `manga_tags` does not appear in `manga_info`.
manga_info = manga_core.join(manga_display_name, on="id").join(manga_tags, on="id")
manga_info.show(n=3, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------
 id                           | 002a8f9e-b2e1-4841-8745-bd3d66f252e6                                                                                              
 createdAt                    | 2019-06-16T19:41:38+00:00                                                                                                         
 updatedAt                    | 2019-06-16T19:41:38+00:00                                                                                                         
 availableTranslatedLanguages | []                                                                                                                                
 originalLanguage             | ja                                                                                                                                
 publicationDemographi

In [11]:
# Show a single record as a plain nested dict (structs become dicts because
# of recursive=True).
# first() fetches one row instead of collecting ten and discarding nine, and
# it returns None for an empty frame, where indexing an empty collect()
# result would raise IndexError.
sample_row = manga_info.first()
sample_row.asDict(recursive=True) if sample_row is not None else None

{'id': '002a8f9e-b2e1-4841-8745-bd3d66f252e6',
 'createdAt': '2019-06-16T19:41:38+00:00',
 'updatedAt': '2019-06-16T19:41:38+00:00',
 'availableTranslatedLanguages': [],
 'originalLanguage': 'ja',
 'publicationDemographic': 'shounen',
 'status': 'completed',
 'year': None,
 'contentRating': 'safe',
 'name': 'Mamotte Shugogetten!',
 'tags': [{'group': 'genre', 'name': 'Romance'},
  {'group': 'genre', 'name': 'Comedy'},
  {'group': 'genre', 'name': 'Fantasy'},
  {'group': 'theme', 'name': 'Supernatural'}]}