In [3]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# NOTE(review): lab_black is presumably the Black auto-formatter extension for JupyterLab — confirm.
%load_ext lab_black
from pyspark.sql import functions as F, Window
from manga_recsys.spark import get_spark

# Session handle from the project helper; used below as `spark.read.parquet(...)`.
spark = get_spark()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [5]:
# Load the manga metadata snapshot; its `tags` column feeds FPGrowth below.
manga_info_path = "../data/processed/2022-12-17-metadata-listing/manga_info.parquet"
manga_info = spark.read.parquet(manga_info_path)

In [42]:
from pyspark.ml.fpm import FPGrowth

# Mine frequent tag combinations: each row's `tags` array is used as the item
# list (itemsCol). Both thresholds are set to 0.001.
fpgrowth_params = {
    "itemsCol": "tags",
    "predictionCol": "prediction",
    "minSupport": 0.001,
    "minConfidence": 0.001,
}
fp = FPGrowth(**fpgrowth_params)
fpm = fp.fit(manga_info)
fpm

FPGrowthModel: uid=FPGrowth_cf676a1c8919, numTrainingRecords=64464

In [43]:
# Number of frequent itemsets the fitted model found.
fpm.freqItemsets.count()

118100

In [44]:
# Preview the first ten itemsets in `items` order, truncating cells to 60 characters.
fpm.freqItemsets.orderBy("items").show(10, truncate=60)

+------------------------------------------------------------+----+
|                                                       items|freq|
+------------------------------------------------------------+----+
|                                           [{content, Gore}]|1783|
|               [{content, Gore}, {content, Sexual Violence}]| 398|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...| 243|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...|  89|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...| 180|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...|  69|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...|  94|
|[{content, Gore}, {content, Sexual Violence}, {genre, Act...|  69|
|[{content, Gore}, {content, Sexual Violence}, {genre, Adv...| 136|
|[{content, Gore}, {content, Sexual Violence}, {genre, Adv...| 129|
+------------------------------------------------------------+----+
only showing top 10 rows



In [45]:
# Inspect the itemset schema; the cells below rely on the `group` and `name` fields of each item.
fpm.freqItemsets.printSchema()

root
 |-- items: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- group: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- freq: long (nullable = false)



In [46]:
# Goal: for every genre, find its most frequent partner tag among the 2-item
# frequent itemsets, once for genre/genre pairs (is_theme = 0) and once for
# genre/theme pairs (is_theme = 1).

# One row per (itemset, item). `id` tags the rows that came from the same
# itemset. monotonically_increasing_id() is non-deterministic, so `id` is only
# ever compared within a single evaluation of this frame (see the window
# below); it is never used as a join key between two separate evaluations.
exploded = (
    fpm.freqItemsets.where(F.size("items") == 2)
    .withColumn("id", F.monotonically_increasing_id())
    .withColumn("exploded", F.explode("items"))
    .select("id", "exploded.*", "freq", "items")
)
genres = exploded.where(F.col("group") == "genre").select("name").distinct()

# only keep ids that have at least one instance of genre, but not "format" or "content"
# The group set of each itemset is gathered with a window over `id`, so the
# filter runs on the very same rows instead of self-joining `exploded` on a
# key that may be regenerated differently when the lazy plan is recomputed.
groups_per_itemset = F.collect_set("group").over(Window.partitionBy("id"))
filtered = (
    exploded.withColumn("groups", groups_per_itemset)
    .where(F.array_contains("groups", "genre"))
    .where(~F.array_contains("groups", "format"))
    .where(~F.array_contains("groups", "content"))
    .withColumn(
        "is_theme",
        F.when(F.array_contains("groups", "theme"), 1).otherwise(0),
    )
    .drop("groups")
)

# Kept with its original columns (id, is_theme); now derived from `filtered`.
valid_ids = filtered.select("id", "is_theme").distinct()

# Most frequent itemset per (genre, is_theme). row_number() picks exactly one
# row per partition, so ties on `freq` are broken arbitrarily.
ranked = (
    genres.join(filtered, on="name", how="left")
    .withColumn(
        "rank",
        F.row_number().over(
            Window.partitionBy("name", "is_theme").orderBy(F.desc("freq"))
        ),
    )
    .where("rank = 1")
)

# Re-explode the winning itemset and keep the item that is not the genre
# itself, i.e. its partner. Genres left unmatched by the left join have null
# `items`, and explode() emits no rows for them.
res = (
    ranked.withColumn("exploded", F.explode("items"))
    .where("name != exploded.name")
    .selectExpr("name as genre", "exploded.name as pair", "is_theme", "freq")
)
print(res.count())

50


In [49]:
# Bring the result to the driver as a pandas frame and write it out as CSV
# without the index column.
genre_pairs_pdf = res.toPandas()
genre_pairs_pdf.to_csv("../data/processed/2023-01-18-genre-pairs.csv", index=False)

In [51]:
# Print up to 100 result rows, truncating cells to 80 characters.
res.show(n=100, truncate=80)

+-------------+-------------+--------+-----+
|        genre|         pair|is_theme| freq|
+-------------+-------------+--------+-----+
|       Action|    Adventure|       0| 5257|
|       Action| Supernatural|       1| 3447|
|    Adventure|       Action|       0| 5257|
|    Adventure| Supernatural|       1| 1871|
|   Boys' Love|      Romance|       0| 5084|
|   Boys' Love|  School Life|       1| 2220|
|       Comedy|      Romance|       0|11086|
|       Comedy|  School Life|       1| 7067|
|        Crime|        Drama|       0|  804|
|        Crime|       Police|       1|  401|
|        Drama|      Romance|       0|10486|
|        Drama|  School Life|       1| 4806|
|      Fantasy|       Action|       0| 5190|
|      Fantasy| Supernatural|       1| 3334|
|  Girls' Love|      Romance|       0| 1515|
|  Girls' Love|  School Life|       1| 1290|
|   Historical|        Drama|       0| 2018|
|   Historical| Supernatural|       1|  807|
|       Horror|        Drama|       0| 1358|
|       Ho