In [2]:
# autoreload in mode 2 re-imports modules before each cell is executed, so
# edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# lab_black formats code cells with black.
%load_ext lab_black

In [3]:
from pyspark.sql import functions as F
from manga_recsys.spark import get_spark

# Obtain the session object from the project helper (presumably a SparkSession,
# since it is used below via `spark.read` — confirm in manga_recsys.spark).
# The bare `spark` expression renders the object as the cell output.
spark = get_spark()
spark

In [4]:
# Parquet directory holding the processed MangaDex manga records, relative to
# this notebook.
MANGA_PARQUET_PATH = "../data/processed/2022-12-10-mangadex-manga/"

df = spark.read.parquet(MANGA_PARQUET_PATH)
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- relationships: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- related: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- availableTranslatedLanguages: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- chapterNumbersResetOnNewVolume: boolean (nullable = true)
 |    |-- contentRating: string (nullable = true)
 |    |-- createdAt: string (nullable = true)
 |    |-- isLocked: boolean (nullable = true)
 |    |-- lastChapter: string (nullable = true)
 |    |-- lastVolume: string (nullable = true)
 |    |-- latestUploadedChapter: string (nullable = true)
 |    |-- originalLanguage: string (nullable = true)
 |    |-- publicationDemographic: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- status: 

In [7]:
# Step 1: one row per (manga, tag) by exploding the nested tag array.
manga_tag_pairs = df.select("id", F.explode("attributes.tags").alias("tag"))

# Step 2: flatten the fields of interest out of each tag struct.
exploded_tags = manga_tag_pairs.select(
    F.col("id").alias("manga_id"),
    F.col("tag.attributes.name.en").alias("tag_name"),
    F.col("tag.id").alias("tag_id"),
)
exploded_tags.show(n=10, truncate=60)

+------------------------------------+----------+------------------------------------+
|                            manga_id|  tag_name|                              tag_id|
+------------------------------------+----------+------------------------------------+
|6b64bfb7-8fff-4633-82e2-340cbb8bc92e|Boys' Love|5920b825-4181-4a17-beeb-9918b0ff7a30|
|6b64bfb7-8fff-4633-82e2-340cbb8bc92e| Doujinshi|b13b2a48-c720-44a9-9c77-39c9979373fb|
|6b64bfb7-8fff-4633-82e2-340cbb8bc92e|Full Color|f5ba408b-0e7a-484d-8d49-4e9125ac96de|
|4660003f-15c9-4b52-84c1-ba46c6943edf|   Oneshot|0234a31e-a729-4e28-9d6a-3f87c4966b9e|
|4660003f-15c9-4b52-84c1-ba46c6943edf|   Romance|423e2eae-a7a2-4a8b-ac03-a8351462d71d|
|4660003f-15c9-4b52-84c1-ba46c6943edf|     Drama|b9af3a63-f058-46de-a9a0-e0c13906197a|
|4660003f-15c9-4b52-84c1-ba46c6943edf|   Tragedy|f8f62932-27da-4fe4-8ee1-6779a8c5edba|
|699f08d0-7661-4f8e-8eb8-9718b3de1a7a|Long Strip|3e2b8dae-350e-4ab8-a8ce-016e844b9f0d|
|699f08d0-7661-4f8e-8eb8-9718b3de1a7a|   Ro

In [8]:
# Collapse the exploded rows back to one row per manga, keeping the distinct
# tag names as an array column; this is the itemset column fed to FPGrowth.
distinct_tag_names = F.collect_set("tag_name").alias("tag_names")
collected_tags = exploded_tags.groupBy("manga_id").agg(distinct_tag_names)
collected_tags.show(n=10, truncate=60)

+------------------------------------+------------------------------------------------------------+
|                            manga_id|                                                   tag_names|
+------------------------------------+------------------------------------------------------------+
|0003d7e8-31da-463b-97e9-e87045f8f5ee|[Slice of Life, Drama, Romance, Oneshot, Doujinshi, Tragedy]|
|001bc08b-21a9-47fa-bfc0-82b0ad77a58b|      [Boys' Love, Drama, Oneshot, Doujinshi, Crossdressing]|
|00204d60-d5c8-486d-9dbc-868a91d27612|                                 [Boys' Love, Comedy, Shota]|
|0020845f-63e2-45bd-b69d-5e958aa37dff|[Thriller, Action, User Created, Horror, Psychological, T...|
|002a8f9e-b2e1-4841-8745-bd3d66f252e6|                    [Fantasy, Comedy, Romance, Supernatural]|
|002d34bf-4514-4f8d-9f60-4219d0de8402|                   [Boys' Love, Drama, School Life, Romance]|
|0030db10-93a6-4f6a-85f8-939db42600f4|[Villainess, Fantasy, Reincarnation, Comedy, Romance, Ada...|


# fpgrowth

This is an algorithm for building association rule sets, where sets of tags are used to predict other sets of tags.

## support 

Support is an indicator of frequency of a set of tags.

$$
supp(X) = \frac{|\{(i, t) \in T : X \subseteq t\}|}{|T|}
$$

where $(i, t)$ is the identifier and itemset of a transaction.

## confidence

Confidence is the fraction of transactions containing the antecedent set of tags $X$ that also contain the consequent set of tags $Y$.

$$
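conf(X \rightarrow Y) = \frac{supp(X \cup Y)}{supp(X)}
$$

where $X \rightarrow Y$ is the rule, and $supp(X \cup Y)$ is the support of the combined itemset, i.e. the fraction of transactions that contain every tag in both $X$ and $Y$.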

## lift

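Lift measures how much more often two sets of tags occur together than would be expected if they were independent of each other.
When lift is equal to 1, the sets are independent.
When lift is greater than 1, the sets are positively associated (they co-occur more often than expected).
When lift is less than 1, the sets are negatively associated (they co-occur less often than expected).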

$$
lift(X \rightarrow Y) = \frac{supp(X \cup Y)}{supp(X) \times supp(Y)} = \frac{conf(X \rightarrow Y)}{supp(Y)}
$$


In [27]:
from pyspark.ml.fpm import FPGrowth

# Mining thresholds: itemsets must appear in at least MIN_SUPPORT of all manga,
# and generated rules must reach at least MIN_CONFIDENCE.
MIN_SUPPORT = 0.01
MIN_CONFIDENCE = 0.01

fpgrowth_params = {
    "itemsCol": "tag_names",
    "predictionCol": "prediction",
    "minSupport": MIN_SUPPORT,
    "minConfidence": MIN_CONFIDENCE,
}
fp = FPGrowth(**fpgrowth_params)

# Fit on the per-manga tag sets; the fitted model is shown as the cell output.
fpm = fp.fit(collected_tags)
fpm

FPGrowthModel: uid=FPGrowth_446a70da9edd, numTrainingRecords=64464

In [28]:
# Number of frequent itemsets found by the fitted FPGrowth model.
fpm.freqItemsets.count()

884

In [30]:
# Preview the frequent itemsets ordered by their item arrays, so itemsets that
# share a leading tag are listed next to each other.
freq_itemsets_by_items = fpm.freqItemsets.sort("items")
freq_itemsets_by_items.show(n=10, truncate=60)

+--------------------------------+-----+
|                           items| freq|
+--------------------------------+-----+
|                        [4-Koma]| 1277|
|                [4-Koma, Comedy]| 1110|
|         [4-Koma, Slice of Life]|  716|
|                        [Action]|11559|
|                [Action, Comedy]| 4832|
|       [Action, Comedy, Romance]| 1611|
|                 [Action, Drama]| 5115|
|         [Action, Drama, Comedy]| 2114|
|[Action, Drama, Comedy, Romance]|  861|
|        [Action, Drama, Romance]| 1570|
+--------------------------------+-----+
only showing top 10 rows



In [31]:
# Rank rules by support first and break ties by confidence, both descending.
support_then_confidence = [F.col("support").desc(), F.col("confidence").desc()]
fpm.associationRules.sort(*support_then_confidence).show()

+---------------+---------------+-------------------+------------------+-------------------+
|     antecedent|     consequent|         confidence|              lift|            support|
+---------------+---------------+-------------------+------------------+-------------------+
|       [Comedy]|      [Romance]| 0.4948886210437034|1.2352383190831804|0.17197195333829735|
|      [Romance]|       [Comedy]|0.42924071707902584|1.2352383190831802|0.17197195333829735|
|        [Drama]|      [Romance]| 0.5344001630822546|1.3338588342794153| 0.1626644328617523|
|      [Romance]|        [Drama]|0.40600921516242694|1.3338588342794153| 0.1626644328617523|
|  [School Life]|      [Romance]| 0.6168492633169625|1.5396511755319888|0.12664432861752295|
|      [Romance]|  [School Life]|0.31610330274518916|1.5396511755319888|0.12664432861752295|
|[Slice of Life]|       [Comedy]| 0.5569262027429069|1.6026825022819853|0.11905869942913874|
|       [Comedy]|[Slice of Life]|0.34261863309673674| 1.60268250228198

In [48]:
# Keep only rules whose antecedent has more than one tag, then list the most
# widely supported ones first.
multi_tag_rules = fpm.associationRules.where(F.size("antecedent") > 1)
multi_tag_rules.sort(F.desc("support")).show(truncate=60)

+------------------------+---------------+-------------------+------------------+-------------------+
|              antecedent|     consequent|         confidence|              lift|            support|
+------------------------+---------------+-------------------+------------------+-------------------+
| [Long Strip, Web Comic]|   [Full Color]| 0.9281793229643184|  6.38639682736384|0.09442479523454951|
|[Long Strip, Full Color]|    [Web Comic]| 0.8380834365964478| 5.231042859871555|0.09442479523454951|
| [Full Color, Web Comic]|   [Long Strip]|  0.880133024869867|6.2458052966987125|0.09442479523454951|
|       [Comedy, Romance]|  [School Life]|0.41286307053941906|  2.01094106378943|0.07100086870191115|
|   [School Life, Comedy]|      [Romance]|  0.647658129333522|1.6165498760737276|0.07100086870191115|
|  [School Life, Romance]|       [Comedy]| 0.5606320431161196|1.6133469053808995|0.07100086870191115|
|         [Drama, Comedy]|      [Romance]| 0.6082323748357904|1.5181434859416267|0

In [59]:
# Rules that predict the "Isekai" tag, with the most confident rules first.
isekai_rules = fpm.associationRules.where(F.array_contains("consequent", "Isekai"))
isekai_rules.sort(F.desc("confidence")).show(truncate=60)

+------------------------+----------+-------------------+------------------+--------------------+
|              antecedent|consequent|         confidence|              lift|             support|
+------------------------+----------+-------------------+------------------+--------------------+
|[Reincarnation, Fantasy]|  [Isekai]| 0.6085240726124704|18.573814307239722|0.011960163812360387|
|         [Reincarnation]|  [Isekai]|  0.525175644028103| 16.02979295294869|0.013914743112434848|
|   [Adaptation, Fantasy]|  [Isekai]| 0.3764002987303958|11.488763663520945| 0.01563663440059568|
|   [Adaptation, Romance]|  [Isekai]|0.29385574354407834|  8.96927871772039|0.010238272524199553|
|        [Magic, Fantasy]|  [Isekai]|  0.282590051457976| 8.625419070637767|0.010222759990071979|
|            [Adaptation]|  [Isekai]|0.25822485207100593| 7.881726734803658|0.016924174733184413|
|                 [Magic]|  [Isekai]|0.24475524475524477| 7.470597584233948|0.010858773889302557|
|      [Fantasy, Rom