# Reglas de Asociación

In [1]:
from blackops.utils.catalog import start_spark_session
import pyspark.sql.functions as f
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Create the Spark session through the project helper imported from
# blackops.utils.catalog; every Spark read below goes through this session.
spark = start_spark_session()

24/11/03 13:10:43 WARN Utils: Your hostname, dadiego resolves to a loopback address: 127.0.1.1; using 192.168.104.128 instead (on interface eth1)
24/11/03 13:10:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/dadiego/projects/esic-bigdata-iv-blackops/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/dadiego/.ivy2/cache
The jars for the packages stored in: /home/dadiego/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
io.delta#delta-sharing-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-84cae321-20fa-4d41-b16c-4c864d9edb6f;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found io.delta#delta-sharing-spark_2.12;3.2.0 in central
	found io.delta#delta-sharing-client_2.12;1.0.5 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.2 in central
	found commons-codec#commons-codec;1.11 in central
:: resolution report :: resolve 439ms :: artifacts dl 16ms
	:: modules in use:
	commons-codec#commons-codec;1.11 from central in [default]
	commons-logging#commons-log

# Lectura de datos
Nuestro modelo de datos se compone de dos tablas. En la tabla principal (de hechos) hay un registro de transacciones de venta: el campo `transaction_id` identifica de manera única cada transacción (que ocupa una fila por cada producto comprado), y el campo `product_id` hace referencia al identificador del producto que se ha comprado en la transacción correspondiente. La segunda tabla (de dimensión) asocia cada `id` de producto con su nombre (`product`).

In [3]:
# Both CSV files carry a header row; let Spark infer the column types.
csv_options = {"header": True, "inferSchema": True}

# Fact table: one row per (transaction_id, product_id) pair.
transactions = spark.read.csv("data/store-transactions.csv", **csv_options)
# Dimension table: maps a product id to its product name.
dim_product = spark.read.csv("data/dim_product.csv", **csv_options)

display(transactions, dim_product)

transaction_id,product_id
1,9
1,3
1,1
2,3
2,9
2,1
2,6
3,3
3,11
3,2


id,product
1,BISCUIT
2,BOURNVITA
3,BREAD
4,COCK
5,COFFEE
6,CORNFLAKES
7,JAM
8,MAGGI
9,MILK
10,SUGER


Unimos ambas tablas para obtener una tabla completa de información, en la que cada transacción aparece con el nombre del producto en lugar de su identificador.

In [4]:
# Match every purchased product_id with its row in the product dimension.
same_product = transactions.product_id == dim_product.id
joined = transactions.join(dim_product, on=same_product, how="inner")

# Keep only the transaction id and the product name; the result is cached
# because several later cells read from it.
df = joined.select(transactions.transaction_id, dim_product.product).cache()
display(df)

transaction_id,product
1,MILK
1,BREAD
1,BISCUIT
2,BREAD
2,MILK
2,BISCUIT
2,CORNFLAKES
3,BREAD
3,TEA
3,BOURNVITA


Pre-calculamos el número total de transacciones y lo almacenamos en una variable de Python

In [5]:
# Count the distinct transactions once and bring the value back to the driver
# as a plain Python number so later cells can divide by it.
distinct_transactions = f.countDistinct("transaction_id")
total_transactions = df.agg(distinct_transactions).first()[0]

Calculamos el soporte de cada artículo, es decir, la proporción de transacciones en las que aparece (columna `probability`).

In [6]:
# Fraction of all transactions that contain a given product, rounded to
# three decimals.
support = f.round(f.countDistinct("transaction_id") / total_transactions, 3)

# One row per product, most frequent products first.
(
    df.groupBy("product")
    .agg(support.alias("probability"))
    .orderBy(f.desc("probability"))
)

product,probability
BREAD,0.65
COFFEE,0.4
TEA,0.35
BISCUIT,0.35
CORNFLAKES,0.3
SUGER,0.3
MILK,0.25
MAGGI,0.25
BOURNVITA,0.2
COCK,0.15


Transformamos nuestro DataFrame a un formato condensado, con una fila por cada transacción, y una lista de items asociada a cada transacción. Para ello, hacemos uso del comando `groupBy` para agrupar por transacción y luego agregamos los ítems en una lista con la función `f.collect_list`.

In [7]:
# Collapse to one row per transaction, gathering its products into a single
# array column, and order the rows by transaction id.
basket = f.collect_list("product").alias("products")
df = df.groupBy("transaction_id").agg(basket).orderBy("transaction_id")
display(df)

transaction_id,products
1,"[MILK, BREAD, BISCUIT]"
2,"[BREAD, MILK, BISCUIT, CORNFLAKES]"
3,"[BREAD, TEA, BOURNVITA]"
4,"[JAM, MAGGI, BREAD, MILK]"
5,"[MAGGI, TEA, BISCUIT]"
6,"[BREAD, TEA, BOURNVITA]"
7,"[MAGGI, TEA, CORNFLAKES]"
8,"[MAGGI, BREAD, TEA, BISCUIT]"
9,"[JAM, MAGGI, BREAD, TEA]"
10,"[BREAD, MILK]"


In [8]:
# One-hot encode the baskets: one row per transaction, one column per product.
te = TransactionEncoder()
baskets = df.toPandas()["products"]

# Keep the encoder's boolean output instead of casting it to int8: mlxtend's
# frequent-pattern functions expect a boolean frame and deprecate other dtypes
# (non-bool input triggers a DeprecationWarning and a slower code path).
data = te.fit_transform(baskets)
df = pd.DataFrame(data, columns=te.columns_)

# Render the matrix as 0/1 for readability without changing the dtype of `df`.
display(df.astype("int8"))

  from distutils.version import LooseVersion


Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


In [9]:
# Mine the itemsets present in at least 20% of the transactions, reporting
# them by product name rather than by column index, with progress output on.
apriori_options = {"min_support": 0.2, "use_colnames": True, "verbose": 1}
prior = apriori(df, **apriori_options)
display(prior)

Processing 42 combinations | Sampling itemset size 3




Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.4,(COFFEE)
4,0.3,(CORNFLAKES)
5,0.25,(MAGGI)
6,0.25,(MILK)
7,0.3,(SUGER)
8,0.35,(TEA)
9,0.2,"(BISCUIT, BREAD)"


In [10]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence is at least 0.6.
rules = association_rules(prior, metric="confidence", min_threshold=0.6)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
1,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75
