In [1]:
# Installer PySpark
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

In [3]:
# Create (or reuse, if one is already running) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("TD2")
    .getOrCreate()
)

# Exercice 1

## 1 - Charger le dataset des transactions

In [4]:
# Read the transactions CSV: the first row holds the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data td1.csv")
)

In [5]:
# Preview the first 10 rows, converted to pandas for rich display.
(
    df
    .limit(10)
    .toPandas()
)

Unnamed: 0,transaction_id,customer_id,product_id,product_categorie,boutique,amount,date
0,1,1662,752,Catégorie_50,Boutique_1,50.96,2024-01-14
1,2,2722,386,Catégorie_37,Boutique_21,182.74,2024-02-07
2,3,2298,809,Catégorie_89,Boutique_30,217.6,2023-12-06
3,4,1398,953,Catégorie_33,Boutique_41,401.24,2024-06-17
4,5,4290,90,Catégorie_73,Boutique_6,5.86,2024-06-12
5,6,98,305,Catégorie_57,Boutique_25,444.04,2024-05-05
6,7,1166,737,Catégorie_77,Boutique_47,15.18,2023-10-05
7,8,4322,116,Catégorie_50,Boutique_35,129.79,2024-03-27
8,9,3391,411,Catégorie_42,Boutique_18,439.39,2024-09-05
9,10,2154,537,Catégorie_27,Boutique_27,345.86,2024-09-08


## 2 - Calculer le panier moyen de chaque client

In [7]:
# Average basket per customer: total spend divided by the number of distinct
# transactions. Only the 10 customers with the highest average are displayed.
(
    df
    .groupby("customer_id")
    .agg(
        # The loaded dataset stores the spend in "amount"; it has no "ca" column,
        # so summing "ca" fails on a fresh kernel.
        F.sum("amount").alias("ca_global"),
        F.count_distinct("transaction_id").alias("nb_transactions"),
    )
    .withColumn("panier_moyen", F.col("ca_global") / F.col("nb_transactions"))
    .orderBy(F.desc("panier_moyen"))
    .limit(10)
    .toPandas()
)

Unnamed: 0,customer_id,ca_global,nb_transactions,panier_moyen
0,28,6573.72,18,365.206667
1,3613,2916.84,8,364.605
2,3086,4716.99,13,362.845385
3,841,3966.94,11,360.630909
4,389,5401.1,15,360.073333
5,89,2514.39,7,359.198571
6,2755,5342.08,15,356.138667
7,4898,4980.28,14,355.734286
8,2989,7061.59,20,353.0795
9,4281,5555.73,16,347.233125


## 3 - Calculer la récence de chaque client (nombre de jours depuis le premier achat) par rapport à la date maximale

In [8]:
# Reference date: the most recent value of "date" in the whole dataset.
max_date = df.select(F.max("date")).first()[0]

# For each customer, take the earliest transaction date and count the days
# between it and the reference date. Only 10 rows are displayed.
(
    df
    .groupBy("customer_id")
    .agg(F.min("date").alias("date_premiere_transaction"))
    .withColumn(
        "recence",
        F.date_diff(F.lit(max_date), F.col("date_premiere_transaction")),
    )
    .limit(10)
    .toPandas()
)

Unnamed: 0,customer_id,date_premiere_transaction,recence
0,2122,2023-10-25,329
1,1238,2023-09-23,361
2,471,2023-09-24,360
3,1342,2023-11-19,304
4,1645,2023-10-05,349
5,148,2023-10-04,350
6,2366,2023-11-05,318
7,1088,2023-10-15,339
8,1959,2023-09-23,361
9,4935,2023-09-30,354


## 4 - Proposer un ranking des boutiques en termes de CA

In [9]:
# Rank shops by total revenue (rank 1 = highest revenue) and show the top 10.
(
    df
    .groupby("boutique")
    .agg(
        F.sum("amount").alias("ca")
    )
    # Global window (no partitionBy): every shop is ranked against all others.
    # Spark moves the rows to a single partition for this, which is acceptable
    # here because the frame has already been reduced to one row per shop.
    .withColumn("rank", F.rank().over(Window.orderBy(F.desc("ca"))))
    # Sort explicitly before limiting: without it, which 10 rows limit() keeps
    # depends on the physical plan rather than on the rank.
    .orderBy("rank")
    .limit(10)
    .toPandas()
)

Unnamed: 0,boutique,ca,rank
0,Boutique_21,537952.95,1
1,Boutique_29,531017.84,2
2,Boutique_4,525000.14,3
3,Boutique_25,524613.92,4
4,Boutique_1,520827.85,5
5,Boutique_10,520737.08,6
6,Boutique_8,519280.26,7
7,Boutique_31,519062.49,8
8,Boutique_26,518502.26,9
9,Boutique_42,518028.63,10


## 5 - Calculer le pourcentage de CA généré par chaque boutique

In [11]:
# Total revenue over every transaction, pulled back to the driver as a scalar.
ca_global = df.select(F.sum("amount")).first()[0]

# Revenue per shop and its share of the total. Only 10 rows are displayed.
# NOTE(review): "pct_ca" is a fraction of the total (ca / ca_global), not a value
# multiplied by 100 — confirm whether a 0-100 percentage is expected.
(
    df
    .groupBy("boutique")
    .agg(F.sum("amount").alias("ca"))
    .withColumn(
        "pct_ca",
        F.col("ca") / F.lit(ca_global),
    )
    .limit(10)
    .toPandas()
)

Unnamed: 0,boutique,ca,pct_ca
0,Boutique_25,524613.92,0.020771
1,Boutique_16,497913.4,0.019714
2,Boutique_7,489332.47,0.019374
3,Boutique_37,510788.85,0.020224
4,Boutique_21,537952.95,0.021299
5,Boutique_23,491407.52,0.019457
6,Boutique_49,496398.46,0.019654
7,Boutique_30,494785.44,0.01959
8,Boutique_15,496594.08,0.019662
9,Boutique_38,491386.78,0.019456
