# LAB 0 -B PYSPARK





In [23]:
!pip install -q pyspark

In [24]:

from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session: local mode on all available
# cores, with the console progress bar switched off to keep cell output clean.
spark = (
    SparkSession.builder
    .master("local[*]")  # local mode, as many worker threads as logical cores
    .appName("Lab0-OnlineRetail-Warmup")
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()  # hands back the active session when one already exists
)

In [25]:
# List the working directory (long format, human-readable sizes) to check
# which data files are present before loading anything.
!ls -lh


total 51M
-rw-r--r-- 1 root root 7.2M Nov 20 17:35 archive.zip
-rw-r--r-- 1 root root  44M Sep 21  2019 OnlineRetail.csv
drwxr-xr-x 1 root root 4.0K Nov 17 14:29 sample_data


In [26]:
# Extract the dataset archive; -o overwrites existing files without prompting,
# so this cell can be re-run safely.
!unzip -o archive.zip


Archive:  archive.zip
  inflating: OnlineRetail.csv        


In [27]:
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType,
    TimestampType, FloatType
)

In [28]:


# Explicit schema for OnlineRetail.csv. Columns are declared in file order and
# every one of them is nullable (third argument of add()).
online_retail_schema = (
    StructType()
    .add("InvoiceNo", StringType(), True)
    .add("StockCode", StringType(), True)
    .add("Description", StringType(), True)
    .add("Quantity", IntegerType(), True)
    .add("InvoiceDate", TimestampType(), True)
    .add("UnitPrice", FloatType(), True)
    .add("CustomerId", IntegerType(), True)
    .add("Country", StringType(), True)
)



In [29]:
# Load the CSV with the explicit schema rather than letting Spark infer types.
df = (
    spark.read
    .options(
        header="true",                   # the file's first line holds the column names
        timestampFormat="M/d/yyyy H:m",  # date pattern used by the InvoiceDate column
    )
    .schema(online_retail_schema)        # enforce the declared schema
    .csv("OnlineRetail.csv")             # path to the data file
)

In [30]:
# Preview the first five rows (long string values are truncated by default).
df.show(n=5)


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerId|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



## Exercise 1 — Show 5 descriptions

In [31]:
# Print the first five product descriptions in full (no truncation).
(
    df.select("Description")
    .show(n=5, truncate=False)
)


+-----------------------------------+
|Description                        |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
+-----------------------------------+
only showing top 5 rows



## Exercise 2 — Count distinct invoices

In [32]:
from pyspark.sql.functions import countDistinct

# Method 1: deduplicate the InvoiceNo column, then count the remaining rows.
distinct_1 = (
    df.select("InvoiceNo")
    .distinct()
    .count()
)
print("Distinct invoices (method 1):", distinct_1)

# Method 2: let the countDistinct() aggregate do the work and read the single
# value out of the one-row result.
distinct_2 = df.agg(countDistinct("InvoiceNo")).first()[0]
print("Distinct invoices (method 2):", distinct_2)


Distinct invoices (method 1): 25900
Distinct invoices (method 2): 25900


## Exercise 3 — Month with most invoices

In [33]:
from pyspark.sql.functions import month, countDistinct

# Count distinct invoices per month number and rank the months, busiest first.
# NOTE(review): month() keeps only the month-of-year number, so rows from the
# same month in different years share one group — confirm that is intended.
df_month = (
    df.groupBy(month("InvoiceDate").alias("InvoiceMonth"))
    .agg(countDistinct("InvoiceNo").alias("NumInvoices"))
    .sort("NumInvoices", ascending=False)
)

# Only the top-ranked month is displayed.
df_month.show(n=1)


+------------+-----------+
|InvoiceMonth|NumInvoices|
+------------+-----------+
|          11|       3462|
+------------+-----------+
only showing top 1 row



## Exercise 4 — Filter Quantity > 30

In [34]:
from pyspark.sql.functions import col

# Show the first 10 order lines whose Quantity is strictly greater than 30,
# printing every cell value in full.
(
    df.where(col("Quantity") > 30)
    .show(n=10, truncate=False)
)


+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerId|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT      |32      |2010-12-01 08:34:00|1.69     |13047     |United Kingdom|
|536370   |10002    |INFLATABLE POLITICAL GLOBE         |48      |2010-12-01 08:45:00|0.85     |12583     |France        |
|536370   |22492    |MINI PAINT SET VINTAGE             |36      |2010-12-01 08:45:00|0.65     |12583     |France        |
|536371   |22086    |PAPER CHAIN KIT 50'S CHRISTMAS     |80      |2010-12-01 09:00:00|2.55     |13748     |United Kingdom|
|536374   |21258    |VICTORIAN SEWING BOX LARGE         |32      |2010-12-01 09:09:00|10.95    |15100     |United Kingdom|
|536376   |22114

## Exercise 5 — Top 4 most sold items

In [35]:
from pyspark.sql.functions import sum as spark_sum, desc

# Sum the quantities per product description and keep the four largest totals.
df_items = (
    df.groupBy("Description")
    .agg(spark_sum("Quantity").alias("TotalQuantity"))
    .sort(desc("TotalQuantity"))  # largest total first
    .limit(4)
)

# Print the ranking without truncating the descriptions.
df_items.show(truncate=False)


+---------------------------------+-------------+
|Description                      |TotalQuantity|
+---------------------------------+-------------+
|WORLD WAR 2 GLIDERS ASSTD DESIGNS|53847        |
|JUMBO BAG RED RETROSPOT          |47363        |
|ASSORTED COLOUR BIRD ORNAMENT    |36381        |
|POPCORN HOLDER                   |36334        |
+---------------------------------+-------------+



# count() vs countDistinct() with null values

count(column) ignores NULL values.
It only counts rows where the column is NOT NULL.

distinct() keeps NULL as a possible value.
A column with NULL will return one NULL in distinct().

countDistinct(column) counts all distinct non-null values,
but does NOT count NULL as a value.

Therefore:

- `df.select("col").distinct().count()` counts NULL as one distinct value.
- `countDistinct("col")` does NOT count NULL.

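This difference explains why the two counts can differ: when the column contains at least one NULL, `distinct().count()` is exactly one higher than `countDistinct()`. In Exercise 2 both methods return 25900, so `InvoiceNo` contains no NULL values.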

In [36]:
# Stop the Spark session now that the lab is finished.
spark.stop()
