# MovieLens - Ingestion Batch et Vérification de la Qualité


# ETL Pipeline - MovieLens Dataset avec PySpark


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when


In [None]:
# 1. Spark initialisation
# getOrCreate() returns the already-active session when there is one,
# otherwise it starts a new session with the application name below.
spark = (
    SparkSession.builder
    .appName("MovieLens ETL")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/29 23:29:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Version de Spark: 3.5.1


In [None]:
# 2. Load the raw CSV files from HDFS
ratings_path = "hdfs://namenode:9000/user/movielens/raw/ratings.csv"
movies_path = "hdfs://namenode:9000/user/movielens/raw/movies.csv"

# The first line of each file is a header; column types are inferred by Spark
# from the data rather than declared here.
ratings_df = spark.read.csv(ratings_path, header=True, inferSchema=True)
movies_df = spark.read.csv(movies_path, header=True, inferSchema=True)

In [None]:
# 3. Quick exploration: print each DataFrame's schema under its own heading
for heading, frame in (("Ratings schema:", ratings_df), ("Movies schema:", movies_df)):
    print(heading)
    frame.printSchema()

Le fichier ratings.csv existe: False
Le fichier movies.csv existe: False


In [None]:
# 4. Missing-value check
def count_missing(df):
    """Count missing values per column of a Spark DataFrame.

    A value is counted as missing when it is NULL or, for float/double
    columns only, NaN. ``isnan`` is restricted to floating-point columns
    because Spark refuses it on types that cannot be implicitly cast to
    double (for example a timestamp column), which would make the whole
    select fail at analysis time.

    Args:
        df: the DataFrame to inspect.

    Returns:
        A one-row DataFrame with the same column names as ``df``; each cell
        holds the number of missing values found in that column.
    """
    floating_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

    exprs = []
    for name in df.columns:
        is_missing = col(name).isNull()
        if name in floating_cols:
            is_missing = is_missing | isnan(col(name))
        # when() yields NULL where the condition is false, and count() skips
        # NULLs, so this counts exactly the rows flagged as missing.
        exprs.append(count(when(is_missing, 1)).alias(name))
    return df.select(exprs)


print("\nValeurs nulles dans ratings:")
rating_nulls = count_missing(ratings_df)
rating_nulls.show()

print("\nValeurs nulles dans movies:")
movie_nulls = count_missing(movies_df)
movie_nulls.show()

                                                                                

In [None]:
# 5. Drop every row that contains at least one null value
ratings_df = ratings_df.na.drop()
movies_df = movies_df.na.drop()

Schéma des notes:
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)


Schéma des films:
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# 6. Remove duplicate rows (rows identical across all columns)
ratings_df = ratings_df.distinct()
movies_df = movies_df.distinct()

Aperçu des notes:
+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows


Aperçu des films:
+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      

In [None]:
# 7. Keep only ratings inside the 0.5-5.0 range (both bounds included);
#    anything outside that range is discarded.
ratings_df = ratings_df.filter((col("rating") >= 0.5) & (col("rating") <= 5.0))

# 8. Enrich each rating with the columns of its movie. The inner join drops
#    ratings whose movieId has no matching row in movies_df.
ratings_enriched_df = ratings_df.join(movies_df, on="movieId", how="inner")

# 9. Save to HDFS as Parquet, one directory per userId.
#    Rows are first shuffled by userId so that all rows of a given user sit in
#    a single Spark partition. Without this, every write task holding rows of
#    a user emits its own file into that user's directory, which multiplies
#    the number of small files on HDFS.
#    NOTE(review): partitioning on a high-cardinality key such as userId still
#    creates one directory per user — confirm downstream readers need this layout.
(
    ratings_enriched_df
    .repartition("userId")
    .write
    .mode("overwrite")
    .partitionBy("userId")
    .parquet("hdfs://namenode:9000/user/movielens/clean/ratings_by_user")
)

print("\n✅ Pipeline ETL terminé. Données sauvegardées dans /user/movielens/clean/ratings_by_user")

Statistiques sur les notes:


                                                                                

+-------+-----------------+-----------------+------------------+
|summary|           userId|          movieId|            rating|
+-------+-----------------+-----------------+------------------+
|  count|         20000263|         20000263|          20000263|
|   mean|69045.87258292554|9041.567330339605|3.5255285642993797|
| stddev| 40038.6266531621|  19789.477445413| 1.051988919294244|
|    min|                1|                1|               0.5|
|    max|           138493|           131262|               5.0|
+-------+-----------------+-----------------+------------------+


Nombre total de notes:


                                                                                

20000263

Nombre total de films:
27278

Nombre d'utilisateurs uniques:


[Stage 15:>                                                       (0 + 16) / 16]

138493


                                                                                