# MovieLens - Ingestion Batch et Vérification de la Qualité


## 1. Initialisation de Spark


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("MovieLens Data Ingestion")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs to keep the cell outputs readable
spark.sparkContext.setLogLevel("ERROR")

print(f"Version de Spark: {spark.version}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/29 23:29:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Version de Spark: 3.5.1


In [None]:
## 2. Preliminary HDFS check
# Listing the HDFS root confirms the file system is reachable before uploading.

!hdfs dfs -ls /

## 3. Create the destination directory in HDFS
# -p creates missing parent directories and does not fail if the path exists.

!hdfs dfs -mkdir -p /user/movielens/raw
!hdfs dfs -ls /user/movielens

## 4. Upload local → HDFS (source files are read from /data below)
# Replace the paths below with the actual path inside the Docker container if it differs.
# -f overwrites the destination file if it is already present in HDFS.

!hdfs dfs -put -f /data/ratings.csv /user/movielens/raw/
!hdfs dfs -put -f /data/movies.csv /user/movielens/raw/

In [None]:
## 5. Vérification post-upload

print("\nContenu de /user/movielens/raw dans HDFS:")
!hdfs dfs -ls /user/movielens/raw

## 6. Lecture avec Spark (vérification d'intégrité)

ratings_path = "hdfs://namenode:9000/user/movielens/raw/ratings.csv"
movies_path = "hdfs://namenode:9000/user/movielens/raw/movies.csv"


## 2. Chargement des données depuis HDFS


In [None]:
# Load both CSV files; the first line is a header and column types are inferred
ratings_df = spark.read.csv(ratings_path, header=True, inferSchema=True)
movies_df = spark.read.csv(movies_path, header=True, inferSchema=True)

## 3. Exploration des données


In [6]:
# Print the schema of each dataset under its own heading
for heading, frame in (
    ("Schéma des notes:", ratings_df),
    ("\nSchéma des films:", movies_df),
):
    print(heading)
    frame.printSchema()

Schéma des notes:
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)


Schéma des films:
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [7]:
# Preview the first five rows of each dataset
print("Aperçu des notes:")
ratings_df.show(n=5)

print("\nAperçu des films:")
# truncate=False prints the full title and genres strings
movies_df.show(n=5, truncate=False)

Aperçu des notes:
+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows


Aperçu des films:
+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      

In [8]:
# Descriptive statistics followed by basic row and user counts
print("Statistiques sur les notes:")
ratings_summary = ratings_df.describe()
ratings_summary.show()

print("\nNombre total de notes:")
total_ratings = ratings_df.count()
print(total_ratings)

print("\nNombre total de films:")
total_movies = movies_df.count()
print(total_movies)

print("\nNombre d'utilisateurs uniques:")
unique_users = ratings_df.select("userId").distinct()
print(unique_users.count())

Statistiques sur les notes:


                                                                                

+-------+-----------------+-----------------+------------------+
|summary|           userId|          movieId|            rating|
+-------+-----------------+-----------------+------------------+
|  count|         20000263|         20000263|          20000263|
|   mean|69045.87258292554|9041.567330339605|3.5255285642993797|
| stddev| 40038.6266531621|  19789.477445413| 1.051988919294244|
|    min|                1|                1|               0.5|
|    max|           138493|           131262|               5.0|
+-------+-----------------+-----------------+------------------+


Nombre total de notes:


                                                                                

20000263

Nombre total de films:
27278

Nombre d'utilisateurs uniques:


[Stage 15:>                                                       (0 + 16) / 16]

138493


                                                                                

## 4. Vérification de la qualité des données


In [9]:
# Helper that counts missing values per column
def count_nulls(df):
    """Count the missing values in every column of a Spark DataFrame.

    A value counts as missing when it is NULL; for ``double`` and ``float``
    columns a NaN value counts as missing too.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A DataFrame produced by a single aggregation, with one column per
        column of ``df`` (same names) holding that column's missing-value count.
    """
    null_counts = []
    for col_name, dtype in df.dtypes:
        # Backtick-quote the name so a column whose name contains dots (or
        # other special characters) is resolved literally instead of being
        # parsed as nested-field access. Embedded backticks are doubled.
        column = col("`{}`".format(col_name.replace("`", "``")))
        condition = column.isNull()
        if dtype in ("double", "float"):
            # NaN is a distinct non-NULL value in floating-point columns
            condition = condition | isnan(column)
        # when() without otherwise() yields NULL for non-matching rows, and
        # count() ignores NULLs, so only the rows meeting the condition count.
        null_counts.append(count(when(condition, col_name)).alias(col_name))
    return df.select(null_counts)

In [10]:
# Report the per-column missing-value counts for each dataset
print("Valeurs manquantes dans les notes:")
ratings_nulls = count_nulls(ratings_df)
ratings_nulls.show()

print("\nValeurs manquantes dans les films:")
movies_nulls = count_nulls(movies_df)
movies_nulls.show()


Valeurs manquantes dans les notes:


                                                                                

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     0|      0|     0|        0|
+------+-------+------+---------+


Valeurs manquantes dans les films:
+-------+-----+------+
|movieId|title|genres|
+-------+-----+------+
|      0|    0|     0|
+-------+-----+------+



In [11]:
# Duplicate check: compare the total row count with the number of distinct rows
ratings_count = ratings_df.count()
ratings_distinct_count = ratings_df.dropDuplicates().count()
print(
    f"Nombre de lignes dans ratings: {ratings_count}",
    f"Nombre de lignes distinctes dans ratings: {ratings_distinct_count}",
    f"Nombre de doublons dans ratings: {ratings_count - ratings_distinct_count}",
    sep="\n",
)

movies_count = movies_df.count()
movies_distinct_count = movies_df.dropDuplicates().count()
print(
    f"Nombre de lignes dans movies: {movies_count}",
    f"Nombre de lignes distinctes dans movies: {movies_distinct_count}",
    f"Nombre de doublons dans movies: {movies_count - movies_distinct_count}",
    sep="\n",
)

25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/29 23:29:33 WARN RowBasedKeyValueBatch: Calling spill() on

Nombre de lignes dans ratings: 20000263
Nombre de lignes distinctes dans ratings: 20000263
Nombre de doublons dans ratings: 0
Nombre de lignes dans movies: 27278
Nombre de lignes distinctes dans movies: 27278
Nombre de doublons dans movies: 0


## 5. Ingestion des données dans HDFS


In [12]:
# Check that HDFS is up and reachable by listing its root directory
!hdfs dfs -ls /

log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Found 2 items
drwxrwx---   - root supergroup          0 2025-04-29 14:38 /tmp
drwxr-xr-x   - root supergroup          0 2025-04-29 23:24 /user


In [13]:
# Création des répertoires dans HDFS
!hdfs dfs -mkdir -p /user/movielens/raw
!hdfs dfs -m

log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
-m: Unknown command
Usage: hadoop fs [generic options]
	[-appendToFile [-n] <localsrc> ... <dst>]
	[-cat [-ignoreCrc] <src> ...]
	[-checksum [-v] <src> ...]
	[-chgrp [-R] GROUP PATH...]
	[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
	[-chown [-R] [OWNER][:[GROUP]] PATH...]
	[-concat <target path> <src path> <src path> ...]
	[-copyFromLocal [-f] [-p] [-l] [-d] [-t <thread count>] [-q <thread pool queue size>] <localsrc> ... <dst>]
	[-copyToLocal [-f] [-p] [-crc] [-ignoreCrc] [-t <thread count>] [-q <thread pool queue size>] <src> ... <localdst>]


In [14]:
# Copie des fichiers CSV vers HDFS
!hdfs dfs -put -f {ratings_file} /user/movielens/raw/
!hdfs dfs -put -f {movies_file} /user/movielens/raw/

log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
put: `../data/movielens/ratings.csv': No such file or directory
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
put: `../data/movielens/movies.csv': No such file or directory


In [15]:
# Check which files are present in the HDFS target directory
!hdfs dfs -ls /user/movielens/raw

log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.


## 6. Vérification des données dans HDFS


In [16]:
# Read the datasets back from the HDFS directory this notebook uploads to
# (/user/movielens/raw), so the check covers the files that were just ingested
# rather than a copy stored at another HDFS location.
ratings_hdfs = spark.read.csv("hdfs://namenode:9000/user/movielens/raw/ratings.csv", header=True, inferSchema=True)
movies_hdfs = spark.read.csv("hdfs://namenode:9000/user/movielens/raw/movies.csv", header=True, inferSchema=True)

                                                                                

In [17]:
# Row counts of the datasets read back from HDFS
print(
    f"Nombre de lignes dans ratings (HDFS): {ratings_hdfs.count()}",
    f"Nombre de lignes dans movies (HDFS): {movies_hdfs.count()}",
    sep="\n",
)

Nombre de lignes dans ratings (HDFS): 20000263
Nombre de lignes dans movies (HDFS): 27278


                                                                                

## 7. Visualisation de la distribution des notes

In [None]:
# Keep only the rating column and convert it to a pandas DataFrame for plotting
ratings_pandas = ratings_df.select(col("rating")).toPandas()

                                                                                

In [None]:
# Histogram of the rating values with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(ratings_pandas["rating"], bins=10, kde=True, ax=ax)
ax.set_title('Distribution des notes')
ax.set_xlabel('Note')
ax.set_ylabel('Fréquence')
plt.show()

In [None]:
# Stop the Spark session now that the notebook no longer needs it
spark.stop()