In [1]:
# Install PySpark
import sys
!{sys.executable} -m pip install pyspark



In [2]:
# Verify Installation
# Importing the package proves it is installed for this kernel; the version
# string is printed so the environment is recorded in the notebook output.
import pyspark

installed_version = pyspark.__version__
print("PySpark version:", installed_version)

PySpark version: 4.0.1


In [3]:
# Print the version of the `java` executable found on PATH.
# NOTE(review): presumably here because Spark needs a JVM to start — confirm
# which Java versions the installed PySpark release supports.
!java -version

openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment Homebrew (build 17.0.16+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.16+0, mixed mode, sharing)


In [4]:
# -----------------------------------
# 1. Import Spark and Start Session
# -----------------------------------
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when this cell is
# re-executed, so running it more than once is harmless.
spark = (
    SparkSession.builder
    .appName("MovieLensEDA_LoadInspect")
    .getOrCreate()
)

print("Spark session created!")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/02 16:57:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created!


In [9]:
# -----------------------------------
# 2. Load Ratings Data
# -----------------------------------
# File: u.data --> userId, movieId, rating, timestamp (tab-separated, no header row)
#
# The schema is declared explicitly (as a DDL string) instead of relying on
# inferSchema=True. Inference forces Spark to scan the file an extra time just
# to guess the column types, and the guessed types can silently change if the
# data does. Declaring the schema also names the columns, so no follow-up
# .toDF(...) rename is required.
RATINGS_SCHEMA = "userId INT, movieId INT, rating INT, timestamp INT"

ratings = spark.read.csv(
    "../data/u.data",
    sep="\t",
    header=False,
    schema=RATINGS_SCHEMA,
)

print("Ratings Schema:")
ratings.printSchema()

print("Sample Ratings:")
ratings.show(5)

# Notes:
# - ratings is a PySpark DataFrame with 4 integer columns:
#   userId, movieId, rating, timestamp.
# - Because the file has no header row, the column names come from
#   RATINGS_SCHEMA above.
# - With an explicit schema, a value that cannot be parsed as an integer is
#   read as null (default PERMISSIVE mode) rather than changing the column type.


Ratings Schema:
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)

Sample Ratings:
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows


In [12]:
# -----------------------------------
# 3. Load Movies Data
# -----------------------------------
# File: u.item --> movieId | title | other metadata... (pipe-separated, no header row)
#
# Two deliberate choices when reading this file:
# - encoding="ISO-8859-1": the MovieLens 100k u.item file is distributed in
#   Latin-1, not UTF-8. Reading it with Spark's default UTF-8 decoding corrupts
#   titles that contain accented characters.
# - No inferSchema: only the first two columns are kept, so inferring types for
#   every column of the file would cost an extra scan for nothing. All columns
#   are read as strings and the id is cast to an integer explicitly.
movies = spark.read.csv(
    "../data/u.item",
    sep="|",
    header=False,
    encoding="ISO-8859-1",
).selectExpr("CAST(_c0 AS INT) AS movieId", "_c1 AS title")

print("Movies Schema:")
movies.printSchema()

print("Sample Movies:")
movies.show(5)

# Notes:
# - selectExpr(...) keeps only the first two columns of the file:
#     _c0 -> cast to integer and renamed to movieId
#     _c1 -> renamed to title
# - The movies DataFrame therefore contains 2 columns only: movieId and title.

Movies Schema:
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)

Sample Movies:
+-------+-----------------+
|movieId|            title|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows


In [14]:
# -----------------------------------
# 4. Basic Dataset Stats
# -----------------------------------
# Each count() below is a Spark action, i.e. it triggers a job over the data.
num_ratings = ratings.count()
num_movies = movies.count()

# Users are not loaded from their own file here; they are counted as the
# distinct userId values that appear in the ratings.
distinct_user_ids = ratings.select("userId").distinct()
num_users = distinct_user_ids.count()

print(
    f"Total Ratings: {num_ratings}",
    f"Total Movies: {num_movies}",
    f"Total Users: {num_users}",
    sep="\n",
)

Total Ratings: 100000
Total Movies: 1682
Total Users: 943


In [15]:
# -----------------------------------
# 5. Stop Spark Session
# -----------------------------------
# Shut the session down once the analysis is finished. After this call the
# `ratings` and `movies` DataFrames can no longer be evaluated; to continue
# working, re-run the notebook from the session-creation cell onward.
spark.stop()
print("Spark session stopped.")

Spark session stopped.
