In [2]:
# Install PySpark
import sys
!{sys.executable} -m pip install pyspark



In [3]:
# Sanity check: the import must succeed, and the reported version tells us
# which PySpark release the kernel actually picked up.
import pyspark

installed_version = pyspark.__version__
print("PySpark version:", installed_version)

PySpark version: 4.0.1


In [4]:
# Print the version of the `java` executable found on PATH (shell escape).
!java -version

openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment Homebrew (build 17.0.16+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.16+0, mixed mode, sharing)


In [1]:
# -----------------------------------
# 1. Import Spark and Start Session
# -----------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc

# Configure the application name and obtain the session used by every
# later cell in this notebook.
spark = (
    SparkSession.builder
    .appName("MovieLensEDA_BasicAggregations")
    .getOrCreate()
)

print("Spark session created!")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/02 17:15:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created!


In [5]:
# -----------------------------------
# 2. Load Ratings and Movies Data
# -----------------------------------
# u.data: tab-separated, no header row. Column types are inferred and the
# four columns are named explicitly after loading.
ratings = (
    spark.read
    .csv("../data/u.data", sep="\t", header=False, inferSchema=True)
    .toDF("userId", "movieId", "rating", "timestamp")
)

# u.item: pipe-separated, no header row. Only the first two columns
# (movie id and title) are kept.
# The MovieLens 100K u.item file is distributed as ISO-8859-1 (Latin-1), while
# Spark's CSV reader decodes as UTF-8 by default; without an explicit encoding,
# titles containing accented characters come out garbled.
# NOTE(review): if ../data/u.item has been re-encoded to UTF-8 locally, drop
# the encoding argument.
movies = (
    spark.read
    .csv(
        "../data/u.item",
        sep="|",
        header=False,
        inferSchema=True,
        encoding="ISO-8859-1",
    )
    .selectExpr("_c0 as movieId", "_c1 as title")
)

print("Ratings Sample:")
ratings.show(5)

print("Movies Sample:")
movies.show(5)

Ratings Sample:
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows
Movies Sample:
+-------+-----------------+
|movieId|            title|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows


In [6]:
# How many ratings each movie received, most-rated movies first.
ratings_per_movie = (
    ratings
    .groupBy("movieId")
    .agg(count(col("rating")).alias("numRatings"))
    .orderBy(col("numRatings").desc())
)

ratings_per_movie.show(n=10)
                                    

+-------+----------+
|movieId|numRatings|
+-------+----------+
|     50|       583|
|    258|       509|
|    100|       508|
|    181|       507|
|    294|       485|
|    286|       481|
|    288|       478|
|      1|       452|
|    300|       431|
|    121|       429|
+-------+----------+
only showing top 10 rows


In [10]:
# Mean rating of each movie, highest average first.
# NOTE(review): no minimum number of ratings is required here, so a movie
# with very few ratings can rank at the top with a perfect average.
avg_rating_per_movie = (
    ratings
    .groupBy("movieId")
    .agg(avg(col("rating")).alias("avgRatings"))
    .orderBy(col("avgRatings").desc())
)

avg_rating_per_movie.show(n=10)

+-------+----------+
|movieId|avgRatings|
+-------+----------+
|   1599|       5.0|
|   1500|       5.0|
|   1201|       5.0|
|   1653|       5.0|
|   1122|       5.0|
|   1467|       5.0|
|   1189|       5.0|
|   1293|       5.0|
|   1536|       5.0|
|    814|       5.0|
+-------+----------+
only showing top 10 rows


In [11]:
# Attach titles to the per-movie rating counts (inner join on movieId) and
# list the most-rated movies first.
top_movies = (
    ratings_per_movie
    .join(movies, on="movieId", how="inner")
    .select("movieId", "numRatings", "title")
    .orderBy(col("numRatings").desc())
)

top_movies.show(n=10)


+-------+----------+--------------------+
|movieId|numRatings|               title|
+-------+----------+--------------------+
|     50|       583|    Star Wars (1977)|
|    258|       509|      Contact (1997)|
|    100|       508|        Fargo (1996)|
|    181|       507|Return of the Jed...|
|    294|       485|    Liar Liar (1997)|
|    286|       481|English Patient, ...|
|    288|       478|       Scream (1996)|
|      1|       452|    Toy Story (1995)|
|    300|       431|Air Force One (1997)|
|    121|       429|Independence Day ...|
+-------+----------+--------------------+
only showing top 10 rows


In [12]:
# How many ratings each user submitted, most active users first.
ratings_per_user = (
    ratings
    .groupBy("userId")
    .agg(count(col("rating")).alias("numRatings"))
    .orderBy(col("numRatings").desc())
)

ratings_per_user.show(n=10)

+------+----------+
|userId|numRatings|
+------+----------+
|   405|       737|
|   655|       685|
|    13|       636|
|   450|       540|
|   276|       518|
|   416|       493|
|   537|       490|
|   303|       484|
|   234|       480|
|   393|       448|
+------+----------+
only showing top 10 rows


In [13]:
# Stop the Spark session created earlier in this notebook. Cells that use
# `spark` must be re-run from the session-creation cell after this point.
spark.stop()
print("Spark session stopped.")

Spark session stopped.
