In [1]:
# Install PySpark
import sys
!{sys.executable} -m pip install pyspark



In [2]:
# Confirm that PySpark imports cleanly and report which release is installed.
import pyspark

installed_version = pyspark.__version__
print("PySpark version:", installed_version)

PySpark version: 4.0.1


In [3]:
# Print the version of the `java` executable found on PATH.
# NOTE(review): presumably a sanity check that a Java runtime is available
# before the Spark session is created in the next cell — confirm the minimum
# Java version required by the installed PySpark release.
!java -version

openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment Homebrew (build 17.0.16+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.16+0, mixed mode, sharing)


In [4]:
# -----------------------------------
# 1. Import Spark and Start Session
# -----------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc

# Configure the builder first, then obtain the session from it.
# getOrCreate() hands back an already-running session when there is one,
# which keeps this cell safe to re-run.
session_builder = SparkSession.builder.appName("MovieLensEDA_CombineData")
spark = session_builder.getOrCreate()

print("Spark session created!")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/06 13:51:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created!


In [6]:
# -----------------------------------
# 2. Load Ratings and Movies Data
# -----------------------------------
# u.data: tab-separated with no header row, so the four columns are named
# positionally via toDF().
ratings = spark.read.csv("../data/u.data", sep="\t", header=False, inferSchema=True) \
    .toDF("userId", "movieId", "rating", "timestamp")

# u.item: pipe-separated with no header row; only the first two columns
# (movie id and title) are kept.
# The MovieLens 100K item file is ISO-8859-1 encoded, not UTF-8. Spark's CSV
# reader decodes as UTF-8 unless told otherwise, which corrupts titles that
# contain accented characters, so the encoding is stated explicitly.
movies = spark.read.csv(
    "../data/u.item",
    sep="|",
    header=False,
    inferSchema=True,
    encoding="ISO-8859-1",
).selectExpr("_c0 as movieId", "_c1 as title")

print("Ratings Sample:")
ratings.show(5)

print("Movies Sample:")
movies.show(5)


Ratings Sample:
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows
Movies Sample:
+-------+-----------------+
|movieId|            title|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows


In [7]:
# Attach a title to every rating by matching on the shared movieId column.
# An inner join keeps only ratings whose movieId also appears in `movies`.
ratings_with_titles = ratings.join(movies, "movieId", "inner")
ratings_with_titles.show(5)

+-------+------+------+---------+--------------------+
|movieId|userId|rating|timestamp|               title|
+-------+------+------+---------+--------------------+
|    242|   196|     3|881250949|        Kolya (1996)|
|    302|   186|     3|891717742|L.A. Confidential...|
|    377|    22|     1|878887116| Heavyweights (1994)|
|     51|   244|     2|880606923|Legends of the Fa...|
|    346|   166|     1|886397596| Jackie Brown (1997)|
+-------+------+------+---------+--------------------+
only showing top 5 rows


In [8]:
# Per-movie summary on the joined data: mean rating and number of ratings,
# listed from the most-rated movie downwards.
mean_rating_expr = avg("rating").alias("avg_rating")
rating_count_expr = count("rating").alias("num_ratings")

avg_rating = (
    ratings_with_titles
    .groupBy("movieId", "title")
    .agg(mean_rating_expr, rating_count_expr)
    .orderBy(col("num_ratings").desc())
)

avg_rating.show(10)

+-------+--------------------+------------------+-----------+
|movieId|               title|        avg_rating|num_ratings|
+-------+--------------------+------------------+-----------+
|     50|    Star Wars (1977)|4.3584905660377355|        583|
|    258|      Contact (1997)|3.8035363457760316|        509|
|    100|        Fargo (1996)| 4.155511811023622|        508|
|    181|Return of the Jed...| 4.007889546351085|        507|
|    294|    Liar Liar (1997)| 3.156701030927835|        485|
|    286|English Patient, ...| 3.656964656964657|        481|
|    288|       Scream (1996)|3.4414225941422596|        478|
|      1|    Toy Story (1995)|3.8783185840707963|        452|
|    300|Air Force One (1997)|3.6310904872389793|        431|
|    121|Independence Day ...| 3.438228438228438|        429|
+-------+--------------------+------------------+-----------+
only showing top 10 rows


In [9]:
# Rank users by how many ratings they submitted, most active first.
ratings_per_user_expr = count("rating").alias("num_ratings")

top_users = (
    ratings
    .groupBy("userId")
    .agg(ratings_per_user_expr)
    .sort("num_ratings", ascending=False)
)

top_users.show(10)

+------+-----------+
|userId|num_ratings|
+------+-----------+
|   405|        737|
|   655|        685|
|    13|        636|
|   450|        540|
|   276|        518|
|   416|        493|
|   537|        490|
|   303|        484|
|   234|        480|
|   393|        448|
+------+-----------+
only showing top 10 rows


In [10]:
# Stop Spark Session
# Final teardown step: calls stop() on the session created earlier in the
# notebook. DataFrames defined above (ratings, movies, avg_rating, ...) are
# bound to that session, so re-run the session-creation cell before
# evaluating them again.
spark.stop()
print("Spark session stopped.")

Spark session stopped.
