In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, format_number, when, trim

# Folder holding the three cleaned input files. Set the DS200_LAB1_DIR
# environment variable to read from another location; the original path is
# kept as the default so existing runs behave as before.
base_path = os.environ.get(
    "DS200_LAB1_DIR",
    r'G:\.shortcut-targets-by-id\1eDKYURr-Qm222Ul6PoZLc4b99cuwHTC3\Cường\Đại học\DS200\Lab 1',
)

spark = (
    SparkSession.builder
    .appName("DS200 Lab1 - Bai 4")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)


def _read_headerless_csv(file_name, *column_names):
    """Load a header-less CSV file from ``base_path`` and name its columns.

    Args:
        file_name: Name of the file inside ``base_path``.
        *column_names: Names to assign to the columns, in file order.

    Returns:
        A DataFrame whose columns carry the given names. No schema is
        supplied to the reader, so the callers below cast each column to the
        type they need.
    """
    return (
        spark.read.format("csv")
        .option("header", "false")
        .load(os.path.join(base_path, file_name))
        .toDF(*column_names)
    )


try:
    movies_temp_df = _read_headerless_csv(
        "movies.cleaned.txt", "MovieID_str", "Title_str", "Genres_str"
    )
    movies_df = movies_temp_df.select(
        col("MovieID_str").cast("int").alias("MovieID"),
        # Titles come out of the file with a leading space (visible in the
        # displayed result table), so strip it the same way Gender is handled.
        trim(col("Title_str")).alias("Title"),
        col("Genres_str").alias("Genres"),
    )

    ratings_temp_df = _read_headerless_csv(
        "ratings.cleaned.txt", "UserID_str", "MovieID_str", "Rating_str", "Timestamp_str"
    )
    ratings_df = ratings_temp_df.select(
        col("UserID_str").cast("int").alias("UserID"),
        col("MovieID_str").cast("int").alias("MovieID"),
        col("Rating_str").cast("float").alias("Rating"),
        col("Timestamp_str").cast("long").alias("Timestamp"),
    )

    users_temp_df = _read_headerless_csv(
        "users.cleaned.txt", "UserID_str", "Gender_str", "Age_str", "Occupation_str", "ZipCode_str"
    )
    users_df = users_temp_df.select(
        col("UserID_str").cast("int").alias("UserID"),
        trim(col("Gender_str")).alias("Gender"),
        col("Age_str").cast("int").alias("Age"),
        col("Occupation_str").cast("int").alias("Occupation"),
        col("ZipCode_str").alias("ZipCode"),
    )

except Exception as e:
    print(f"\nĐÃ CÓ LỖI XẢY RA KHI TẢI DỮ LIỆU: {e}")
    spark.stop()
    # Re-raise rather than calling exit(): the notebook run still halts here,
    # but the traceback stays visible and the session is not torn down.
    raise

# Join ratings với users để lấy thông tin tuổi

In [2]:
# Attach each rating to the profile of the user who gave it; only ratings
# whose UserID also appears in users_df are kept (inner join).
ratings_with_age_df = ratings_df.join(users_df, on="UserID", how="inner")

# Tạo cột AgeGroup dựa trên điều kiện tuổi

In [6]:
from pyspark.sql.functions import when, col

# Age-bucket labels in display order. The same strings are used as pivot
# values and as column names further down, so they must stay exactly as is.
age_groups = ["0-18", "18-35", "35-50", "50+"]

# Upper bounds are inclusive: 18 -> "0-18", 35 -> "18-35", 50 -> "35-50".
# `when` branches are tried in order, so each branch only needs its upper bound.
# The last bucket is an explicit `Age > 50` test instead of `.otherwise(...)`:
# a NULL Age (for example a value that failed the int cast) fails every
# comparison and would otherwise be counted as "50+". With no `.otherwise`,
# such rows get a NULL AgeGroup, which matches none of the labels above.
ratings_with_age_group_df = ratings_with_age_df.withColumn(
    "AgeGroup",
    when(col("Age") <= 18, age_groups[0])
    .when(col("Age") <= 35, age_groups[1])
    .when(col("Age") <= 50, age_groups[2])
    .when(col("Age") > 50, age_groups[3]),
)


# Tính avg rating

In [7]:
# One row per movie and one column per age bucket, each cell holding the mean
# rating given by that bucket. Passing the bucket list to pivot fixes the set
# and order of the output columns.
age_stats_df = (
    ratings_with_age_group_df
    .groupBy("MovieID")
    .pivot("AgeGroup", values=age_groups)
    .agg(avg(col("Rating")))
)


# Join với movies_df để lấy tên phim

In [None]:
# Bring in the movie metadata (Title, Genres) for every MovieID that has
# rating statistics; the join is an inner join on MovieID.
final_df = age_stats_df.join(movies_df, on="MovieID", how="inner")

# Hiển thị kết quả

In [11]:
# Print up to 200 movies sorted by title, with each age bucket's average
# rating formatted to two decimals. format_number produces string columns, so
# the string fill value "NA" replaces the buckets that have no ratings.
final_df.select(
    "Title",
    *[
        format_number(col(bucket), 2).alias(bucket)
        for bucket in ("0-18", "18-35", "35-50", "50+")
    ],
).sort("Title").fillna("NA").show(200, truncate=False)



+---------------------------------------------------------+----+-----+-----+---+
|Title                                                    |0-18|18-35|35-50|50+|
+---------------------------------------------------------+----+-----+-----+---+
| American Beauty (1999)                                  |NA  |3.75 |NA   |NA |
| Avatar (2009)                                           |NA  |4.50 |5.00 |NA |
| Back to the Future (1985)                               |NA  |4.00 |NA   |NA |
| Birdman (2014)                                          |NA  |3.00 |NA   |NA |
| Braveheart (1995)                                       |NA  |4.00 |4.50 |NA |
| Coco (2017)                                             |NA  |4.25 |NA   |NA |
| Dunkirk (2017)                                          |NA  |3.50 |NA   |NA |
| Fight Club (1999)                                       |NA  |4.25 |NA   |NA |
| Forrest Gump (1994)                                     |NA  |3.75 |NA   |NA |
| Gladiator (2000)          

In [12]:
# Stop the Spark session now that the results have been displayed.
spark.stop()
