In [1]:
from pyspark.sql import SparkSession

# Constants: the HDFS filesystem URI and the base directory of the Parquet datasets.
HDFS_HOST = "hdfs://localhost:9000"  # also passed to Spark as spark.hadoop.fs.defaultFS
# NOTE: HDFS_PATH ends with "/", so appending "/<name>" to it yields a double slash.
HDFS_PATH = f"{HDFS_HOST}/hadoop/data/parquet/"

def create_spark_session(app_name="IMDb Analytics"):
    """
    Build (or reuse) a SparkSession carrying this notebook's settings.

    Parameters:
        app_name (str): Name given to the Spark application.

    Returns:
        SparkSession: The session returned by ``getOrCreate()``.
    """
    # Options are applied in the order listed here.
    settings = [
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "2g"),
        ("spark.hadoop.fs.defaultFS", HDFS_HOST),
        ("spark.sql.warehouse.dir", f"{HDFS_HOST}/user/hive/warehouse"),
        ("spark.executor.cores", "2"),
        ("spark.driver.cores", "2"),
        ("spark.sql.files.maxPartitionBytes", "128MB"),
        ("spark.sql.shuffle.partitions", "10"),
    ]

    session_builder = SparkSession.builder.appName(app_name)
    for option, value in settings:
        session_builder = session_builder.config(option, value)
    return session_builder.getOrCreate()

# Utility function to check the HDFS connection
def test_hdfs_connection(spark):
    """
    Check the HDFS connection by trying to read one Parquet dataset.

    The dataset ``title_basics_parquet`` under ``HDFS_PATH`` is opened and its
    schema is printed. Any exception is reported on stdout and turned into a
    ``False`` result instead of being raised.

    Parameters:
        spark (SparkSession): An initialized SparkSession.

    Returns:
        bool: True if the read succeeded, False otherwise.
    """
    try:
        # HDFS_PATH may end with "/"; drop one trailing slash so the joined
        # path does not contain an accidental "//".
        base_path = HDFS_PATH[:-1] if HDFS_PATH.endswith("/") else HDFS_PATH
        test_df = spark.read.parquet(f"{base_path}/title_basics_parquet")
        test_df.printSchema()
        return True
    except Exception as e:
        # Best-effort probe: report the problem and signal failure to the caller.
        print(f"Lỗi kết nối HDFS: {str(e)}")
        return False

In [2]:
class IMDbDataLoader:
    """Reads the IMDb Parquet datasets that live under a common base path."""

    def __init__(self, spark, base_path):
        """
        Parameters:
            spark: Object exposing ``read.parquet(path)`` (a SparkSession in this notebook).
            base_path (str): Directory that contains the ``*_parquet`` datasets.
                A single trailing "/" is tolerated.
        """
        self.spark = spark
        self.base_path = base_path

    def _read_table(self, table_name):
        """Read ``<base_path>/<table_name>`` as Parquet without doubling the "/" separator."""
        base = str(self.base_path)
        # Strip at most one trailing slash so a root such as "hdfs:///" stays valid.
        if base.endswith("/"):
            base = base[:-1]
        return self.spark.read.parquet(f"{base}/{table_name}")

    def load_titles(self):
        """Basic information about titles."""
        return self._read_table("title_basics_parquet")

    def load_ratings(self):
        """Ratings and vote counts for titles."""
        return self._read_table("title_ratings_parquet")

    def load_names(self):
        """Basic information about individuals."""
        return self._read_table("name_basics_parquet")

    def load_akas(self):
        """Alternative titles of movies or shows."""
        return self._read_table("title_akas_parquet")

    def load_episodes(self):
        """Episodes in a series."""
        return self._read_table("title_episode_parquet")

    def load_principals(self):
        """Key individuals related to a title."""
        return self._read_table("title_principals_parquet")

    def load_crews(self):
        """The creative team behind a title."""
        return self._read_table("title_crew_parquet")



In [7]:
from pyspark.sql.functions import col, explode, split, when, count, collect_set, desc

class MovieAnalyzer:
    """Aggregations over the IMDb title, crew, principals and ratings dataframes."""

    def __init__(self, movies_df, crew_df, principals_df, ratings_df):
        """
        Initialize with four dataframes:
        - movies_df: Movie information (needs ``tconst``, ``primaryTitle``, ``genres``)
        - crew_df: Crew information (needs ``directors``)
        - principals_df: Crew details (needs ``tconst``, ``nconst``, ``category``, ``job``)
        - ratings_df: Ratings information (needs ``tconst``, ``averageRating``, ``numVotes``)
        """
        self.movies_df = movies_df
        self.crew_df = crew_df
        self.principals_df = principals_df
        self.ratings_df = ratings_df

    def get_genre_distribution(self):
        """Count titles per genre, most frequent first.

        ``genres`` is split on "," so a title with several genres is counted
        once for each of them.
        """
        return self.movies_df.select(
            explode(split("genres", ",")).alias("genre")
        ).groupBy("genre").count().orderBy(desc("count"))

    def get_director_productivity(self):
        """Return the 10 director ids that appear on the most titles.

        ``directors`` is split on "," and each id is counted separately.
        """
        return self.crew_df.select(
            explode(split("directors", ",")).alias("director_id")
        ).filter(
            col("director_id").isNotNull()
        ).groupBy("director_id").count().orderBy(desc("count")).limit(10)

    def get_job_distribution(self):
        """Count principals rows per (category, job) pair, most frequent first."""
        return self.principals_df.groupBy("category", "job").count().orderBy(desc("count"))

    def get_multi_role_people(self):
        """Return the 10 people holding the largest number of distinct roles.

        Returns a dataframe with ``nconst``, ``num_roles`` (number of distinct
        ``category`` values) and ``roles`` (the set of those values). Only
        people with more than one distinct role are included.
        """
        # De-duplicate (person, role) pairs first: counting raw rows would
        # measure how many credits a person has, not how many different roles,
        # and would disagree with the size of the `roles` set.
        person_roles = self.principals_df.select("nconst", "category").distinct()
        return person_roles.groupBy("nconst").agg(
            count("category").alias("num_roles"),
            collect_set("category").alias("roles")
        ).filter(
            col("num_roles") > 1
        ).orderBy(desc("num_roles")).limit(10)

    def get_collaboration_network(self):
        """Count shared titles for every (director, producer) pair.

        Returns a dataframe with ``director_id``, ``producer_id`` and ``count``
        ordered by ``count`` descending. The result is not limited; callers
        should cap it before collecting it to the driver.
        """
        directors = self.principals_df.filter(
            col("category") == "director"
        ).select("tconst", "nconst").withColumnRenamed("nconst", "director_id")

        producers = self.principals_df.filter(
            col("category") == "producer"
        ).select("tconst", "nconst").withColumnRenamed("nconst", "producer_id")

        # Inner join on the title id pairs every director with every producer of that title.
        return directors.join(producers, "tconst").groupBy(
            "director_id", "producer_id"
        ).count().orderBy(desc("count"))

    def get_top_rated_movies(self):
        """Return the 10 titles with the highest average rating (ties broken by votes).

        Returns a dataframe with ``primaryTitle``, ``averageRating`` (double)
        and ``numVotes`` (long).
        """
        # Cast before sorting: if the Parquet columns were written as strings
        # (the titles dataset in this notebook is all-string — TODO confirm for
        # ratings), ordering would be lexicographic and "9.9" would rank above
        # "10.0". The casts are no-ops when the columns are already numeric.
        rated = self.movies_df.join(
            self.ratings_df, "tconst"
        ).select(
            "primaryTitle",
            col("averageRating").cast("double").alias("averageRating"),
            col("numVotes").cast("long").alias("numVotes"),
        )
        return rated.orderBy(
            desc("averageRating"), desc("numVotes")
        ).limit(10)


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

class MovieVisualizer:
    """Static helpers that render analysis results as seaborn bar charts."""

    @staticmethod
    def _bar_chart(spark_df, figsize, x, y, title, xlabel, ylabel, hue=None):
        """Draw one bar chart from a dataframe and return the current figure.

        The figure is created first, then the dataframe is collected with
        ``toPandas()``. When ``hue`` is given, the legend is moved outside the
        plotting area.
        """
        plt.figure(figsize=figsize)
        pdf = spark_df.toPandas()
        if hue is None:
            sns.barplot(data=pdf, x=x, y=y)
        else:
            sns.barplot(data=pdf, x=x, y=y, hue=hue)
        plt.title(title)
        plt.xticks(rotation=45, ha='right')
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if hue is not None:
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        return plt.gcf()

    @staticmethod
    def plot_genre_distribution(genre_df):
        """Bar chart of ``count`` per ``genre``."""
        return MovieVisualizer._bar_chart(
            genre_df, (12, 6), 'genre', 'count',
            "Movie Genre Distribution", "Genre", "Count",
        )

    @staticmethod
    def plot_job_distribution(job_df):
        """Bar chart of ``count`` per ``category``, coloured by ``job``."""
        return MovieVisualizer._bar_chart(
            job_df, (15, 8), 'category', 'count',
            "Job Distribution in Film Crew", "Category", "Count",
            hue='job',
        )

    @staticmethod
    def plot_director_productivity(director_df):
        """Bar chart of ``count`` per ``director_id``."""
        return MovieVisualizer._bar_chart(
            director_df, (12, 6), 'director_id', 'count',
            "Top Directors by Number of Films", "Director ID", "Number of Films",
        )

    @staticmethod
    def plot_top_rated_movies(movie_df):
        """Bar chart of ``averageRating`` per ``primaryTitle``."""
        return MovieVisualizer._bar_chart(
            movie_df, (12, 6), 'primaryTitle', 'averageRating',
            "Top Rated Movies", "Movie Title", "Average Rating",
        )


In [3]:
spark = create_spark_session()
# Read the Parquet datasets from the shared base path constant instead of a
# second hard-coded copy of the location, so the two cannot drift apart.
loader = IMDbDataLoader(spark, HDFS_PATH)


24/11/25 23:27:31 WARN Utils: Your hostname, duong291-VMware-Virtual-Platform resolves to a loopback address: 127.0.0.1; using 192.168.80.128 instead (on interface ens33)
24/11/25 23:27:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/25 23:27:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load every IMDb dataset once; later cells refer to these exact variable names.
titles_df = loader.load_titles()  # title_basics_parquet
rating_df = loader.load_ratings()  # title_ratings_parquet
names_df = loader.load_names()  # name_basics_parquet
akas_df = loader.load_akas()  # title_akas_parquet
episode_df = loader.load_episodes()  # title_episode_parquet
principal_df = loader.load_principals()  # title_principals_parquet
crew_df = loader.load_crews()  # title_crew_parquet


In [11]:
# Print a banner, the schema and a five-row preview for each loaded dataset.
schema_previews = [
    ("===================Titles Schema====================", titles_df),
    ("===================Rating Schema====================", rating_df),
    ("===================Names Schema====================", names_df),
    ("===================Akas Schema====================", akas_df),
    ("===================Episode Schema====================", episode_df),
    ("===================Principal Schema====================", principal_df),
    ("===================Crew Schema====================", crew_df),
]

for banner, frame in schema_previews:
    print(banner)
    frame.printSchema(5)
    frame.show(5)


root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|         genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+
| tt7567992|tvEpisode|       Episode #1.39|       Episode #1.39|      0|     2012|   NULL|          NULL|          Drama|
| tt3714186|tvEpisode|       Super Airship|       Super Airship|      0|     2014|   NULL|            44|    Documentary|
|tt29790156|t

In [None]:
# Run every analysis, save the charts as PNG files, then stop the Spark session.
# MovieAnalyzer, MovieVisualizer and `plt` come from the earlier cells.

# Caps applied before a result is collected to the driver with toPandas():
# both grouped results below are otherwise unbounded.
MAX_JOB_ROWS = 30
MAX_COLLABORATION_PAIRS = 100

# Initialize MovieAnalyzer
analyzer = MovieAnalyzer(titles_df, crew_df, principal_df, rating_df)

# Perform analyses (lazy: nothing is computed until a result is collected or shown)
genre_distribution = analyzer.get_genre_distribution()
director_productivity = analyzer.get_director_productivity()
job_distribution = analyzer.get_job_distribution()
multi_role_people = analyzer.get_multi_role_people()
collaboration_network = analyzer.get_collaboration_network()
top_rated_movies = analyzer.get_top_rated_movies()

# Initialize MovieVisualizer
visualizer = MovieVisualizer()

# Visualize and save the plots; close each figure once saved to release its memory
genre_plot = visualizer.plot_genre_distribution(genre_distribution)
genre_plot.savefig("genre_distribution.png")
plt.close(genre_plot)

# The result is already ordered by count, so limit() keeps the most frequent pairs.
job_plot = visualizer.plot_job_distribution(job_distribution.limit(MAX_JOB_ROWS))
job_plot.savefig("job_distribution.png")
plt.close(job_plot)

director_plot = visualizer.plot_director_productivity(director_productivity)
director_plot.savefig("director_productivity.png")
plt.close(director_plot)

top_rated_plot = visualizer.plot_top_rated_movies(top_rated_movies)
top_rated_plot.savefig("top_rated_movies.png")
plt.close(top_rated_plot)

# Scatter plot for collaboration network (strongest pairs only, marker size scales with count)
collab_df = collaboration_network.limit(MAX_COLLABORATION_PAIRS).toPandas()
plt.figure(figsize=(12, 8))
plt.scatter(collab_df['director_id'], collab_df['producer_id'], s=collab_df['count'] * 10, alpha=0.6)
plt.title("Director-Producer Collaboration Network")
plt.xlabel("Director ID")
plt.ylabel("Producer ID")
plt.tight_layout()
plt.savefig("collaboration_network.png")
plt.close()

# Print multi-role contributors for quick reference
multi_role_people.show()

# Stop Spark session
spark.stop()

print("Analysis completed. Plots saved as images.")

  plt.tight_layout()
24/11/26 00:05:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 00:05:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                