In [0]:
from pyspark.sql.functions import *

### Tiền xử lý và chuẩn hóa dữ liệu trước khi đưa vào lớp Silver

In [0]:
df_bronze_titles = spark.table("netflix_catalog.bronze.netflix_titles")

df_silver_titles = df_bronze_titles \
    .withColumn("rating", trim(col("rating"))) \
    .withColumn("title", trim(col("title"))) \
    .fillna({
        'duration_minutes': '0', 
        'duration_seasons': '1', 
        'type': 'No Data', 
        'date_added': 'No Data', 
        'rating': 'No Data', 
        'description': 'No Data'})\
    .dropna(subset=["show_id", "title"]) \
    .drop("_rescued_data")

df_silver_titles.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_titles")

In [0]:
df_bronze_category = spark.table("netflix_catalog.bronze.netflix_category")

df_silver_category = df_bronze_category \
    .withColumn("listed_in", trim(col("listed_in"))) \
    .dropDuplicates(["show_id", "listed_in"]) \
    .na.drop(subset=["show_id", "listed_in"]) \
    .drop("_rescued_data")

# Lưu bảng silver đã join
df_silver_category.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_category")


In [0]:
df_cast_bronze = spark.table("netflix_catalog.bronze.netflix_cast")

# Chuẩn hóa
df_cast_silver = df_cast_bronze \
    .withColumn("cast", trim(col("cast"))) \
    .dropDuplicates(["show_id", "cast"]) \
    .na.drop(subset=["show_id", "cast"]) \
    .drop("_rescued_data")

# Lưu bảng Silver
df_cast_silver.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_cast")

In [0]:
# Đọc bảng Bronze
df_directors_bronze = spark.table("netflix_catalog.bronze.netflix_directors")

# Chuẩn hóa
df_directors_silver = df_directors_bronze \
    .withColumn("director", trim(col("director"))) \
    .dropDuplicates(["show_id", "director"]) \
    .na.drop(subset=["show_id", "director"]) \
    .drop("_rescued_data")

# Lưu bảng Silver
df_directors_silver.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_directors")

In [0]:
# Đọc bảng Bronze
df_countries_bronze = spark.table("netflix_catalog.bronze.netflix_countries")

# Chuẩn hóa
df_countries_silver = df_countries_bronze \
    .withColumn("country", trim(col("country"))) \
    .dropDuplicates(["show_id", "country"]) \
    .na.drop(subset=["show_id", "country"]) \
    .drop("_rescued_data")

# Lưu bảng Silver
df_countries_silver.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_countries")


In [0]:
df_titles_with_category = spark.table("netflix_catalog.silver.netflix_titles") \
    .join(spark.table("netflix_catalog.silver.netflix_category"), on="show_id", how="inner")

df_titles_with_category.write.format("delta")\
                .mode("overwrite")\
                .saveAsTable("netflix_catalog.silver.netflix_titles_with_category")

### Kiểm tra các bảng vừa đưa vào lớp Silver

In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_countries LIMIT 10

country,show_id
United States,80151384
United States,70013050
United States,70100379
Spain,80990336
United States,80191804
United States,60027695
Spain,80232891
Canada,70243446
India,80170483
Canada,60036237


In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_category LIMIT 10

listed_in,show_id
Comedies,81035882
Comedies,80236278
Cult Movies,372195
Action & Adventure,80013773
Action & Adventure,80997861
Dramas,81035851
Kids' TV,80057611
Romantic Movies,70184050
Dramas,60000545
TV Dramas,80178687


In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_cast LIMIT 10

cast,show_id
Nancy McKeon,70206822
Tatsuomi Hamada,80213536
Lance Lewman,70265211
Park Hae-il,81026705
Jung Yu-mi,80214523
Irrfan Khan,70059328
Morgan Saylor,80098467
Tiler Peck,80173625
Cherif Hafez,81049674
Jérôme Niel,80992365


In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_directors LIMIT 10

director,show_id
Scott Stewart,70117305
David Sington,80099305
Alex Garland,80023689
Peyton Reed,70042688
Rajat Kapoor,81067759
Daniel Burman,80104237
Vikramaditya Motwane,80115328
Jørgen Lerdam,81016361
Nishanth Ravindaran,81035104
"""Sam """"Blitz"""" Bazawule""",81044496


In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_titles LIMIT 10

duration_minutes,duration_seasons,type,title,date_added,release_year,rating,description,show_id
90,1,Movie,Norm of the North: King Sized Adventure,9/9/2019,2019,TV-PG,"Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.",81145628
94,1,Movie,Jandino: Whatever it Takes,9/9/2016,2016,TV-MA,"""Jandino Asporaat riffs on the challenges of raising kids and serenades the audience with a rousing rendition of """"Sex on Fire"""" in his comedy show.""",80117401
0,1,TV Show,Transformers Prime,9/8/2018,2013,TV-Y7-FV,"With the help of three human allies, the Autobots once again protect Earth from the onslaught of the Decepticons and their leader, Megatron.",70234439
0,1,TV Show,Transformers: Robots in Disguise,9/8/2018,2016,TV-Y7,"When a prison ship crash unleashes hundreds of Decepticons on Earth, Bumblebee leads a new Autobot force to protect humankind.",80058654
99,1,Movie,#realityhigh,9/8/2017,2017,TV-14,"When nerdy high schooler Dani finally attracts the interest of her longtime crush, she lands in the cross hairs of his ex, a social media celebrity.",80125979
0,1,TV Show,Apaches,9/8/2017,2016,TV-MA,A young journalist is forced into a life of crime to save his father and family in this series based on the novel by Miguel Sáez Carral.,80163890
110,1,Movie,Automata,9/8/2017,2014,R,"In a dystopian future, an insurance adjuster for a tech company investigates a robot killed for violating protocol and discovers a global conspiracy.",70304989
60,1,Movie,Fabrizio Copano: Solo pienso en mi,9/8/2017,2017,TV-MA,"Fabrizio Copano takes audience participation to the next level in this stand-up set while reflecting on sperm banks, family WhatsApp groups and more.",80164077
0,1,TV Show,Fire Chasers,9/8/2017,2017,TV-MA,"As California's 2016 fire season rages, brave backcountry firefighters race to put out the flames, protect homes and save lives in this docuseries.",80117902
90,1,Movie,Good People,9/8/2017,2014,R,A struggling couple can't believe their luck when they find a stash of money in the apartment of a neighbor who was recently murdered.,70304990


In [0]:
%sql
SELECT * FROM netflix_catalog.silver.netflix_titles_with_category LIMIT 10

show_id,duration_minutes,duration_seasons,type,title,date_added,release_year,rating,description,listed_in
81145628,90,1,Movie,Norm of the North: King Sized Adventure,9/9/2019,2019,TV-PG,"Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.",Comedies
80117401,94,1,Movie,Jandino: Whatever it Takes,9/9/2016,2016,TV-MA,"""Jandino Asporaat riffs on the challenges of raising kids and serenades the audience with a rousing rendition of """"Sex on Fire"""" in his comedy show.""",Stand-Up Comedy
70234439,0,1,TV Show,Transformers Prime,9/8/2018,2013,TV-Y7-FV,"With the help of three human allies, the Autobots once again protect Earth from the onslaught of the Decepticons and their leader, Megatron.",Kids' TV
80058654,0,1,TV Show,Transformers: Robots in Disguise,9/8/2018,2016,TV-Y7,"When a prison ship crash unleashes hundreds of Decepticons on Earth, Bumblebee leads a new Autobot force to protect humankind.",Kids' TV
80125979,99,1,Movie,#realityhigh,9/8/2017,2017,TV-14,"When nerdy high schooler Dani finally attracts the interest of her longtime crush, she lands in the cross hairs of his ex, a social media celebrity.",Comedies
80163890,0,1,TV Show,Apaches,9/8/2017,2016,TV-MA,A young journalist is forced into a life of crime to save his father and family in this series based on the novel by Miguel Sáez Carral.,Crime TV Shows
70304989,110,1,Movie,Automata,9/8/2017,2014,R,"In a dystopian future, an insurance adjuster for a tech company investigates a robot killed for violating protocol and discovers a global conspiracy.",Sci-Fi & Fantasy
80164077,60,1,Movie,Fabrizio Copano: Solo pienso en mi,9/8/2017,2017,TV-MA,"Fabrizio Copano takes audience participation to the next level in this stand-up set while reflecting on sperm banks, family WhatsApp groups and more.",Stand-Up Comedy
80117902,0,1,TV Show,Fire Chasers,9/8/2017,2017,TV-MA,"As California's 2016 fire season rages, brave backcountry firefighters race to put out the flames, protect homes and save lives in this docuseries.",Science & Nature TV
70304990,90,1,Movie,Good People,9/8/2017,2014,R,A struggling couple can't believe their luck when they find a stash of money in the apartment of a neighbor who was recently murdered.,Thrillers
