# Libraries & variables

In [0]:
# Import
from datetime import datetime, timezone
from urllib.parse import parse_qs

from pyspark.sql.functions import explode, col, lit, split

# Variables
# Azure Blob Storage account and container that hold the staged CSV files;
# both names are interpolated into the wasbs:// source URL and the mount point.
blob_account_name = "streamersdata"
blob_container_name = "staging"
# SAS token used to authorise the mount. The literal below is a placeholder;
# NOTE(review): prefer reading the real token from a secret scope rather than
# pasting it into the notebook.
blob_sas_token = "<blob_sas_token>"

In [0]:
# JDBC endpoint of the target SQL Server database and the properties that are
# handed to every spark.read.jdbc / DataFrame.write.jdbc call in this notebook.
# NOTE(review): user and password are placeholders here; avoid committing real
# credentials in the notebook source.
jdbc_url = "jdbc:sqlserver://streamers-sqlserver.database.windows.net:1433;database=streamers-sqldb"
connection_properties = dict(
    user="<USERNAME>",
    password="<PASSWORD",
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)


In [0]:
# Check if the token is still valid.
# A SAS token is a URL query string whose "se" field carries the expiry time.
# Parsing it as a query string finds "se" wherever it appears (including as
# the first field) and URL-decodes the value (e.g. "%3A" -> ":") before it is
# handed to strptime.
sas_fields = parse_qs(blob_sas_token.lstrip("?"))
if "se" not in sas_fields:
    raise Exception("The SAS token has no expiry ('se') field.")

expiry_date_str = sas_fields["se"][0]
expiry_date = datetime.strptime(expiry_date_str, "%Y-%m-%dT%H:%M:%SZ")
# Current UTC time as a naive datetime so it compares directly with the naive
# expiry_date parsed above (replaces the deprecated datetime.utcnow()).
current_date = datetime.now(timezone.utc).replace(tzinfo=None)

if current_date > expiry_date:
    raise Exception("The SAS token is not valid anymore.")
else:
    print(f"The SAS token is valid until {expiry_date_str}.")

The SAS token is valid until 2024-12-20T17:03:16Z.


# File mounting

In [0]:
# DBFS path under which the container is mounted
mount_point = f"/mnt/{blob_container_name}"

# Start from a clean state: drop an existing mount at this path before mounting
already_mounted = any(
    mnt.mountPoint == mount_point for mnt in dbutils.fs.mounts()
)
if already_mounted:
    dbutils.fs.unmount(mount_point)

# Mount the container over wasbs, authorising with the SAS token
sas_config = {
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net": blob_sas_token
}
dbutils.fs.mount(
    source=f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net",
    mount_point=mount_point,
    extra_configs=sas_config,
)

# Show what is now visible under the mount point
display(dbutils.fs.ls(mount_point))

/mnt/staging has been unmounted.


path,name,size,modificationTime
dbfs:/mnt/staging/amazon_prime.csv,amazon_prime.csv,6868017,1734100958000
dbfs:/mnt/staging/apple_tv.csv,apple_tv.csv,1394527,1734100958000
dbfs:/mnt/staging/hbo_max.csv,hbo_max.csv,1113776,1734100958000
dbfs:/mnt/staging/hulu.csv,hulu.csv,691846,1734100958000
dbfs:/mnt/staging/netflix.csv,netflix.csv,5044133,1734100959000


# Dataframe creation

In [0]:
# CSV to DataFrame function
def load_csv_to_df(relative_path):
    """Read a CSV file from the mounted container into a Spark DataFrame.

    Args:
        relative_path: Path of the CSV file relative to ``mount_point``.

    Returns:
        A DataFrame whose column names come from the file's header row.
    """
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(f"{mount_point}/{relative_path}")

# One CSV export per streaming platform: name the file (relative to the mount
# point) and load it right away.

# Amazon Prime
amazon_relative_path = "amazon_prime.csv"
amazon_df = load_csv_to_df(amazon_relative_path)

# Apple TV
apple_relative_path = "apple_tv.csv"
apple_df = load_csv_to_df(apple_relative_path)

# HBO Max
hbo_relative_path = "hbo_max.csv"
hbo_df = load_csv_to_df(hbo_relative_path)

# Hulu
hulu_relative_path = "hulu.csv"
hulu_df = load_csv_to_df(hulu_relative_path)

# Netflix
netflix_relative_path = "netflix.csv"
netflix_df = load_csv_to_df(netflix_relative_path)

# Dataframe cleaning & transformation
The dataframes need to be transformed for easy processing.
1. I remove all the records without data. If the columns title, imdbId, imdbAverageRating or imdbNumVotes contain Null the row is deleted.
2. The data in the dataframes are in string format. So I change them to integer or float if needed.
3. Nulls in the releaseYear column are replaced with 9999; nulls in the type, genres and availableCountries columns are replaced with "Unknown".
4. A new column is added with the platform name.
5. The last step is combining. Luckily, the five different CSV files follow the same template, so combining them is not too difficult.

In [0]:
# Function to transform DataFrame
def transform_df(df, platform_name):
    """Clean one platform's raw DataFrame and tag it with the platform name.

    Steps:
      1. Add a constant ``platform`` column.
      2. Cast ``releaseYear`` and ``imdbNumVotes`` to integer and
         ``imdbAverageRating`` to float (the CSV columns are read as strings).
      3. Drop rows where ``title``, ``imdbId``, ``imdbAverageRating`` or
         ``imdbNumVotes`` is null.
      4. Fill remaining nulls in ``type``, ``genres``, ``releaseYear`` and
         ``availableCountries`` with default values.

    Args:
        df: Raw DataFrame as loaded from a platform CSV file.
        platform_name: Value written to the ``platform`` column.

    Returns:
        The cleaned DataFrame.
    """
    df = df.withColumn("platform", lit(platform_name))
    df = df.withColumn("releaseYear", col("releaseYear").cast("integer"))
    df = df.withColumn("imdbNumVotes", col("imdbNumVotes").cast("integer"))
    df = df.withColumn("imdbAverageRating", col("imdbAverageRating").cast("float"))
    # Filter AFTER casting: a value that cannot be parsed as a number becomes
    # null in the cast, and such rows must be removed as well. Filtering before
    # the cast would let them through with a null rating / vote count.
    df = df.filter(
        col("title").isNotNull() &
        col("imdbId").isNotNull() &
        col("imdbAverageRating").isNotNull() &
        col("imdbNumVotes").isNotNull()
    )
    # Defaults for the optional columns; 9999 marks an unknown release year.
    df = df.fillna({
        "type": "Unknown",
        "genres": "Unknown",
        "releaseYear": 9999,
        "availableCountries": "Unknown"
    })
    return df

# Clean every platform's DataFrame and label it with its platform name
amazon_df_cleaned = transform_df(df=amazon_df, platform_name="Amazon Prime")
apple_df_cleaned = transform_df(df=apple_df, platform_name="Apple TV Plus")
hbo_df_cleaned = transform_df(df=hbo_df, platform_name="HBO Max")
hulu_df_cleaned = transform_df(df=hulu_df, platform_name="Hulu")
netflix_df_cleaned = transform_df(df=netflix_df, platform_name="Netflix")

# Stack the five cleaned DataFrames; unionByName lines columns up by name
combined_df = (
    amazon_df_cleaned
    .unionByName(apple_df_cleaned)
    .unionByName(hbo_df_cleaned)
    .unionByName(hulu_df_cleaned)
    .unionByName(netflix_df_cleaned)
)


In [0]:
# Preview the first 10 rows of the combined DataFrame
combined_preview_df = combined_df.limit(10)
display(combined_preview_df)

title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,platform
Blondie,movie,"Comedy, Family",1938,tt0029927,6.9,889,"US, ZA",Amazon Prime
Ariel,movie,"Comedy, Crime, Romance",1988,tt0094675,7.4,8819,JP,Amazon Prime
Four Rooms,movie,Comedy,1995,tt0113101,6.7,112815,"AT, DE",Amazon Prime
Judgment Night,movie,"Action, Crime, Drama",1993,tt0107286,6.6,19366,US,Amazon Prime
Forrest Gump,movie,"Drama, Romance",1994,tt0109830,8.8,2327333,"AD, AT, CU, DE, FR, GF, IN, JP, MC, PF, SN",Amazon Prime
Citizen Kane,movie,"Drama, Mystery",1941,tt0033467,8.3,474976,"AD, CA, ES, IN, JP",Amazon Prime
Dancer in the Dark,movie,"Crime, Drama, Musical",2000,tt0168629,7.9,118972,"DK, FI, NO, SE",Amazon Prime
The Dark,movie,"Drama, Fantasy, Horror",2005,tt0411267,5.3,11567,"CA, US",Amazon Prime
Metropolis,movie,"Drama, Sci-Fi",1927,tt0017136,8.3,190864,"DK, IT, NO, SE, SM, VA",Amazon Prime
My Life Without Me,movie,"Drama, Romance",2003,tt0314412,7.4,26072,"GB, GG",Amazon Prime


### Content DataFrame
The content dataframe contains the unique content from all the platforms combined.

In [0]:
# Select the content-level columns from combined_df and keep exactly one row
# per IMDb id. The Content load identifies records by imdbId only, so the
# de-duplication must use the same key: distinct() over all selected columns
# would keep several rows for one imdbId whenever two platforms disagree on
# any attribute (title spelling, rating, vote count, ...).
# Which of the conflicting rows is kept is not specified.
content_df = combined_df.select(
    "title", "type", "releaseYear", "imdbId", "imdbAverageRating", "imdbNumVotes"
).dropDuplicates(["imdbId"])

# Display the content_df DataFrame
display(content_df.limit(5))

title,type,releaseYear,imdbId,imdbAverageRating,imdbNumVotes
Ariel,movie,1988,tt0094675,7.4,8819
Blondie,movie,1938,tt0029927,6.9,889
Forrest Gump,movie,1994,tt0109830,8.8,2327333
Four Rooms,movie,1995,tt0113101,6.7,112815
Judgment Night,movie,1993,tt0107286,6.6,19366


### Genre DataFrame
The genre dataframe contains the unique genres from the combined dataframe.

In [0]:

# Turn the comma-separated genres string into one row per (title, genre);
# the pattern also swallows whitespace after each comma.
genre_parts = split(col("genres"), ",\\s*")
genre_df = combined_df.select("title", explode(genre_parts).alias("GenreName"))

# Reduce to the set of distinct genre names
unique_genres_df = genre_df.select("GenreName").distinct()

unique_genres_df.show(5)

+---------+
|GenreName|
+---------+
|    Crime|
|  Romance|
|    Drama|
|   Family|
|   Comedy|
+---------+
only showing top 5 rows



### Country DataFrame
The country dataframe contains the unique countries from the combined dataframe.

In [0]:
# Turn the comma-separated country list into one row per (title, country code);
# the pattern also swallows whitespace after each comma.
country_parts = split(col("availableCountries"), ",\\s*")
country_df = combined_df.select("title", explode(country_parts).alias("CountryCode"))

# Reduce to the set of distinct country codes
unique_countries_df = country_df.select("CountryCode").distinct()
unique_countries_df.show(5)

+-----------+
|CountryCode|
+-----------+
|         PF|
|         AT|
|         AD|
|         DE|
|         ZA|
+-----------+
only showing top 5 rows



### Platform DataFrame
The platform dataframe contains the unique platforms from the combined dataframe.

In [0]:
# Pair every title with the platform it was loaded from
platform_df = combined_df.select(
    "title",
    col("platform").alias("platformName"),
)

# Reduce to the set of distinct platform names
unique_platform_df = platform_df.select("platformName").distinct()
unique_platform_df.show(5)

+-------------+
| platformName|
+-------------+
| Amazon Prime|
|Apple TV Plus|
|      HBO Max|
|         Hulu|
|      Netflix|
+-------------+



# Loading dataframes to SQL Server

To make sure only new data is loaded onto the server, the existing data is first read into a dataframe and anti-joined with the new dataframe. This is done for the content, genres, countries and platforms dataframes.

Next, the ContentGenres, ContentCountries and ContentPlatform tables are created.

In [0]:
# Content table
# DataFrame over the rows currently stored in the SQL Server table
existing_content_df = spark.read.jdbc(
    jdbc_url, "Content", properties=connection_properties
)

# Anti-join on the IMDb id: keep only content that is not in the table yet
new_content_records_df = content_df.join(existing_content_df, "imdbID", "left_anti")

# Append the new rows to the SQL table
new_content_records_df.write.mode("append").jdbc(
    jdbc_url, "Content", properties=connection_properties
)

In [0]:
# Countries table
# DataFrame over the rows currently stored in the SQL Server table
existing_countries_df = spark.read.jdbc(
    jdbc_url, "Countries", properties=connection_properties
)

# Anti-join on the country code: keep only codes that are not in the table yet
new_country_records_df = unique_countries_df.join(
    existing_countries_df, "CountryCode", "left_anti"
)

# Append the new rows to the SQL table
new_country_records_df.write.mode("append").jdbc(
    jdbc_url, "Countries", properties=connection_properties
)


In [0]:
# Genres table
# DataFrame over the rows currently stored in the SQL Server table
existing_genres_df = spark.read.jdbc(
    jdbc_url, "Genres", properties=connection_properties
)

# Anti-join on the genre name: keep only genres that are not in the table yet
new_genre_records_df = unique_genres_df.join(
    existing_genres_df, "GenreName", "left_anti"
)

# Append the new rows to the SQL table
new_genre_records_df.write.mode("append").jdbc(
    jdbc_url, "Genres", properties=connection_properties
)

In [0]:
# Platform table
# DataFrame over the rows currently stored in the SQL Server table
existing_platform_df = spark.read.jdbc(
    jdbc_url, "Platforms", properties=connection_properties
)

# Anti-join on the platform name: keep only platforms not in the table yet
new_platform_records_df = unique_platform_df.join(
    existing_platform_df, "platformName", "left_anti"
)

# Append the new rows to the SQL table
new_platform_records_df.write.mode("append").jdbc(
    jdbc_url, "Platforms", properties=connection_properties
)

In [0]:

# ContentGenres table

# Explode genres into one row per (content, genre) pair.
# Split on a comma plus any following whitespace so the names carry no leading
# space ("Comedy, Family" -> "Comedy", "Family"). Splitting on "," alone yields
# " Family", which never equals a name in Genres, so the pair would be dropped
# by the inner join below.
content_genres_df = combined_df.select(
    col("imdbId").alias("contentID"),
    explode(split(col("genres"), ",\\s*")).alias("genreName")
).distinct()

# Create or replace a temporary view
content_genres_df.createOrReplaceTempView("temp_content_genres")

# Create temporary views for existing tables
existing_content_df.createOrReplaceTempView("Content")
existing_genres_df.createOrReplaceTempView("Genres")

# Translate the IMDb id and genre name into the keys of the Content and
# Genres tables (temp_content_genres.contentID holds the IMDb id).
content_genres_result_df = spark.sql("""
SELECT c.contentID, g.genreID
FROM temp_content_genres cg
JOIN Content c ON c.imdbId = cg.contentID
JOIN Genres g ON g.genreName = cg.genreName
""")

# Write the result DataFrame to the SQL Server table; "overwrite" replaces the
# table's previous contents on every run.
content_genres_result_df.write.jdbc(
    url=jdbc_url,
    table="ContentGenres",
    mode="overwrite",
    properties=connection_properties
)

In [0]:

# ContentCountries table

# Explode countries into one row per (content, country) pair.
# Split on a comma plus any following whitespace so the codes carry no leading
# space ("US, ZA" -> "US", "ZA"). Splitting on "," alone yields " ZA", which
# never equals a code in Countries, so the pair would be dropped by the inner
# join below.
content_countries_df = combined_df.select(
    col("imdbId").alias("contentID"),
    explode(split(col("availableCountries"), ",\\s*")).alias("countryCode")
).distinct()

# Create or replace a temporary view
content_countries_df.createOrReplaceTempView("temp_content_countries")

# Create temporary views for existing tables
existing_content_df.createOrReplaceTempView("Content")
existing_countries_df.createOrReplaceTempView("Countries")

# Translate the IMDb id and country code into the keys of the Content and
# Countries tables (temp_content_countries.contentID holds the IMDb id).
content_countries_result_df = spark.sql("""
SELECT c.contentID, co.countryID
FROM temp_content_countries cc
JOIN Content c ON c.imdbId = cc.contentID
JOIN Countries co ON co.countryCode = cc.countryCode
""")

# Write the result DataFrame to the SQL Server table; "overwrite" replaces the
# table's previous contents on every run.
content_countries_result_df.write.jdbc(
    url=jdbc_url,
    table="ContentCountries",
    mode="overwrite",
    properties=connection_properties
)

In [0]:

# ContentPlatform table

# One row per distinct (content, platform) pair; contentID holds the IMDb id
# at this stage. No explode is needed: each row carries a single platform.
content_platform_df = (
    combined_df
    .select("imdbId", "platform")
    .withColumnRenamed("imdbId", "contentID")
    .withColumnRenamed("platform", "platformName")
    .distinct()
)

# Expose the pairs and the two lookup tables to Spark SQL
existing_platform_df.createOrReplaceTempView("Platforms")
existing_content_df.createOrReplaceTempView("Content")
content_platform_df.createOrReplaceTempView("temp_content_platforms")

# Translate the IMDb id and platform name into the keys of the Content and
# Platforms tables
content_platform_result_df = spark.sql("""
SELECT c.contentID, p.platformID
FROM temp_content_platforms cp
JOIN Content c ON c.imdbId = cp.contentID
JOIN Platforms p ON p.platformName = cp.platformName
""")

# Replace the contents of the SQL Server link table with the fresh result
content_platform_result_df.write.mode("overwrite").jdbc(
    jdbc_url, "ContentPlatform", properties=connection_properties
)