In [0]:
# Import
from datetime import datetime
from pyspark.sql.functions import lit, col

# Variables
blob_account_name = "streamersdata"
blob_container_name = "staging"
blob_sas_token = "sp=rl&st=2024-11-29T08:42:08Z&se=2024-12-06T16:42:08Z&spr=https&sv=2022-11-02&sr=c&sig=kzw1HEKoFhGoIiU1U3KO%2FkR%2BOThXS0NX71gJiJ2vR1M%3D"

In [0]:
# Check if the token is still valid
expiry_date_str = blob_sas_token.split("&se=")[1].split("&")[0]
expiry_date = datetime.strptime(expiry_date_str, "%Y-%m-%dT%H:%M:%SZ")
current_date = datetime.utcnow()

if current_date > expiry_date:
    raise Exception("The SAS token is not valid anymore.")
else:
    print(f"The SAS token is valid until {expiry_date_str}.")

The SAS token is valid until 2024-12-06T16:42:08Z.


In [0]:
# Mount point
mount_point = f"/mnt/{blob_container_name}"

# Unmount if already mounted
if mount_point in [mnt.mountPoint for mnt in dbutils.fs.mounts()]:
    dbutils.fs.unmount(mount_point)

# Mount the Blob Storage
dbutils.fs.mount(
    source=f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net",
    mount_point=mount_point,
    extra_configs={f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net": blob_sas_token}
)

# List files in the container
display(dbutils.fs.ls(mount_point))

/mnt/staging has been unmounted.


path,name,size,modificationTime
dbfs:/mnt/staging/amazon-prime.csv,amazon-prime.csv,6829543,1732868284000
dbfs:/mnt/staging/apple-tv-plus.csv,apple-tv-plus.csv,1408035,1732868281000
dbfs:/mnt/staging/hbo-max.csv,hbo-max.csv,1106192,1732868281000
dbfs:/mnt/staging/hulu.csv,hulu.csv,689237,1732868281000
dbfs:/mnt/staging/netflix.csv,netflix.csv,5037440,1732868284000


In [0]:
# CSV to DataFrame function
def load_csv_to_df(relative_path):
    csv_file_path = f"{mount_point}/{relative_path}"
    return spark.read.format("csv").option("header", "true").load(csv_file_path)

# Relative paths
amazon_relative_path = "amazon-prime.csv"
apple_relative_path = "apple-tv-plus.csv"
hbo_relative_path = "hbo-max.csv"
hulu_relative_path = "hulu.csv"
netflix_relative_path = "netflix.csv"

# Load DataFrames
amazon_df = load_csv_to_df(amazon_relative_path)
apple_df = load_csv_to_df(apple_relative_path)
hbo_df = load_csv_to_df(hbo_relative_path)
hulu_df = load_csv_to_df(hulu_relative_path)
netflix_df = load_csv_to_df(netflix_relative_path)

# Display one of the DataFrames
#display(amazon_df.limit(10))
#amazon_df.printSchema()


In [0]:
# Add a new column with the platform name
amazon_df = amazon_df.withColumn("platform", lit("Amazon Prime"))

# Remove rows from the "title" column that contain null values. These are unusable in this case.
amazon_df = amazon_df.filter(amazon_df["title"].isNotNull())

# Change the data types of specific columns
amazon_df = amazon_df.withColumn("releaseYear", col("releaseYear").cast("integer"))
amazon_df = amazon_df.withColumn("imdbNumVotes", col("imdbNumVotes").cast("integer"))
amazon_df = amazon_df.withColumn("imdbAverageRating", col("imdbAverageRating").cast("float"))

# Fill null values in multiple columns
amazon_df = amazon_df.fillna({
    "type": "Unknown",
    "genres": "Unknown",
    "releaseYear": 9999,
    "imdbId": "Unknown",
    "imdbAverageRating": -1,
    "imdbNumVotes": 0,
    "availableCountries": "Unknown"
})

# Display the updated DataFrame
#display(amazon_df.limit(10))


In [0]:
# Function to transform DataFrame
def transform_df(df, platform_name):
    df = df.withColumn("platform", lit(platform_name))
    df = df.filter(df["title"].isNotNull())
    df = df.withColumn("releaseYear", col("releaseYear").cast("integer"))
    df = df.withColumn("imdbNumVotes", col("imdbNumVotes").cast("integer"))
    df = df.withColumn("imdbAverageRating", col("imdbAverageRating").cast("float"))
    df = df.fillna({
        "type": "Unknown",
        "genres": "Unknown",
        "releaseYear": 9999,
        "imdbId": "Unknown",
        "imdbAverageRating": -1,
        "imdbNumVotes": 0,
        "availableCountries": "Unknown"
    })
    return df

# Apply the transformation function to each DataFrame
amazon_df_cleaned = transform_df(amazon_df, "Amazon Prime")
apple_df_cleaned = transform_df(apple_df, "Apple TV Plus")
hbo_df_cleaned = transform_df(hbo_df, "HBO Max")
hulu_df_cleaned = transform_df(hulu_df, "Hulu")
netflix_df_cleaned = transform_df(netflix_df, "Netflix")

# Combine the DataFrames
combined_df = amazon_df_cleaned.unionByName(apple_df_cleaned) \
                               .unionByName(hbo_df_cleaned) \
                               .unionByName(hulu_df_cleaned) \
                               .unionByName(netflix_df_cleaned)

# Display one of the transformed DataFrames
#display(amazon_df_cleaned.limit(10))

# Display the combined DataFrame
#display(combined_df.limit(10))


In [0]:
# Define the JDBC URL and connection properties
jdbc_url = "jdbc:sqlserver://streamers-sqlserver.database.windows.net:1433;database=streamers-sqldb"
connection_properties = {
    "user": "Thomas",
    "password": "Gitaar%82",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# Save the combined DataFrame to the SQL database
combined_df.write.jdbc(url=jdbc_url, table="streaming_data", mode="overwrite", properties=connection_properties)

In [0]:
# Connect to Azure SQL Database

import pyodbc
from sqlalchemy import create_engine

server = 'streamers-sqlserver.database.windows.net'
database = 'streamers-sqldb'
username = 'Thomas'
password = 'Gitaar%82'
driver = 'ODBC Driver 17 for SQL Server'

engine = create_engine(f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver={driver}")

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-2700231208197324>, line 2[0m
[1;32m      1[0m [38;5;28;01mimport[39;00m [38;5;21;01mpyodbc[39;00m
[0;32m----> 2[0m [38;5;28;01mfrom[39;00m [38;5;21;01msqlalchemy[39;00m [38;5;28;01mimport[39;00m create_engine
[1;32m      4[0m server [38;5;241m=[39m [38;5;124m'[39m[38;5;124mstreamers-sqlserver.database.windows.net[39m[38;5;124m'[39m
[1;32m      5[0m database [38;5;241m=[39m [38;5;124m'[39m[38;5;124mstreamers-sqldb[39m[38;5;124m'[39m

[0;31mModuleNotFoundError[0m: No module named 'sqlalchemy'

In [0]:
# Insert into Content
combined_df[['Title', 'Type', 'ReleaseYear', 'IMDbID', 'IMDbAverageRating', 'IMDbNumVotes']].to_sql(
    'Content', con=engine, if_exists='append', index=False
)




[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-1637888490068948>, line 2[0m
[1;32m      1[0m [38;5;66;03m# Insert into Content[39;00m
[0;32m----> 2[0m combined_df[[[38;5;124m'[39m[38;5;124mTitle[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mType[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mReleaseYear[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mIMDbID[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mIMDbAverageRating[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mIMDbNumVotes[39m[38;5;124m'[39m]][38;5;241m.[39mto_sql(
[1;32m      3[0m     [38;5;124m'[39m[38;5;124mContent[39m[38;5;124m'[39m, con[38;5;241m=[39mengine, if_exists[38;5;241m=[39m[38;5;124m'[39m[38;5;124mappend[39m[38;5;124m'[39m, index[38;5;241m=[39m[38;5;28;01mFalse[39;00m
[1;32m      4[0m )

[0;31mNameError[0m: na

In [0]:
# Insert unique genres
#unique_genres = set(genre for genres in content_df['genres'] for genre in genres.split(', '))
#genres_df = pd.DataFrame({'GenreName': list(unique_genres)})
#genres_df.to_sql('Genres', con=engine, if_exists='append', index=False)

# Insert unique countries
#unique_countries = set(country for countries in content_df['availableCountries'] for country in countries.split(', '))
#countries_df = pd.DataFrame({'CountryCode': list(unique_countries)})
#countries_df.to_sql('Countries', con=engine, if_exists='append', index=False)

# Insert into ContentGenres
# Use SQLAlchemy to map ContentID to GenreID

# Insert into ContentCountries
# Use SQLAlchemy to map ContentID to CountryID