In [0]:
%pip install bs4  # Install BeautifulSoup library for web scraping

In [0]:
# Restart the Python process so the library installed by the preceding %pip cell is picked up.
# This cell sits ahead of the import cell on purpose: the imports run after the restart.
dbutils.library.restartPython()

In [0]:
import gzip  # For decompressing the downloaded .gz archives
import json  # For handling JSON data
import os  # For interacting with the operating system
import shutil  # For streaming and copying files
import urllib.request  # For opening and reading URLs
from urllib.parse import urljoin  # For resolving download links against the index URL

import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML and XML documents
from delta.tables import DeltaTable  # For working with Delta Lake tables
from pyspark.sql import Row  # For creating Spark DataFrame rows
from pyspark.sql.functions import col, regexp_replace  # For DataFrame operations

In [0]:
# Read the OMDb API key from the Databricks secret scope so it never appears in the notebook source.
omdbkey = dbutils.secrets.get(scope="djsdbsecrets", key="omdbapikey")

In [0]:
# Directory that receives the decompressed IMDb TSV files.
download_path = '/Volumes/generaldata/dataanalysis/upload/imdb/'
# Index page that links to the IMDb dataset archives.
url = 'https://datasets.imdbws.com/'
# Catalog and schema that hold the tables created by this notebook.
catalog_name = "data_analysis"
schema_name = "imdb_data"

# Make sure the target schema exists before any table is written to it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name};")

In [0]:
# Download every .gz archive linked from the index page, decompress it, and
# place the resulting file in `download_path`.
index_response = requests.get(url, timeout=60)
index_response.raise_for_status()  # fail loudly instead of parsing an error page
html_content = index_response.text
soup = BeautifulSoup(html_content, 'html.parser')

items_list = soup.find('ul')  # the archive links live in the page's unordered list
if items_list is None:
    raise RuntimeError(f"No <ul> element found at {url}; the page layout may have changed")

os.makedirs(download_path, exist_ok=True)

for item in items_list.find_all('a'):
    # The link text is scraped input: basename() keeps it from pointing
    # outside /tmp or download_path.
    file_name = os.path.basename(item.getText().strip())
    href = item.get('href')
    if not href or not file_name.endswith('.gz'):
        continue  # not a dataset archive

    decompressed_file_name = file_name[:-len('.gz')]
    file_path = urljoin(url, href)  # also handles relative hrefs
    dest_download_path = os.path.join('/tmp', file_name)
    decompressed_path = os.path.join('/tmp', decompressed_file_name)

    urllib.request.urlretrieve(file_path, dest_download_path)

    # Decompress in-process rather than shelling out to `gzip -d`: no shell
    # interpolation of scraped names, errors raise instead of being ignored,
    # and a leftover file from a previous run is overwritten rather than
    # silently kept.
    with gzip.open(dest_download_path, 'rb') as src, open(decompressed_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)

    # copyfile() copies contents only (no chmod on the destination).
    shutil.copyfile(decompressed_path, os.path.join(download_path, decompressed_file_name))

    # Free local disk; nothing later in the notebook reads the /tmp copies.
    os.remove(dest_download_path)
    os.remove(decompressed_path)

In [0]:
# Load each TSV in the download directory into a table named after the file:
# the '.tsv' suffix is dropped and remaining dots become underscores.
#
# Every table is (re)written with mode "overwrite". The earlier
# "only if the table already exists" guard was removed: it meant a first run
# never created any table, and overwrite already covers both the create and
# the refresh case.
#
# NOTE(review): the reader keeps Spark's default quote character; if the TSV
# data contains unescaped double quotes, consider .option("quote", "") — confirm.
for file in dbutils.fs.ls(download_path):
    if not file.name.endswith('.tsv'):
        continue  # skip sub-directories and anything that is not a TSV

    table_name = file.name.replace('.tsv', '').replace('.', '_')
    full_table_name = f"{catalog_name}.{schema_name}.{table_name}"
    path = file.path

    df = (
        spark.read
        .option("delimiter", "\t")  # tab-separated values
        .option("header", "true")   # first line holds the column names
        .csv(path)
    )
    df = df.replace(r"\N", None)  # turn the literal '\N' marker into real NULLs

    # Parenthesised chain: a comment after a backslash continuation is a
    # SyntaxError, so backslashes are not used here.
    (
        df.write
        .mode("overwrite")                  # replace existing data
        .option("overwriteSchema", "true")  # allow the schema to change
        .saveAsTable(full_table_name)
    )

In [0]:
# Select the IMDb ids (tconst) of up to 500 non-adult movies from 2001-2024.
# The table is addressed through catalog_name/schema_name so it always matches
# the schema the load cell writes to.
#
# NOTE(review): LIMIT without ORDER BY does not guarantee the same 500 rows on
# every run — add an ORDER BY if a reproducible sample is needed.
# NOTE(review): `genres <> 'Documentary'` only removes rows whose genres value
# is exactly 'Documentary' (and rows where genres is NULL); if genres can hold
# several comma-separated values, documentaries with extra genres pass — confirm.
imdb_list = spark.sql(
    f"""
        SELECT
        tconst
        FROM {catalog_name}.{schema_name}.title_basics
        WHERE titleType = 'movie'  -- Filter for movie titles
        AND isAdult = '0'          -- Exclude adult titles
        AND startYear IS NOT NULL  -- Ensure startYear is not null
        AND startYear > 2000 AND startYear <= 2024  -- Limit to movies released between 2001 and 2024
        AND genres <> 'Documentary'  -- Exclude documentary genres
        LIMIT 500  -- Limit the result to 500 entries
    """
)

In [0]:
# Fetch the OMDb record for every selected IMDb id.
# `json_data` ends up holding one dict per successfully retrieved title.
json_data = []

# Collect the ids on the driver through the DataFrame API (no RDD round trip).
imdb_ids = [row['tconst'] for row in imdb_list.select('tconst').collect()]

# One Session reuses the HTTP connection across all requests.
with requests.Session() as session:
    for imdbid in imdb_ids:
        response = session.get(
            "https://www.omdbapi.com/",  # HTTPS keeps the API key off the wire in clear text
            params={"i": imdbid, "apikey": omdbkey},  # requests handles the URL encoding
            timeout=30,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()  # surface HTTP errors (e.g. bad key) immediately
        doc = response.json()

        # OMDb signals lookup failures in the body; such payloads lack the
        # movie fields and would corrupt the DataFrame built from json_data.
        if doc.get("Response") == "False":
            print(f"Skipping {imdbid}: {doc.get('Error')}")
            continue

        json_data.append(doc)

In [0]:
# Turn the collected OMDb documents into a table, with BoxOffice as a number.
if not json_data:
    raise ValueError("json_data is empty: no OMDb records were fetched, nothing to save")

# The documents are already plain dicts, so no JSON round trip is needed.
rows = [Row(**doc) for doc in json_data]
df = spark.createDataFrame(rows)

# A missing box office figure arrives as the string "N/A" (presumably OMDb's
# placeholder — confirm); make it NULL up front so the cast never sees
# non-numeric text.
df = df.replace("N/A", None, subset=["BoxOffice"])

# Strip '$' and thousands separators, then cast. decimal(12,2) leaves ten
# integer digits; the previous decimal(10,2) overflowed for any gross of
# $100,000,000 or more.
df = df.withColumn(
    "BoxOffice",
    regexp_replace(col("BoxOffice"), "[$,]", "").cast("decimal(12,2)")
)

(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{catalog_name}.{schema_name}.omdb_analysis")  # same catalog/schema as the IMDb tables
)