In [0]:
%pip install bs4

In [0]:
# Restart the Python process after the %pip install in the previous cell
# (presumably so the freshly installed package becomes importable — confirm
# against the Databricks library-utility docs).
dbutils.library.restartPython()

In [0]:
import gzip
import os
import shutil
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
from delta.tables import DeltaTable
from pyspark.sql.functions import col, countDistinct, count, desc

In [0]:
# --- Notebook configuration ---
# Index page that links to the IMDb dataset archives.
url = 'https://datasets.imdbws.com/'
# Folder where the decompressed files are staged before being loaded.
download_path = '/Volumes/generaldata/dataanalysis/upload/imdb/'

# Target catalog / schema for the tables.
catalog_name = "data_analysis"
schema_name = "imdb_data"

# Make sure the target schema exists before anything is written to it.
create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name};"
spark.sql(create_schema_sql)

In [0]:
# Scrape the dataset index page and stage every linked *.gz archive,
# decompressed, in `download_path`.
response = requests.get(url, timeout=60)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# The archive links are read from the first <ul> element on the page.
items_list = soup.find('ul')
if items_list is None:
    raise RuntimeError(
        "No <ul> element found at {}; the page layout may have changed".format(url)
    )

os.makedirs(download_path, exist_ok=True)

for item in items_list.find_all('a'):
    file_path = item.get('href')
    # basename() prevents a scraped link text from escaping /tmp via path segments.
    file_name = os.path.basename(item.getText().strip())
    if not file_path or not file_name.endswith('.gz'):
        continue  # not a downloadable gzip archive

    # Strip only the trailing '.gz' (str.replace would also hit a '.gz' mid-name).
    decompressed_file_name = file_name[:-len('.gz')]
    dest_download_path = os.path.join('/tmp', file_name)
    decompressed_path = os.path.join('/tmp', decompressed_file_name)

    urllib.request.urlretrieve(file_path, dest_download_path)

    # Decompress in-process instead of shelling out: no shell interpolation of
    # scraped text, failures raise instead of being ignored, and an output file
    # left over from a previous run is overwritten rather than silently kept.
    with gzip.open(dest_download_path, 'rb') as compressed, \
            open(decompressed_path, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)
    os.remove(dest_download_path)  # same end state as `gzip -d`: archive removed

    shutil.copyfile(decompressed_path, os.path.join(download_path, decompressed_file_name))

In [0]:
# Profile every table of the target schema and record, per table, the column
# with the most distinct values as its candidate primary key.
# Result: table_pk is a list of {'table': <qualified name>, 'pk': <column name>}.
table_pk = []
for t in spark.catalog.listTables(f"{catalog_name}.{schema_name}"):
    if t.isTemporary:
        # Temporary views are listed too but carry no catalog/namespace to qualify.
        continue

    # t.namespace is a list of namespace levels; join them with dots to get
    # the qualified table name.
    table = ".".join([t.catalog, *(t.namespace or []), t.name])
    df = spark.table(table)

    # (column, distinct_count, null_count) for each column of the table.
    analysis_results = []
    for column in df.columns:
        # One Spark job per column: count(col) counts only non-null values, so
        # the null count is the total row count minus the non-null count.
        distinct_count, non_null_count, row_count = df.agg(
            countDistinct(col(column)),
            count(col(column)),
            count("*"),
        ).first()
        analysis_results.append((column, distinct_count, row_count - non_null_count))

    if not analysis_results:
        continue  # a table without columns has no key candidate

    # Select the column with the maximum distinct count; on ties the first
    # column in schema order wins, which keeps the result deterministic.
    max_distinct_column = max(analysis_results, key=lambda result: result[1])[0]
    table_pk.append({'table': table, 'pk': max_distinct_column})

In [0]:
# Reload each staged file into its table, replacing both data and schema.
for file in dbutils.fs.ls(download_path):
    # Derive the table name from the file name: drop '.tsv', turn dots into
    # underscores (e.g. 'a.b.tsv' -> 'a_b').
    table_name = file.name.replace('.tsv', '').replace('.', '_')
    full_table_name = f"{catalog_name}.{schema_name}.{table_name}"
    # Use the public catalog API rather than the private JVM session handle.
    # NOTE(review): only tables that already exist are refreshed here; files
    # without a matching table are skipped — confirm this is intended.
    if spark.catalog.tableExists(full_table_name):
        path = file.path
        df = spark.read.option("delimiter", "\t").option("header", "true").csv(path)
        (
            df.write
              .mode("overwrite")
              .option("overwriteSchema", "true")
              .saveAsTable(full_table_name)
        )