In [0]:
%pip uninstall -y databricks_helpers 
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers 

In [0]:
# Name of this exercise; it is handed to DataDerpDatabricksHelpers when the helper object is created.
exercise_name = "final_day_presentation"

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers

# Build the course helper from the Databricks `dbutils` handle and this notebook's exercise name.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

# Values reported by the helper. `working_directory` is the base path that the
# "/bronze" input and "/silver" output locations are derived from in later cells.
# NOTE(review): presumably the directory is scoped per user and exercise — confirm in databricks_helpers.
current_user = helpers.current_user()
working_directory = helpers.working_directory()

print(f"Your current working directory is: {working_directory}")

In [0]:
# Load the raw parquet data from the Bronze location under the working directory.

bronze_layer_path = working_directory + "/bronze"

# NOTE: at this point `silver_df` still holds the untouched Bronze data;
# the following cells reassign it as the cleaning steps are applied.
silver_df = spark.read.parquet(bronze_layer_path)

print("Schema of the raw DataFrame:")
silver_df.printSchema()
display(silver_df)

# Issues with the Dataset

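1. The `Cost_for_two` column contains alphanumeric values instead of plain numbers.
2. The `Total_ratings` column contains alphanumeric values instead of plain numbers.
3. `Delivery_time`, `Min_Delivery_time` and `Max_Delivery_time` hold the same values, so two of them are redundant.
4. The restaurant `Name` column contains duplicate values that need to be removed.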

**NOTE**: Null values are not treated here; their transformation will be handled in the Gold layer.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType

# --- Issues 1 & 2: make 'Cost_for_two' and 'Total_ratings' numeric ---
# Both columns arrive as strings with digits embedded in other characters.
# For each one, keep the first run of digits and cast it to Integer.
# NOTE(review): only the FIRST digit run is kept, so a value such as "1,200" or "1K+"
# would become 1 — confirm against the actual data that this is acceptable.
for numeric_column in ("Cost_for_two", "Total_ratings"):
    leading_digits = F.regexp_extract(F.col(numeric_column), r"(\d+)", 1)
    silver_df = silver_df.withColumn(numeric_column, leading_digits.cast(IntegerType()))

# --- Issue 3: drop the redundant delivery time columns ---
# Delivery_time, Min_Delivery_time and Max_Delivery_time carry the same values,
# so only Delivery_time is kept.
print("\nDropping redundant delivery time columns...")
silver_df = silver_df.drop("Min_Delivery_time", "Max_Delivery_time")

# --- Issue 4: keep a single row per restaurant name ---
# NOTE(review): nothing here controls which of the duplicate rows survives.
silver_df = silver_df.dropDuplicates(["Name"])

# --- Resulting cleaned DataFrame ---
print("\nSchema of the cleaned DataFrame:")
silver_df.printSchema()

print("\nSample of cleaned data:")
display(silver_df)



In [0]:
from pyspark.sql.functions import regexp_replace, split, explode

# Flatten the stringified cuisine list held in "Cuisine" into a comma-separated string:
#   1. the gaps between quoted items (`' '` or `'  '`) are replaced with ",";
#   2. every `['` and `']` wrapper is then removed.
# The patterns are raw strings so the regex backslashes reach Spark literally; the earlier
# non-raw "\['|'\]" relied on the invalid escape sequences \[ and \], which Python warns about.
cuisine_separator_pattern = r"' '|'  '"
cuisine_bracket_pattern = r"\['|'\]"

# NOTE: "CuisineList" is still a plain string column (not an array), and the source
# "Cuisine" column is dropped, so this cell cannot be re-run on its own afterwards.
silver_df = silver_df.withColumn(
    "CuisineList",
    regexp_replace(
        regexp_replace("Cuisine", cuisine_separator_pattern, ","),
        cuisine_bracket_pattern,
        "",
    ),
).drop("Cuisine")
display(silver_df.limit(10))

In [0]:
# Validate the unique values of every column by displaying them one column at a time.
for column_name in silver_df.columns:
    print(f"Distinct values in column '{column_name}':")
    distinct_values = silver_df.select(column_name).distinct()
    display(distinct_values)

In [0]:
# Persist the cleaned DataFrame as parquet under the "/silver" folder of the working
# directory, replacing whatever an earlier run wrote there.
output_dir = working_directory + "/silver"

(
    silver_df.write
    .mode("overwrite")
    .parquet(output_dir)
)