In [0]:
# Reinstall the databricks_helpers package from its GitHub repository:
# remove any copy that is already installed, then install from the git URL.
# No branch, tag or commit is given in the URL, so the install is not pinned to a fixed revision.
%pip uninstall -y databricks_helpers 
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers 

In [0]:
# Name of this exercise; it is passed to DataDerpDatabricksHelpers in the next cell.
exercise_name = "final_day_presentation"
     

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers

# Build the helper object from the notebook's `dbutils` handle and the exercise name.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

# Ask the helper for the current user and the working directory.
# The load cell below builds its input path from `working_directory`.
current_user = helpers.current_user()
working_directory = helpers.working_directory()

print(f"Your current working directory is: {working_directory}")
     

In [0]:
# Read the Parquet data stored under the "silver" folder of the working directory.
# (The previous comment said "Bronze", but the path below points at the silver layer.)
# The DataFrame keeps the name `gold_df` because the cells below refer to it by that name.

silver_layer_path = working_directory + "/silver"

gold_df = spark\
    .read\
    .parquet(silver_layer_path)

# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("Schema of the raw DataFrame:")
gold_df.printSchema()

# Preview only the first 100 rows rather than the whole dataset.
display(gold_df.limit(100))

## Relationship between Delivery Time and Average Rating
Does delivery time have a significant impact on a restaurant's average rating?

In [0]:
import plotly.express as px

# --- Chart 1: average rating for each delivery time -------------------------
# Keep only rows that have a rating, average the rating per distinct delivery
# time, bring the result into pandas and order it by delivery time so the line
# is drawn left to right.
delivery_time_df = (
    gold_df
    .filter("avg_rating IS NOT NULL")
    .groupBy("Delivery_time")
    .agg({"Avg_rating": "avg"})
    .toPandas()
    .sort_values("Delivery_time")
)

# The dict-style aggregation names its output column "avg(Avg_rating)".
fig = px.line(delivery_time_df, x="Delivery_time", y="avg(Avg_rating)")
fig.show()

# --- Chart 2: average delivery time for each rating value -------------------
# Same idea with the axes swapped: group by rating and average the delivery time.
avg_rating_df = (
    gold_df
    .filter("avg_rating IS NOT NULL")
    .groupBy("Avg_rating")
    .agg({"Delivery_time": "avg"})
    .toPandas()
    .sort_values("Avg_rating")
)

avg_rating_fig = px.line(avg_rating_df, x="Avg_rating", y="avg(Delivery_time)")
avg_rating_fig.show()

## Most Popular & Underserved Cuisines
 

In [0]:
from pyspark.sql.functions import explode, split, log, ln, length, avg, row_number, col
from pyspark.sql.window import Window
import plotly.express as px
from pyspark.sql import SparkSession # Assuming 'spark' is passed in or accessible

def weighted_average(cuisines_df):
    """Return ``cuisines_df`` with an extra ``Weighted_Rating`` column.

    The column is computed as ``Avg_rating + ln(Total_ratings) / 20``, i.e. the
    average rating plus a term based on the natural logarithm of the number of
    ratings.

    Args:
        cuisines_df: DataFrame providing ``Avg_rating`` and ``Total_ratings``.

    Returns:
        A new DataFrame with ``Weighted_Rating`` appended.
    """
    rating_count_term = ln(col('Total_ratings')) / 20
    weighted_rating = col('Avg_rating') + rating_count_term
    return cuisines_df.withColumn('Weighted_Rating', weighted_rating)

def find_top_cuisine(cuisines_df, window_spec):
    """Return the rank-1 cuisine of every window partition.

    Rows without an ``Avg_rating`` are dropped, ``Weighted_Rating`` is averaged
    per (City, Cuisine) into ``avg_Weighted_Rating``, and the rows are numbered
    with ``row_number()`` over ``window_spec``. Only the row numbered 1 in each
    partition is kept.

    Despite the name, whether the result is the best or the worst cuisine is
    decided entirely by the ordering inside ``window_spec``.

    Args:
        cuisines_df: DataFrame with ``City``, ``Cuisine``, ``Avg_rating`` and
            ``Weighted_Rating`` columns.
        window_spec: Window used for ranking.

    Returns:
        DataFrame with columns ``City``, ``Cuisine``, ``avg_Weighted_Rating``
        and ``rank`` (always 1).
    """
    rated_rows = cuisines_df.filter('Avg_rating IS NOT NULL')

    city_cuisine_scores = rated_rows.groupBy("City", "Cuisine").agg(
        avg("Weighted_Rating").alias("avg_Weighted_Rating")
    )

    ranked = city_cuisine_scores.withColumn("rank", row_number().over(window_spec))
    return ranked.filter("rank = 1")

def _plot_city_cuisine_map(city_cuisine_df, city_coords, title):
    """Show a map with one marker per city for the given per-city cuisine rows.

    Args:
        city_cuisine_df: DataFrame with ``City``, ``Cuisine`` and
            ``avg_Weighted_Rating`` columns.
        city_coords: DataFrame with ``City``, ``Latitude`` and ``Longitude``.
        title: Title displayed on the figure.
    """
    # Inner join: cities that have no coordinates are left off the map.
    map_pd = (
        city_cuisine_df
        .join(city_coords, on="City", how="inner")
        .select("City", "Cuisine", "avg_Weighted_Rating", "Latitude", "Longitude")
        .toPandas()
    )

    fig = px.scatter_mapbox(
        map_pd, lat="Latitude", lon="Longitude", color="avg_Weighted_Rating",
        hover_name="City", hover_data=["Cuisine", "avg_Weighted_Rating"],
        size="avg_Weighted_Rating", zoom=3, mapbox_style="carto-positron", title=title
    )
    fig.show()


def analyze_cuisines_and_visualize(gold_df):
    """Rank cuisines by weighted rating and map the best and worst one per city.

    Steps:
      1. Explode ``CuisineList`` into one row per (restaurant, cuisine).
      2. Add ``Weighted_Rating`` and drop cuisine names of 16+ characters.
      3. Print the cuisines ordered by average weighted rating (all cities).
      4. Pick the highest-scoring cuisine per city.
      5. Pick the lowest-scoring cuisine per city.
      6-8. Join both results with hard-coded city coordinates and show a map
           for each.

    Args:
        gold_df: DataFrame with ``Name``, ``CuisineList``, ``City``,
            ``Avg_rating`` and ``Total_ratings`` columns.

    Returns:
        None. The overall ranking is printed and two figures are shown.
    """
    # 1. Prepare Cuisines DataFrame (Explode 'CuisineList')
    # Split on commas together with any surrounding whitespace, so "A, B" and
    # "A,B" yield the same cuisine names instead of separate " B" / "B" groups.
    cuisines_df = gold_df.withColumn("Cuisine", explode(split("CuisineList", r"\s*,\s*")))\
                         .select("Name", "Cuisine", "City", "Avg_rating", "Total_ratings")

    # 2. Calculate Weighted Rating and Filter
    # Weighted_Rating = Avg_rating + ln(Total_ratings)/20
    weighted_cuisine_city_df = cuisines_df.transform(weighted_average).filter(
        length('Cuisine') < 16 # Filter out junk values
    )

    # 3. Find Best Cuisines Overall
    overall_best_cuisines = weighted_cuisine_city_df.filter('Avg_rating IS NOT NULL')\
        .groupBy("Cuisine")\
        .agg(avg("Weighted_Rating").alias("avg_Weighted_Rating"))\
        .orderBy(col("avg_Weighted_Rating").desc())

    overall_best_cuisines.show()

    # 4. Find Best (Top) Cuisine Per City
    # Descending order places NULL scores last by default, so rank 1 is a real score.
    window_spec_best = Window.partitionBy("City").orderBy(col("avg_Weighted_Rating").desc())

    top_cuisines_df = weighted_cuisine_city_df.transform(find_top_cuisine, window_spec_best)

    # 5. Find Underserved (Low) Cuisine Per City
    # Ascending order places NULL scores FIRST by default. A NULL score (for
    # example when the logarithm of the rating count is undefined) would then be
    # ranked 1 and reach the map as NaN, so NULLs are pushed to the end explicitly.
    window_spec_low = Window.partitionBy("City").orderBy(col("avg_Weighted_Rating").asc_nulls_last())

    low_cuisines_df = weighted_cuisine_city_df.transform(find_top_cuisine, window_spec_low)

    # 6. Prepare City Coordinates (Hardcoded for Indian Cities)
    city_coords_data = [
        ("Mumbai", 19.0760, 72.8777), ("Delhi", 28.7041, 77.1025),
        ("Bengaluru", 12.9716, 77.5946), ("Chennai", 13.0827, 80.2707),
        ("Kolkata", 22.5726, 88.3639), ("Hyderabad", 17.3850, 78.4867),
        ("Ahmedabad", 23.0225, 72.5714), ("Pune", 18.5204, 73.8567),
        ("Jaipur", 26.9124, 75.7873), ("Surat", 21.1702, 72.8311)
    ]
    city_coords = spark.createDataFrame(city_coords_data, ["City", "Latitude", "Longitude"])

    # 7. Visualization for Best Cuisines Per City
    _plot_city_cuisine_map(top_cuisines_df, city_coords, "Best Rated Cuisine in each region")

    # 8. Visualization for Underserved Cuisines Per City
    _plot_city_cuisine_map(low_cuisines_df, city_coords, "Least Rated Cuisine in each region")


# Run the cuisine analysis on the DataFrame loaded earlier in the notebook:
# it prints the overall cuisine ranking and shows the two per-city maps.
analyze_cuisines_and_visualize(gold_df)