In [0]:
%pip uninstall -y databricks_helpers 
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers 

In [0]:
# Identifier for this exercise; it is handed to DataDerpDatabricksHelpers below.
exercise_name = "final_day_presentation"

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers

# Build the course helper from the notebook's dbutils handle and the exercise name.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

# NOTE(review): presumably the user running the notebook and a per-user,
# per-exercise storage location — confirm against databricks_helpers.
current_user = helpers.current_user()
# Base path for everything below: input is read from "<working_directory>/silver"
# and results are written under "<working_directory>/gold".
working_directory = helpers.working_directory()

print(f"Your current working directory is: {working_directory}")

In [0]:
# Load the input dataset (parquet) that the analyses below are built on.
# NOTE(review): the names say "gold", but the path points at the "/silver"
# folder; the "/gold" folder is where this notebook writes its results.
# The names are kept because later cells reference `gold_df` — confirm the
# intended layer and consider renaming.

gold_layer_path = working_directory + "/silver"

gold_df = spark\
    .read\
    .parquet(gold_layer_path)

# Show the schema and a preview of the loaded data.
print(f"Schema of the raw DataFrame:")
gold_df.printSchema()
display(gold_df)

### Determine the top 5 restaurants in each city.

In [0]:
from pyspark.sql.functions import col, when, row_number
from pyspark.sql import Window
from pyspark.sql import DataFrame

# Top-rated restaurants per city
def top_five_res(gold_df: DataFrame, top_n: int = 5) -> DataFrame:
    """Return the best-ranked restaurants within each city.

    Restaurants are ranked separately for every "City" by "Avg_rating"
    (descending), using "Total_ratings" (descending) to break ties, and only
    the first ``top_n`` rows of each city are kept.

    Args:
        gold_df: Restaurant DataFrame containing the ranking columns and the
            columns selected for the result.
        top_n: Number of restaurants to keep per city. Defaults to 5.

    Returns:
        A DataFrame with the columns Name, Area, City, Avg_rating,
        Total_ratings, Cuisine, Cost_for_two and Vegetarian. The helper
        "rank" column is not part of the result.
    """
    window_spec = (
        Window.partitionBy("City")
        .orderBy(col("Avg_rating").desc(), col("Total_ratings").desc())
    )

    ranked_restaurants_df = gold_df.withColumn("rank", row_number().over(window_spec))

    # Filter on "rank" while the column is still part of the DataFrame and
    # only then project it away; filtering after the select would reference a
    # column the projection has already removed.
    top_n_per_city = (
        ranked_restaurants_df
        .filter(col("rank") <= top_n)
        .select("Name", "Area", "City", "Avg_rating", "Total_ratings", "Cuisine", "Cost_for_two", "Vegetarian")
    )
    return top_n_per_city


# Compute the per-city ranking once; the plotting and write cells further
# down reuse `top_five_restaurants`.
top_five_restaurants = top_five_res(gold_df)
display(top_five_restaurants)


In [0]:
import plotly.express as px
import pandas as pd
from pyspark.sql import DataFrame 

def display_top_five(top_five_restaurants: DataFrame) -> None:
    """Show a dot plot of restaurant ratings with one facet row per city.

    Args:
        top_five_restaurants: Spark DataFrame providing the columns Name,
            City, Avg_rating, Total_ratings, Cuisine and Cost_for_two.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Plotly Express needs pandas, so convert the Spark DataFrame first.
    plot_frame = top_five_restaurants.toPandas()

    # Shared styling for the markers and the rating labels next to them.
    marker_style = dict(size=16, line=dict(width=2, color='DarkSlateGrey'))
    label_font = dict(family="Arial", size=12, color='DarkSlateGrey')

    # One scatter row per city; each dot is a restaurant labelled with its rating.
    fig = px.scatter(
        plot_frame,
        x="Avg_rating",
        y="Name",
        facet_row="City",
        color="Avg_rating",
        hover_data=["Total_ratings", "Cuisine", "Cost_for_two"],
        color_continuous_scale='Plasma',
        facet_row_spacing=0.03,
        text="Avg_rating",
    )

    fig.update_traces(
        marker=marker_style,
        textfont=label_font,
        textposition='middle right',
    )

    # Tall, narrow canvas so the stacked facets stay readable.
    fig.update_layout(
        title="<b>Top 5 Restaurant Ratings by City</b>",
        title_font=dict(size=28, family="Arial, bold"),
        width=1000,
        height=2000,
        xaxis_title="<b>Average Rating</b>",
        template="plotly_white",
        yaxis=dict(tickfont=dict(size=14)),
        xaxis=dict(tickfont=dict(size=12)),
    )

    # Give every facet its own y axis (own restaurant names and tick labels).
    fig.update_yaxes(matches=None, showticklabels=True)

    # Move each facet label (text like "City=Delhi") into the y-axis title of
    # its subplot and blank out the original annotation.
    # NOTE(review): assumes the n-th annotation belongs to `yaxis<n>` — confirm.
    for axis_number, facet_label in enumerate(fig.layout.annotations, start=1):
        city_name = facet_label.text.split('=')[-1]

        axis = fig.layout[f'yaxis{axis_number}']
        axis.title = f'<b>{city_name}</b>'
        axis.title.font = dict(size=18)

        facet_label.text = ""

    fig.show()

# Render the chart for the top-five result computed in the previous cell.
display_top_five(top_five_restaurants)


In [0]:
# Persist the top-five-per-city result in its own folder of the gold layer.
# Each gold dataset needs a distinct target path: mode("overwrite") replaces
# whatever is already stored at the target, so two datasets written to one
# shared path cannot both survive a full notebook run.
output_dir = working_directory + "/gold/top_five_restaurants"

top_five_restaurants.write.mode("overwrite").parquet(output_dir)

### Build a profile of what a "successful" restaurant looks like, to guide new business strategies.

In [0]:
from pyspark.sql.functions import col, when, count

# Label every restaurant with a 'Success_Score' tier.
def success_score(gold_df: DataFrame) -> DataFrame:
    """Add a "Success_Score" column classifying each restaurant.

    Tiers, checked in this order:
      * "Effective": Avg_rating >= 4.5, Total_ratings >= 500 and
        Delivery_time < 30.
      * "Efficient": 4.0 <= Avg_rating < 4.5, 100 <= Total_ratings < 500 and
        30 <= Delivery_time < 40.
      * "Relevant": every row that matches neither tier above.

    Args:
        gold_df: DataFrame with the columns Avg_rating, Total_ratings and
            Delivery_time.

    Returns:
        ``gold_df`` with the additional string column "Success_Score".
    """
    rating = col("Avg_rating")
    votes = col("Total_ratings")
    delivery = col("Delivery_time")

    # Top tier: all three thresholds must hold at once.
    is_effective = (rating >= 4.5) & (votes >= 500) & (delivery < 30)

    # Middle tier: each metric has to fall inside its half-open band.
    is_efficient = (
        (rating >= 4.0) & (rating < 4.5) &
        (votes >= 100) & (votes < 500) &
        (delivery >= 30) & (delivery < 40)
    )

    tier = (
        when(is_effective, "Effective")
        .when(is_efficient, "Efficient")
        .otherwise("Relevant")
    )
    return gold_df.withColumn("Success_Score", tier)

# Keep the identifying columns plus the tier, exposed under a display-friendly
# column name that the heatmap cell relies on.
success_profile_df = (
    success_score(gold_df)
    .select("Name", "Area", "City", "Success_Score")
    .withColumnRenamed("Success_Score", "Success Score")
)

# Print how many restaurants landed in each tier.
(
    success_profile_df
    .groupBy("Success Score")
    .count()
    .withColumnRenamed("count", "Restaurant Count")
    .show()
)

success_profile_df.printSchema()
display(success_profile_df)


In [0]:
import plotly.express as px
import pandas as pd

from pyspark.sql import DataFrame 

def display_success_score(success_profile: DataFrame) -> None:
    """Show a heatmap of the success-score mix (in percent) for every city.

    Args:
        success_profile: Spark DataFrame with a "City" column and a
            "Success Score" column.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Work on the DataFrame that was passed in instead of reaching for a
    # notebook-level variable, so the function plots what the caller asks for.
    success_pd = success_profile.toPandas()

    # Count restaurants per (city, score) pair.
    city_counts = pd.crosstab(success_pd['City'], success_pd['Success Score'])

    # Turn each city's counts into percentages of that city's total.
    city_ratios = city_counts.div(city_counts.sum(axis=1), axis=0) * 100

    fig = px.imshow(city_ratios,
                    text_auto='.0f',  # Show cell values without decimals
                    aspect="auto",
                    labels=dict(x="Success Score", y="City", color="Percentage (%)"),
                    color_continuous_scale=px.colors.sequential.GnBu,
                    title="<b>Heatmap of Success Score Ratios by City</b>"
                )

    # --- Style Customizations ---
    fig.update_layout(
        title_font=dict(size=24, family="Arial, bold"),
        xaxis_title="<b>Success Score Category</b>",
        yaxis_title="<b>City</b>"
    )
    fig.update_xaxes(side="top")  # Put the score categories above the heatmap

    fig.show()


# Plot the success profile (not the top-five frame, which has no score column).
display_success_score(success_profile_df)



In [0]:
# Persist the success profile in its own folder of the gold layer.
# Writing straight to "<working_directory>/gold" with mode("overwrite") would
# replace anything another cell stored at that same path, so this dataset
# gets a dedicated subdirectory.
# NOTE(review): the "Success Score" column name contains a space; some Spark
# versions reject such names when writing Parquet — confirm on the target runtime.
output_dir = working_directory + "/gold/success_profile"

success_profile_df.write.mode("overwrite").parquet(output_dir)