### Setting Up the Notebook

In [0]:
%pip uninstall -y databricks_helpers 
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers 

In [0]:
# Identifier for this exercise; it is handed to DataDerpDatabricksHelpers when the
# helper object is built in a later cell.
exercise_name = "final_day_presentation"   

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers

# Build the course helper for this exercise.
# NOTE(review): `dbutils` is not defined in this notebook — presumably the global that
# Databricks injects into every notebook session; confirm before running elsewhere.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

# NOTE(review): `current_user` is not referenced by the cells visible here — confirm
# whether a later cell still needs it.
current_user = helpers.current_user()
# Base location for this exercise; the data-loading cell appends "/silver" to it.
working_directory = helpers.working_directory()

print(f"Your current working directory is: {working_directory}")
     

In [0]:
# Load the Parquet data stored under the "silver" folder of the working directory.
# The frame keeps the name `gold_df` because the analysis cells below refer to it
# by that name.

silver_layer_path = working_directory + "/silver"

parquet_reader = spark.read
gold_df = parquet_reader.parquet(silver_layer_path)

# Show the schema, then preview only the first 100 rows instead of the whole frame.
print("Schema of the raw DataFrame:")
gold_df.printSchema()
display(gold_df.limit(100))

## Questions:
### 1. How do restaurant ratings and popularity (number of ratings) vary across different cities and localities?

In [0]:
from pyspark.sql.functions import regexp_replace, col, avg, sum, count, round, min, max
import plotly.express as px

def _min_max_scaled(column_name, lower, upper):
    """Build a Column expression that min-max scales ``column_name`` onto [0, 1].

    Args:
        column_name: Name of the numeric column to scale.
        lower: Smallest value of the column, or None when there is no data.
        upper: Largest value of the column, or None when there is no data.

    Returns:
        ``(value - lower) / (upper - lower)`` as a Column expression. When the bounds
        are missing or equal, that formula would divide by zero, so every non-null
        value is scored 0.0 instead (null values stay null).
    """
    # Local import: the import line of this cell does not bring `when` into scope.
    from pyspark.sql.functions import when

    if lower is None or upper is None or upper == lower:
        return when(col(column_name).isNotNull(), 0.0)
    return (col(column_name) - lower) / (upper - lower)


def analyze_and_plot_city_performance(input_df, weight_rating=0.6, weight_popularity=0.4):
    """Aggregate restaurants per city, score each city and plot the scores.

    Note: ``round``, ``min``, ``max`` and ``sum`` used below are the
    ``pyspark.sql.functions`` versions imported by this cell, not the Python builtins.

    Args:
        input_df: Spark DataFrame with the columns ``City``, ``Avg_rating`` and
            ``Total_ratings``.
        weight_rating: Weight of the normalized rating in ``city_score``.
        weight_popularity: Weight of the normalized popularity in ``city_score``.

    Returns:
        A 3-tuple of Spark DataFrames:
        the per-city aggregation ordered by ``avg_city_rating`` (descending),
        the same aggregation ordered by ``avg_city_popularity`` (descending),
        and the aggregation extended with ``rating_score``, ``popularity_score``
        and ``city_score``.

    Side effects:
        Prints a progress message, prints the scored table with ``.show()`` and
        renders a Plotly grouped bar chart.
    """
    # --- Calculations (Aggregation) ---
    print(" Performing city-level aggregation...")
    city_analysis = input_df.groupBy("City").agg(
        round(avg("Avg_rating"), 2).alias("avg_city_rating"),
        round(avg("Total_ratings")).alias("avg_city_popularity"),
        sum("Total_ratings").alias("total_city_popularity"),
        count("*").alias("total_restaurants_in_city")
    )

    # --- Two orderings of the same aggregation, returned to the caller ---
    city_analysis_by_rating = city_analysis.orderBy(col("avg_city_rating").desc())
    city_analysis_by_popularity = city_analysis.orderBy(col("avg_city_popularity").desc())

    # Bounds of each metric across all cities; an aggregation without groupBy always
    # yields exactly one row, whose fields are None when there are no cities at all.
    min_max_values = city_analysis.agg(
        min("avg_city_rating").alias("min_rating"),
        max("avg_city_rating").alias("max_rating"),
        min("avg_city_popularity").alias("min_popularity"),
        max("avg_city_popularity").alias("max_popularity")
    ).first()

    # Normalize both metrics to a 0-1 scale (safe when all cities tie or none exist).
    rating_expr = _min_max_scaled(
        "avg_city_rating", min_max_values["min_rating"], min_max_values["max_rating"]
    )
    popularity_expr = _min_max_scaled(
        "avg_city_popularity", min_max_values["min_popularity"], min_max_values["max_popularity"]
    )
    df_normalized = (
        city_analysis
        .withColumn("rating_score", round(rating_expr, 3))
        .withColumn("popularity_score", round(popularity_expr, 3))
    )

    # Weighted blend of the two normalized metrics; by default rating counts slightly
    # more than popularity.
    df_with_score = df_normalized.withColumn(
        "city_score",
        round((col("rating_score") * weight_rating) + (col("popularity_score") * weight_popularity), 3)
    )

    # --- Display the final results ---
    df_with_score.select(
        "City", "avg_city_rating", "avg_city_popularity",
        "rating_score", "popularity_score", "city_score", "total_restaurants_in_city"
    ).orderBy(col("city_score").desc()).show()

    # --- Plotting ---
    # Convert to pandas and sort by the overall score. The sorted frame is the one
    # that gets plotted: a categorical x axis follows the order in which values first
    # appear in the data, so the best-scoring cities come first.
    pandas_df_sorted = df_with_score.toPandas().sort_values("city_score", ascending=False)

    # Reshape the data from wide to long format for Plotly Express
    df_melted = pandas_df_sorted.melt(
        id_vars=['City', 'city_score'],
        value_vars=['rating_score', 'popularity_score'],
        var_name='metric_type',
        value_name='normalized_score'
    )

    # Create the grouped bar chart
    fig = px.bar(
        df_melted,
        x="City",
        y="normalized_score",
        color="metric_type",  # Creates the groups for rating vs. popularity
        barmode="group",
        title="Rating vs. Popularity Score by City",
        labels={
            "normalized_score": " Score (0 to 1)",
            "City": "City",
            "metric_type": "Metric"
        },
        height=600  # Adjust height for better readability
    )

    fig.show()

    return city_analysis_by_rating, city_analysis_by_popularity, df_with_score

# The function returns a tuple of three Spark DataFrames (aggregation ordered by rating,
# aggregation ordered by popularity, and the scored aggregation).
city_restaurant_analysis = analyze_and_plot_city_performance(gold_df)

# display() is meant for a single displayable object, so show each DataFrame on its own
# instead of passing the whole tuple.
for city_result_df in city_restaurant_analysis:
    display(city_result_df)


### 5. What is the market share and performance of vegetarian vs. non-vegetarian restaurants in key areas?

In [0]:
from pyspark.sql.functions import col, count, sum, round, avg, when
from pyspark.sql.window import Window
import plotly.express as px

def analyze_veg_nonveg_performance(input_df):
    """Compare vegetarian and non-vegetarian restaurants per city and plot the results.

    Note: ``sum`` and ``round`` used below are the ``pyspark.sql.functions`` versions
    imported by this cell, not the Python builtins.

    Args:
        input_df: Spark DataFrame with the columns ``City``, ``Vegetarian``,
            ``Avg_rating`` and ``Total_ratings``.

    Returns:
        A 2-tuple of Spark DataFrames:
        ``city_market_share_df`` with, per city and restaurant type, the restaurant
        count, the city total and ``market_share_pct``;
        ``city_performance_df`` with, per city and restaurant type, ``avg_rating``
        and ``avg_popularity_score``.

    Side effects:
        Renders two Plotly grouped bar charts (market share and average rating).
    """
    # Drop rows with an unknown restaurant type, then replace the flag with a
    # readable label that also serves as the legend entry in the charts.
    known_type_df = input_df.filter(col("Vegetarian").isNotNull())
    labelled_df = known_type_df.withColumn(
        "Vegetarian",
        when(col("Vegetarian") == True, "Veg").otherwise("Non-Veg")
    )

    # --- 1. Market Share Calculation ---
    veg_counts_city = labelled_df.groupBy("City", "Vegetarian").agg(
        count("*").alias("restaurant_count")
    )
    # Window over each city so every row can see its city's total restaurant count.
    city_window = Window.partitionBy("City")
    city_market_share_df = (
        veg_counts_city
        .withColumn("total_restaurants_in_city", sum("restaurant_count").over(city_window))
        .withColumn(
            "market_share_pct",
            round((col("restaurant_count") / col("total_restaurants_in_city")) * 100, 2)
        )
    )

    # --- 2. Performance Calculation ---
    city_performance_df = labelled_df.groupBy("City", "Vegetarian").agg(
        round(avg("Avg_rating"), 2).alias("avg_rating"),
        round(avg("Total_ratings")).alias("avg_popularity_score")
    )

    # Convert both results to pandas for plotting. groupBy output has no guaranteed
    # row order, so sort here to keep bar and legend order stable between runs.
    # Only local results are used: the notebook-level `performance_results` does not
    # exist yet while this function is running.
    plot_order = ["City", "Vegetarian"]
    pandas_market_share = city_market_share_df.toPandas().sort_values(plot_order)
    pandas_restaurant_performance = city_performance_df.toPandas().sort_values(plot_order)

    # Grouped bar chart: market share per city
    market_share = px.bar(
        pandas_market_share,
        x="City",
        y="market_share_pct",
        color="Vegetarian",  # This creates the "Veg" vs "Non-Veg" groups
        barmode="group",
        title="Market Share of Veg vs. Non-Veg Restaurants by City",
        labels={"market_share_pct": "Market Share (%)", "City": "City", "Vegetarian": "Restaurant Type"}
    )

    market_share.show()

    # Grouped bar chart: average rating per city
    restaurant_performance = px.bar(
        pandas_restaurant_performance,
        x="City",
        y="avg_rating",
        color="Vegetarian",
        barmode="group",
        title="Performance (Average Rating) of Veg vs. Non-Veg Restaurants by City",
        labels={"avg_rating": "Average Rating", "City": "City", "Vegetarian": "Restaurant Type"},
        color_discrete_sequence=px.colors.qualitative.Pastel, # Or D3, Plotly, Light, T10
        template="plotly_white"
    )

    restaurant_performance.show()

    return city_market_share_df, city_performance_df

# --- How to use the function ---
# `gold_df` is the DataFrame loaded from the silver-layer path earlier in this notebook.
# The call returns two Spark DataFrames: the market-share table and the performance table.
market_share_results, performance_results = analyze_veg_nonveg_performance(gold_df)
