<a href="https://colab.research.google.com/github/gs1charancharan/masai/blob/main/Girja_Shankar_Assignment1_13122025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Girja Shankar_Assignment1_13122025***

***Assignment1_Q1***
Problem: Build a Streamlit app that follows the narrative flow Title → KPIs → Trend → Breakdown. Use caching for the data load and include sidebar filters for smoker and time. KPIs: total bill sum, average tip percentage, average party size.
Task 1: Implement cached data loader, filters in sidebar, and KPIs in a single row.
Task 2: Show a Plotly trend (avg total_bill by day) and a category breakdown (avg tip by time).

In [None]:

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# -----------------------------------
# Task 1: Cached data loader
# -----------------------------------
@st.cache_data
def load_data():
    """Return the tips dataset bundled with Plotly Express.

    The function is wrapped in ``st.cache_data`` so the dataset is not
    reloaded on every Streamlit rerun.
    """
    return px.data.tips()  # built-in dataset

df = load_data()

# Chronological order of the days in the tips dataset. A plain groupby sorts
# the labels alphabetically (Fri, Sat, Sun, Thur), which would make the
# "trend" line jump around the week.
DAY_ORDER = ["Thur", "Fri", "Sat", "Sun"]

# -----------------------------------
# Sidebar Filters
# -----------------------------------
st.sidebar.header("Filters")

smoker_filter = st.sidebar.multiselect(
    "Select Smoker Status",
    options=df["smoker"].unique(),
    default=df["smoker"].unique()
)

time_filter = st.sidebar.multiselect(
    "Select Time",
    options=df["time"].unique(),
    default=df["time"].unique()
)

# Apply filters: keep rows matching both the smoker and the time selection.
filtered_df = df[
    (df["smoker"].isin(smoker_filter)) &
    (df["time"].isin(time_filter))
]

# -----------------------------------
# Title (Narrative Start)
# -----------------------------------
st.title("Restaurant Billing Analysis Dashboard")

# With nothing selected every KPI would be NaN and both charts empty, so tell
# the user what to do and halt this script run instead.
if filtered_df.empty:
    st.warning(
        "No records match the selected filters. "
        "Select at least one smoker status and one time."
    )
    st.stop()

# -----------------------------------
# KPIs (Single Row)
# -----------------------------------
total_bill_sum = filtered_df["total_bill"].sum()
# Mean of the per-bill tip ratios, expressed as a percentage.
avg_tip_pct = (filtered_df["tip"] / filtered_df["total_bill"]).mean() * 100
avg_party_size = filtered_df["size"].mean()

col1, col2, col3 = st.columns(3)

col1.metric("Total Bill (Sum)", f"${total_bill_sum:,.2f}")
col2.metric("Avg Tip %", f"{avg_tip_pct:.2f}%")
col3.metric("Avg Party Size", f"{avg_party_size:.2f}")

# -----------------------------------
# Task 2: Trend Chart
# Avg total_bill by day
# -----------------------------------
trend_df = (
    filtered_df
    .groupby("day")["total_bill"]
    .mean()
    .reindex(DAY_ORDER)   # chronological row order, so the line runs Thur -> Sun
    .dropna()             # days removed by the filters have no mean to plot
    .rename_axis("day")
    .reset_index()
)

trend_fig = px.line(
    trend_df,
    x="day",
    y="total_bill",
    title="Average Total Bill by Day",
    category_orders={"day": DAY_ORDER},
    markers=True
)

st.plotly_chart(trend_fig, use_container_width=True)

# -----------------------------------
# Task 2: Category Breakdown
# Avg tip by time
# -----------------------------------
breakdown_df = (
    filtered_df
    .groupby("time", as_index=False)["tip"]
    .mean()
)

breakdown_fig = px.bar(
    breakdown_df,
    x="time",
    y="tip",
    title="Average Tip by Time of Day",
    text_auto=".2f"
)

st.plotly_chart(breakdown_fig, use_container_width=True)
st.write("This is a sample Streamlit application.")


ModuleNotFoundError: No module named 'streamlit'

In [None]:
from pathlib import Path

# The launch cell runs `streamlit run` on this file name. Previously the app
# source was only a bare string expression, so the file was never created.
APP_FILENAME = "Girja_Shankar_Assignment1_13122025.py"

# Full source of the Streamlit dashboard, written to APP_FILENAME below.
APP_SOURCE = """
import streamlit as st
import pandas as pd
import plotly.express as px

# ----------------------------------
# Page configuration
# ----------------------------------
st.set_page_config(
    page_title="Girja Shankar Restaurant Dashboard",
    layout="wide"
)

# Chronological day order; a plain groupby would sort the labels
# alphabetically (Fri, Sat, Sun, Thur).
DAY_ORDER = ["Thur", "Fri", "Sat", "Sun"]

# ----------------------------------
# Cached data loader
# ----------------------------------
@st.cache_data
def load_data():
    return px.data.tips()

df = load_data()

# ----------------------------------
# Sidebar filters
# ----------------------------------
st.sidebar.header("Filters")

smoker = st.sidebar.multiselect(
    "Smoker",
    options=df["smoker"].unique(),
    default=df["smoker"].unique()
)

time = st.sidebar.multiselect(
    "Time",
    options=df["time"].unique(),
    default=df["time"].unique()
)

filtered_df = df[
    (df["smoker"].isin(smoker)) &
    (df["time"].isin(time))
]

# ----------------------------------
# Title
# ----------------------------------
st.title("Restaurant Billing Analysis")

# An empty selection would give NaN KPIs and blank charts; explain and halt.
if filtered_df.empty:
    st.warning(
        "No records match the selected filters. "
        "Select at least one smoker status and one time."
    )
    st.stop()

# ----------------------------------
# KPIs
# ----------------------------------
kpi1, kpi2, kpi3 = st.columns(3)

kpi1.metric("Total Bill ($)", f"{filtered_df['total_bill'].sum():.2f}")
kpi2.metric(
    "Avg Tip (%)",
    f"{(filtered_df['tip']/filtered_df['total_bill']).mean()*100:.2f}"
)
kpi3.metric("Avg Party Size", f"{filtered_df['size'].mean():.2f}")

# ----------------------------------
# Trend chart
# ----------------------------------
trend_df = (
    filtered_df
    .groupby("day")["total_bill"]
    .mean()
    .reindex(DAY_ORDER)
    .dropna()
    .rename_axis("day")
    .reset_index()
)

trend_fig = px.line(
    trend_df,
    x="day",
    y="total_bill",
    title="Average Total Bill by Day",
    category_orders={"day": DAY_ORDER},
    markers=True
)

st.plotly_chart(trend_fig, use_container_width=True)

# ----------------------------------
# Breakdown chart
# ----------------------------------
breakdown_df = filtered_df.groupby("time", as_index=False)["tip"].mean()

breakdown_fig = px.bar(
    breakdown_df,
    x="time",
    y="tip",
    title="Average Tip by Time",
    text_auto=".2f"
)

st.plotly_chart(breakdown_fig, use_container_width=True)

st.write("This is a sample Streamlit application.")

st.success("Streamlit is working!")
"""

# Write the app to disk so it can be launched with `streamlit run`.
Path(APP_FILENAME).write_text(APP_SOURCE, encoding="utf-8")
print(f"Wrote {APP_FILENAME}")

'\nimport streamlit as st\nimport pandas as pd\nimport plotly.express as px\n\n# ----------------------------------\n# Page configuration\n# ----------------------------------\nst.set_page_config(\n    page_title="Girja Shankar Restaurant Dashboard",\n    layout="wide"\n)\n\n# ----------------------------------\n# Cached data loader\n# ----------------------------------\n@st.cache_data\ndef load_data():\n    return px.data.tips()\n\ndf = load_data()\n\n# ----------------------------------\n# Sidebar filters\n# ----------------------------------\nst.sidebar.header("Filters")\n\nsmoker = st.sidebar.multiselect(\n    "Smoker",\n    options=df["smoker"].unique(),\n    default=df["smoker"].unique()\n)\n\ntime = st.sidebar.multiselect(\n    "Time",\n    options=df["time"].unique(),\n    default=df["time"].unique()\n)\n\nfiltered_df = df[\n    (df["smoker"].isin(smoker)) &\n    (df["time"].isin(time))\n]\n\n# ----------------------------------\n# Title\n# ----------------------------------\ns

In [None]:
# Launch the Streamlit app from the script file named below; the file must
# already exist in the working directory and streamlit must be installed.
!python -m streamlit run "Girja_Shankar_Assignment1_13122025.py"


***Assignment1_Q3***
Problem: Create small multiples (facets) showing total_bill distributions by day separately for smokers and non-smokers, keep unified y-axis range, and remove redundant legends/labels.
Task 1: Use Plotly Express facet_col to create two facets (smoker).
Task 2: Set identical y-range across facets and declutter repeated labels.

In [None]:
import plotly.express as px

df = px.data.tips()

# One histogram facet per smoker status, stacked by day. The legend lists the
# days in chronological order rather than in order of first appearance.
fig = px.histogram(
    df,
    x="total_bill",
    color="day",
    facet_col="smoker",
    nbins=30,
    category_orders={"day": ["Thur", "Fri", "Sat", "Sun"]},
    title="Total Bill Distribution by Day (Smokers vs Non-Smokers)"
)

# Unified Y-axis across facets: every y-axis follows the range of the first.
fig.update_yaxes(matches="y")

# Remove repeated y-axis labels: clear all titles, then label only the
# left-most facet. (Setting the title on every axis repeated "Count".)
fig.update_yaxes(title_text="")
fig.update_yaxes(title_text="Count", col=1)

# Facet headings default to "smoker=No" / "smoker=Yes"; make them readable.
fig.for_each_annotation(
    lambda a: a.update(text=a.text.replace("smoker=", "Smoker: "))
)

# Clean layout
fig.update_layout(
    legend_title_text="Day",
    bargap=0.1
)

fig.show()


***Assignment1_Q4***
Problem: Implement a function that checks whether a given categorical color palette remains distinguishable under a simplified deuteranopia simulation. Return pass/fail for a palette used for days (Thur, Fri, Sat, Sun).
Task 1: Write a palette-check function that simulates deuteranopia (approx) and computes pairwise Lab distances; fail if any distance < threshold.
Task 2: Run the check on px.colors.qualitative.Safe subset for 4 categories.

In [None]:
import numpy as np
import itertools
import plotly.express as px

# ---------------------------------
# Convert Plotly rgb() to tuple
# ---------------------------------
def plotly_rgb_to_tuple(rgb_str):
    """Convert a colour string to an ``(r, g, b)`` tuple of floats in 0-1.

    Supported formats:
      * ``"rgb(r, g, b)"`` with 0-255 channels (the format of the Safe palette)
      * ``"rgba(r, g, b, a)"`` - the alpha channel is ignored
      * ``"#RRGGBB"`` and the short form ``"#RGB"`` (used by other Plotly
        qualitative palettes)

    Raises:
        ValueError: if the string is not in one of the formats above.
    """
    text = rgb_str.strip()

    if text.startswith("#"):
        digits = text[1:]
        if len(digits) == 3:
            # Short hex: each digit is doubled, "#abc" -> "#aabbcc".
            digits = "".join(ch * 2 for ch in digits)
        if len(digits) != 6:
            raise ValueError(f"Unsupported hex colour: {rgb_str!r}")
        return tuple(int(digits[i:i + 2], 16) / 255 for i in (0, 2, 4))

    # Functional notation: keep only what is inside the parentheses.
    inner = text
    if "(" in text:
        inner = text[text.index("(") + 1:].rstrip(") \t")
    channels = inner.split(",")
    if len(channels) < 3:
        raise ValueError(f"Unsupported colour string: {rgb_str!r}")
    # Only the first three entries are colour channels; a 4th would be alpha.
    return tuple(float(part) / 255 for part in channels[:3])

# ---------------------------------
# Deuteranopia simulation (approx)
# ---------------------------------
def simulate_deuteranopia(rgb):
    """Approximate how an RGB colour appears under deuteranopia.

    The colour is multiplied by a fixed 3x3 mixing matrix and the result is
    clipped to the 0-1 range.

    Args:
        rgb: sequence of three channel values (0-1 scale).

    Returns:
        numpy array with the three transformed channel values.
    """
    red_row = [0.625, 0.375, 0.000]
    green_row = [0.700, 0.300, 0.000]
    blue_row = [0.000, 0.300, 0.700]
    transform = np.array([red_row, green_row, blue_row])

    simulated = transform @ np.array(rgb)
    return np.clip(simulated, 0, 1)

# ---------------------------------
# Simple LAB-like transform (safe)
# ---------------------------------
def rgb_to_lab_simple(rgb):
    """Map an RGB triple to a simple three-component, Lab-like vector.

    This is not a true CIELAB conversion. The components are:
      1. a weighted sum of the channels (0.2126 R + 0.7152 G + 0.0722 B),
      2. the red minus green difference,
      3. the blue minus green difference.

    Args:
        rgb: indexable of three channel values.

    Returns:
        numpy array of the three components.
    """
    red, green, blue = rgb[0], rgb[1], rgb[2]
    lightness = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    red_green = red - green
    blue_green = blue - green
    return np.array([lightness, red_green, blue_green])

# ---------------------------------
# Palette accessibility check
# ---------------------------------
def palette_accessibility_check(colors, threshold=0.15):
    """Check that palette colours stay distinguishable under simulated deuteranopia.

    Each colour string is parsed, passed through the deuteranopia simulation
    and mapped to the simplified Lab-like space. The Euclidean distance of
    every colour pair is printed in turn; the check stops at the first pair
    closer than ``threshold``.

    Args:
        colors: iterable of colour strings accepted by ``plotly_rgb_to_tuple``.
        threshold: minimum acceptable pairwise distance.

    Returns:
        "FAIL" if a pair is closer than ``threshold``, otherwise "PASS".
    """
    projected = [
        rgb_to_lab_simple(simulate_deuteranopia(plotly_rgb_to_tuple(color)))
        for color in colors
    ]

    total = len(projected)
    for first in range(total):
        for second in range(first + 1, total):
            gap = np.linalg.norm(projected[first] - projected[second])
            print(f"Distance {first}-{second}: {gap:.3f}")
            if gap < threshold:
                return "FAIL"

    return "PASS"

# ---------------------------------
# Task 2: Run check
# ---------------------------------
days = ["Thur", "Fri", "Sat", "Sun"]
# One palette colour per day category.
safe_palette = px.colors.qualitative.Safe[:len(days)]

# Echo the inputs first, then run the check (which prints each pairwise
# distance) and report its verdict.
for label, value in (("Days:", days), ("Colors:", safe_palette)):
    print(label, value)
print("Accessibility Result:", palette_accessibility_check(safe_palette))


Days: ['Thur', 'Fri', 'Sat', 'Sun']
Colors: ['rgb(136, 204, 238)', 'rgb(204, 102, 119)', 'rgb(221, 204, 119)', 'rgb(17, 119, 51)']
Distance 0-1: 0.516
Distance 0-2: 0.591
Distance 0-3: 0.476
Distance 1-2: 0.177
Distance 1-3: 0.565
Distance 2-3: 0.729
Accessibility Result: PASS


In [None]:
# Assignment1_Q5: mean tip per $10 total_bill bin with a 95% bootstrap confidence band

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Load dataset
df = px.data.tips()

# -----------------------------------
# Bin total_bill into $10 bins
# -----------------------------------
bin_width = 10
# Edges run from 0 up to the first multiple of bin_width at or above the
# largest bill, so every bill falls inside a bin.
bins = np.arange(0, df["total_bill"].max() + bin_width, bin_width)

df["bill_bin"] = pd.cut(
    df["total_bill"],
    bins=bins,
    include_lowest=True
)

# Mean total_bill of the rows in each bin (not the geometric bin centre).
# `observed` is passed explicitly: relying on the default for a categorical
# grouper emits a pandas FutureWarning. False keeps the existing result,
# including empty bins as NaN.
bin_mid = df.groupby("bill_bin", observed=False)["total_bill"].mean()



def bootstrap_ci(data, n_boot=1000, ci=95, rng=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence or array of numeric observations.
        n_boot: number of bootstrap resamples.
        ci: confidence level in percent (95 gives the 2.5th and 97.5th
            percentiles of the resampled means).
        rng: optional source of randomness. ``None`` (default) draws from
            NumPy's global random state, so ``np.random.seed`` controls the
            result as before. An int seed or a ``numpy.random.Generator``
            gives an isolated, reproducible stream.

    Returns:
        ``(lower, upper)`` bounds of the interval. Both are NaN when ``data``
        is empty.
    """
    values = np.asarray(data, dtype=float)
    if len(values) == 0:
        # Nothing to resample; return NaN bounds directly.
        return float("nan"), float("nan")

    sampler = np.random if rng is None else np.random.default_rng(rng)

    # Draw all resamples at once: one row per bootstrap replicate.
    resamples = sampler.choice(values, size=(n_boot, len(values)), replace=True)
    boot_means = resamples.mean(axis=1)

    tail = (100 - ci) / 2
    lower, upper = np.percentile(boot_means, [tail, 100 - tail])
    return lower, upper


# Compute mean tip and CI per bin
# Seed NumPy's global random state so the bootstrap bounds (and the plotted
# band) are the same on every run.
np.random.seed(42)

results = []

# observed=True iterates only over bins that contain rows and avoids the
# pandas FutureWarning about the changing default for categorical groupers.
for b, g in df.groupby("bill_bin", observed=True):
    tips = g["tip"].values
    # A single observation cannot give a meaningful interval; skip such bins.
    if len(tips) > 1:
        mean_tip = tips.mean()
        ci_low, ci_high = bootstrap_ci(tips, n_boot=1000)
        results.append({
            "bill_bin": b,
            # Centre of the bin, matching the "bin midpoint" x-axis label.
            # Taken from the right edge because include_lowest nudges the
            # first bin's left edge slightly below 0.
            "bin_mid": b.right - bin_width / 2,
            "mean_tip": mean_tip,
            "ci_low": ci_low,
            "ci_high": ci_high
        })

# Explicit columns keep the frame usable even if no bin qualified.
res_df = pd.DataFrame(
    results,
    columns=["bill_bin", "bin_mid", "mean_tip", "ci_low", "ci_high"]
)



# Trace order matters: the lower CI bound uses fill="tonexty", which shades
# up to the trace added just before it (the upper CI bound).
fig = go.Figure(data=[
    # Upper CI bound: invisible line that only anchors the shaded band.
    go.Scatter(
        x=res_df["bin_mid"],
        y=res_df["ci_high"],
        mode="lines",
        line={"width": 0},
        showlegend=False,
    ),
    # Lower CI bound: invisible line, shaded up to the upper bound.
    go.Scatter(
        x=res_df["bin_mid"],
        y=res_df["ci_low"],
        mode="lines",
        fill="tonexty",
        fillcolor="rgba(0,100,200,0.2)",
        line={"width": 0},
        name="95% CI",
    ),
    # Mean tip per bin, drawn on top of the band.
    go.Scatter(
        x=res_df["bin_mid"],
        y=res_df["mean_tip"],
        mode="lines+markers",
        line={"color": "blue"},
        name="Mean Tip",
    ),
])

layout_options = {
    "title": "Mean Tip vs Total Bill (Binned, with 95% Bootstrap CI)",
    "xaxis_title": "Total Bill ($, bin midpoint)",
    "yaxis_title": "Mean Tip ($)",
    "template": "plotly_white",
}
fig.update_layout(**layout_options)

fig.show()


  bin_mid = df.groupby("bill_bin")["total_bill"].mean()
  for b, g in df.groupby("bill_bin"):
