In [4]:

import pandas as pd
import altair as alt
import logging
from pathlib import Path
from typing import Union
import requests
from sklearn.linear_model import LinearRegression
from typing import List, Tuple



In [None]:

# --- Logging Configuration ---
# Configure the root logger for the notebook: INFO level and
# "<timestamp> [<LEVEL>] <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# --- Config Parameters  ---
# PROJECT_ROOT is the kernel's current working directory; DATA_PATH is resolved
# relative to its parent, so the notebook must be started from a directory that
# sits next to "data/".
PROJECT_ROOT: Path = Path.cwd()  
DATA_PATH: Path = PROJECT_ROOT.parent / "data" / "the_rise_of_healthcare_jobs_disclosed_data_by_msa.csv"

START_ROW_INDEX: int = 33       # number of leading rows dropped by clean_data (rows containing states)
TOP_N_MSA: int = 10             # top 10 MSAs for visualization

logging.info(f"Project root set to: {PROJECT_ROOT}")


2025-11-11 19:43:16,676 [INFO] Project root set to: e:\StudyTime\Fall\DSI\2025-autumn-bfi\notebooks


In [7]:
def read_data(file_path: Union[str, Path]) -> pd.DataFrame:
    """
    Load the CSV located at ``file_path`` into a DataFrame.

    On success the path and the resulting shape are logged at INFO level.
    Any failure is logged at ERROR level and the original exception is
    re-raised unchanged, so callers still see FileNotFoundError (or whatever
    pandas raised).

    Returns: Pandas DataFrame
    """
    try:
        loaded: pd.DataFrame = pd.read_csv(file_path)
    except FileNotFoundError:
        # Missing file gets its own message; it must be handled before the
        # generic branch because FileNotFoundError is also an Exception.
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as exc:
        logging.error(f"Unexpected error while reading file: {exc}")
        raise
    else:
        logging.info(f"Data successfully read from {file_path}. Shape: {loaded.shape}")
        return loaded


def clean_data(df: pd.DataFrame, start_row: int) -> pd.DataFrame:
    """
    Drop the leading rows of the raw frame and renumber what remains.

    Rows at positions ``start_row`` and beyond are kept; the result gets a
    fresh 0-based index. The input frame is not modified.

    Returns: Cleaned DataFrame
    """
    kept_rows: pd.DataFrame = df.iloc[start_row:]
    cleaned_df: pd.DataFrame = kept_rows.reset_index(drop=True)

    new_shape = cleaned_df.shape
    logging.info(f"Cleaned data (dropped first {start_row} rows). New shape: {new_shape}")
    return cleaned_df


def plot_top_msa_healthcare_share(df: pd.DataFrame, top_n: int = 10) -> alt.Chart:
    """
    Build a horizontal bar chart of the ``top_n`` rows with the largest
    'healthcare_share_prime2022' value, labelled by 'metro_title'.

    Bars are sorted by descending share and coloured on a three-stop maroon
    ramp driven by the same share value.

    Returns: Altair Chart
    """
    share_field = 'healthcare_share_prime2022'
    top_df: pd.DataFrame = df.nlargest(top_n, share_field).copy()

    # Encodings are built separately so the chart assembly below stays short.
    y_encoding = alt.Y(
        'metro_title:N',
        sort='-x',  # longest bar first
        axis=alt.Axis(labelFontSize=13, labelLimit=350, title=None),
    )
    x_encoding = alt.X(
        f'{share_field}:Q',
        title='Healthcare Employment Share (2022)',
        axis=alt.Axis(format='.0%', labelFontSize=12, titleFontSize=14, grid=False),
    )
    color_encoding = alt.Color(
        f'{share_field}:Q',
        scale=alt.Scale(range=['#ecd2c2', '#a05252', '#800000']),
        legend=None,
    )
    hover = ['metro_title', alt.Tooltip(f'{share_field}:Q', format='.2%')]

    bars = alt.Chart(top_df).mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    bars = bars.encode(y=y_encoding, x=x_encoding, color=color_encoding, tooltip=hover)
    bars = bars.properties(
        title=f'Top {top_n} MSAs by Healthcare Employment Share (2022)',
        width=700,
        height=450,
    )

    # Global styling goes last.
    chart: alt.Chart = bars.configure_title(fontSize=18, font='Lato', anchor='start')
    chart = chart.configure_axis(labelFont='Lato', titleFont='Lato', grid=False)
    chart = chart.configure_view(strokeWidth=0)

    logging.info(f"Generated chart for top {top_n} MSAs.")
    return chart


In [None]:
# --- Read and Clean ---
# data_raw is the CSV as read from DATA_PATH; data_clean is the same frame with
# the first START_ROW_INDEX rows removed. data_clean is also the input of the
# lollipop, scatter and bubble cells further down.
data_raw: pd.DataFrame = read_data(DATA_PATH)
data_clean: pd.DataFrame = clean_data(data_raw, START_ROW_INDEX)

# --- Plot ---
# Bar chart of the TOP_N_MSA rows with the highest 2022 healthcare share.
chart: alt.Chart = plot_top_msa_healthcare_share(data_clean, TOP_N_MSA)
chart.display()  


2025-11-11 19:43:22,549 [INFO] Data successfully read from e:\StudyTime\Fall\DSI\2025-autumn-bfi\data\the_rise_of_healthcare_jobs_disclosed_data_by_msa.csv. Shape: (130, 26)
2025-11-11 19:43:22,552 [INFO] Cleaned data (dropped first 33 rows). New shape: (97, 26)
2025-11-11 19:43:22,594 [INFO] Generated chart for top 10 MSAs.


In [None]:

# ------------------------------------------------------
# Configuration & Logging
# ------------------------------------------------------
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers, so this call only matters if this cell is the first one to
# configure logging in the kernel.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Hyperparameters (replace these with imports from config.py later)
TOP_N_MSA: int = 10                               # number of MSAs shown in the lollipop chart
COLOR_STEM: str = "#a05252"                       # stroke colour of the lollipop stems
COLOR_DOT: str = "#800000"                        # fill colour of the lollipop heads
DATA_COLUMN: str = "hc_emp_share_prime_change"    # column ranked and plotted on the y axis
LABEL_COLUMN: str = "metro_title"                 # column used for the x-axis labels
ZERO_BASE_COL: str = "zero"                       # helper column (all zeros) added by the plot function
CHART_WIDTH: int = 720                            # chart width passed to .properties()
CHART_HEIGHT: int = 450                           # chart height; later cells rebind this name to 500

# ------------------------------------------------------
# Function Definition
# ------------------------------------------------------
def plot_lollipop_healthcare_growth(
    df: pd.DataFrame,
    top_n: int = TOP_N_MSA,
    width: int = CHART_WIDTH,
    height: int = CHART_HEIGHT,
) -> alt.Chart:
    """
    Creates a lollipop chart of the top N MSAs with the largest increase
    in healthcare employment share between 1980–2022.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing at least 'metro_title' and 'hc_emp_share_prime_change' columns.
    top_n : int, optional
        Number of MSAs to display, by default TOP_N_MSA
    width : int, optional
        Chart width, by default the value CHART_WIDTH had when this function
        was defined.
    height : int, optional
        Chart height, by default the value CHART_HEIGHT had when this function
        was defined. Binding it as a default keeps the chart size stable even
        though later cells rebind CHART_HEIGHT.

    Returns
    -------
    alt.Chart
        Altair chart object (rule layer + circle layer with shared styling)
    """

    # Data preparation: keep the top_n rows and add a constant baseline column
    # that the stems use as their lower end.
    top_df: pd.DataFrame = df.nlargest(top_n, DATA_COLUMN).copy()
    top_df[ZERO_BASE_COL] = 0
    logging.info(f"Selected top {top_n} MSAs for healthcare employment share change visualization.")

    # Base chart (shared encoding). Both layers inherit the same x/y
    # definitions so that `sort="-y"` refers to the same field in each layer.
    base = alt.Chart(top_df).encode(
        x=alt.X(
            f"{LABEL_COLUMN}:N",
            sort="-y",
            axis=alt.Axis(
                labelAngle=-30,
                labelFontSize=11,
                labelLimit=250,
                title=None
            )
        ),
        y=alt.Y(
            f"{DATA_COLUMN}:Q",
            title="Increase in Healthcare Employment Share (1980–2022)",
            axis=alt.Axis(format=".1%", labelFontSize=11, titleFontSize=13, grid=False)
        )
    )

    # Lollipop stems (value -> baseline). The primary y channel is left as the
    # data column inherited from `base`; only y2 is added. Overriding y with
    # the zero column (as before) made "-y" sort this layer by a constant and
    # gave the two layers conflicting sort definitions on the shared x scale.
    stems = base.mark_rule(stroke=COLOR_STEM, strokeWidth=2).encode(
        y2=ZERO_BASE_COL
    )

    # Circle heads (values)
    dots = base.mark_circle(size=130, color=COLOR_DOT).encode(
        tooltip=[
            alt.Tooltip(f"{LABEL_COLUMN}:N", title="MSA"),
            alt.Tooltip(f"{DATA_COLUMN}:Q", title="Change (%)", format=".2%")
        ]
    )

    # Combine layers and style
    chart = (
        (stems + dots)
        .properties(
            title=f"Top {top_n} MSAs with the Largest Increase in Healthcare Employment Share (1980–2022)",
            width=width,
            height=height
        )
        .configure_title(fontSize=18, font="Lato", anchor="start")
        .configure_axis(labelFont="Lato", titleFont="Lato", grid=False)
        .configure_view(strokeWidth=0)
    )

    logging.info("Lollipop chart created successfully.")
    return chart



# Build and render the lollipop chart from the cleaned frame created in the
# read/clean cell above (this cell fails on a fresh kernel if that cell has not run).
chart_lollipop = plot_lollipop_healthcare_growth(data_clean, TOP_N_MSA)
chart_lollipop.display()


2025-11-11 19:48:37,135 [INFO] Selected top 10 MSAs for healthcare employment share change visualization.


2025-11-11 19:48:37,185 [INFO] Lollipop chart created successfully.


In [None]:

# ------------------------------------------------------
# Config
# ------------------------------------------------------
# NOTE: logging.basicConfig is a no-op once the root logger has handlers,
# which is the case after the first configuration cell has run.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Styling for the healthcare-vs-manufacturing scatter plot.
COLOR_SCATTER = "#800000"       # point colour
COLOR_REGRESSION = "#2c3e50"    # regression line colour
POINT_SIZE = 80                 # mark_circle size
ALPHA = 0.75                    # point opacity
CHART_WIDTH = 720
CHART_HEIGHT = 500              # NOTE: rebinds CHART_HEIGHT (450 in the lollipop cell)

# Columns plotted on each axis and the column holding the MSA name.
X_COL = "manu_share_prime_change"
Y_COL = "hc_emp_share_prime_change"
LABEL_COL = "metro_title"

# Keywords to highlight (partial matches allowed)
HIGHLIGHT_KEYWORDS = ["Pittsburgh", "Cleveland", "Rochester", "Boston", "Detroit"]

# ------------------------------------------------------
def plot_healthcare_vs_manufacturing(
    df: pd.DataFrame,
    *,
    x_col: str = X_COL,
    y_col: str = Y_COL,
    label_col: str = LABEL_COL,
    scatter_color: str = COLOR_SCATTER,
    regression_color: str = COLOR_REGRESSION,
    alpha: float = ALPHA,
    width: int = CHART_WIDTH,
    height: int = CHART_HEIGHT,
) -> alt.Chart:
    """
    Scatter plot with regression line and labeled MSAs (partial match, no stroke).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns named by ``x_col``, ``y_col`` and ``label_col``.
    x_col, y_col, label_col : str, keyword-only
        Column names for the x axis, the y axis and the MSA label.
    scatter_color, regression_color : str, keyword-only
        Colours of the points and of the regression line.
    alpha : float, keyword-only
        Point opacity.
    width, height : int, keyword-only
        Chart size.

    All keyword defaults are bound to the module-level configuration values at
    the moment this function is defined. Later cells rebind several of those
    names (X_COL, COLOR_SCATTER, ALPHA, ...); binding them here keeps a later
    call of this function plotting the manufacturing relationship instead of
    silently picking up another cell's settings.

    Returns
    -------
    alt.Chart
        Layered chart: scatter + regression line + highlighted labels + stats text.
    """

    def _tooltips() -> list:
        # Fresh tooltip definitions for each layer that needs them.
        return [
            alt.Tooltip(f"{label_col}:N", title="MSA"),
            alt.Tooltip(f"{y_col}:Q", title="Healthcare Change (%)", format=".2%"),
            alt.Tooltip(f"{x_col}:Q", title="Manufacturing Change (%)", format=".2%"),
        ]

    # Find matching rows for any city that contains one of the keywords
    # (case-insensitive substring match on the label column).
    highlight_mask = df[label_col].apply(
        lambda x: any(k.lower() in str(x).lower() for k in HIGHLIGHT_KEYWORDS)
    )
    highlight_df = df[highlight_mask].copy()
    logging.info(f"Highlighting {len(highlight_df)} MSAs: {highlight_df[label_col].tolist()}")

    # Scatter points
    scatter = (
        alt.Chart(df)
        .mark_circle(size=POINT_SIZE, color=scatter_color, opacity=alpha)
        .encode(
            x=alt.X(
                f"{x_col}:Q",
                title="Change in Manufacturing Employment Share (1980–2022)",
                axis=alt.Axis(format=".1%", labelFontSize=11, titleFontSize=13),
            ),
            y=alt.Y(
                f"{y_col}:Q",
                title="Change in Healthcare Employment Share (1980–2022)",
                axis=alt.Axis(format=".1%", labelFontSize=11, titleFontSize=13),
            ),
            tooltip=_tooltips(),
        )
    )

    # Regression line (computed by the Vega regression transform)
    regression = (
        alt.Chart(df)
        .transform_regression(x_col, y_col, method="linear", as_=["x", "y"])
        .mark_line(color=regression_color, strokeWidth=2)
        .encode(x="x:Q", y="y:Q")
    )

    # Compute regression stats for the annotation. Rows are dropped when
    # EITHER column is missing; dropping NaNs per column and then re-aligning
    # with .loc raised KeyError whenever y was missing on a row where x was not.
    try:
        paired = df[[x_col, y_col]].dropna()
        if len(paired) < 2:
            raise ValueError(f"need at least 2 complete observations, got {len(paired)}")
        reg = LinearRegression().fit(paired[[x_col]], paired[y_col])
        r2 = reg.score(paired[[x_col]], paired[y_col])
        slope = reg.coef_[0]
        stats_label = f"Slope = {slope:.2f}, R² = {r2:.2f}"
    except Exception as e:
        # Best effort: the chart is still useful without the annotation.
        stats_label = "Regression unavailable"
        logging.warning(f"Could not compute regression stats: {e}")

    # Stats annotation placed at fixed pixel offsets (x=10, y=15).
    stats_text = (
        alt.Chart(pd.DataFrame({"text": [stats_label]}))
        .mark_text(align="left", x=10, y=15, fontSize=12, font="Lato", color="black")
        .encode(text="text:N")
    )

    # Highlighted MSA labels (no stroke)
    labels = (
        alt.Chart(highlight_df)
        .mark_text(
            align="left",
            dx=7, dy=-5,
            fontSize=10,
            font="Lato",
            color="#2c3e50"
        )
        .encode(
            x=f"{x_col}:Q",
            y=f"{y_col}:Q",
            text=f"{label_col}:N",
            tooltip=_tooltips(),
        )
    )

    # Combine all layers
    chart = (
        (scatter + regression + labels + stats_text)
        .properties(
            title="Healthcare Employment Growth vs Manufacturing Employment Decline (1980–2022)",
            width=width,
            height=height,
        )
        .configure_title(font="Lato", fontSize=18, anchor="start")
        .configure_axis(labelFont="Lato", titleFont="Lato", grid=True)
        .configure_view(strokeWidth=0)
    )

    return chart


# ------------------------------------------------------
# Build and render the healthcare-vs-manufacturing scatter plot
# (uses data_clean from the read/clean cell above).
# ------------------------------------------------------
chart_scatter = plot_healthcare_vs_manufacturing(data_clean)
chart_scatter.display()


2025-11-11 20:26:57,914 [INFO] Highlighting 5 MSAs: ['Boston-Cambridge-Newton, MA-NH', 'Cleveland-Elyria, OH', 'Detroit-Warren-Dearborn, MI', 'Pittsburgh, PA', 'Rochester, NY']


In [None]:

# ------------------------------------------------------
# Configuration & Logging
# ------------------------------------------------------
# NOTE: logging.basicConfig is a no-op once the root logger has handlers,
# which is the case after the first configuration cell has run.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Hyperparameters
# NOTE: COLOR_SCATTER, COLOR_REGRESSION, ALPHA, CHART_WIDTH, CHART_HEIGHT,
# X_COL, Y_COL and LABEL_COL were already bound by the previous scatter-plot
# cell and are rebound here. Any function that reads these globals at call
# time will see the values below once this cell has run.
COLOR_SCATTER = "#004B87"       # deep UChicago blue
COLOR_REGRESSION = "#FF6B35"    # contrasting orange line
ALPHA = 0.7
CHART_WIDTH = 720
CHART_HEIGHT = 500
POINT_SIZE_RANGE = [30, 900]    # scale for population bubbles

# Columns for the x axis, the y axis, the bubble size and the MSA label.
X_COL = "change_earnings"
Y_COL = "hc_emp_share_prime_change"
SIZE_COL = "ln_msa_pop2022"
LABEL_COL = "metro_title"

# ------------------------------------------------------
# Function Definition
# ------------------------------------------------------
def plot_earnings_vs_healthcare(
    df: pd.DataFrame,
    *,
    x_col: str = X_COL,
    y_col: str = Y_COL,
    size_col: str = SIZE_COL,
    label_col: str = LABEL_COL,
    scatter_color: str = COLOR_SCATTER,
    regression_color: str = COLOR_REGRESSION,
    alpha: float = ALPHA,
    width: int = CHART_WIDTH,
    height: int = CHART_HEIGHT,
) -> alt.Chart:
    """
    Creates a bubble scatterplot showing the relationship between earnings growth
    and healthcare employment growth across MSAs, with bubble size driven by
    ``size_col`` (by default 'ln_msa_pop2022').
    Adds a linear regression trendline and displays regression statistics (slope, R²).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns named by ``x_col``, ``y_col``, ``size_col``
        and ``label_col``. With the defaults these are:
        - 'change_earnings'
        - 'hc_emp_share_prime_change'
        - 'ln_msa_pop2022'
        - 'metro_title'
    x_col, y_col, size_col, label_col : str, keyword-only
        Column names for the x axis, y axis, bubble size and MSA label.
    scatter_color, regression_color : str, keyword-only
        Colours of the bubbles and of the regression line.
    alpha : float, keyword-only
        Bubble opacity.
    width, height : int, keyword-only
        Chart size.

    All keyword defaults are bound to the module-level configuration values at
    the moment this function is defined, so re-running another cell that
    rebinds the same global names does not change what this function plots.

    Returns
    -------
    alt.Chart
        Altair chart object (scatter + regression + stats text)
    """

    logging.info("Starting bubble scatterplot creation: earnings vs healthcare employment growth.")

    # Scatter layer (bubbles)
    scatter = (
        alt.Chart(df)
        .mark_circle(opacity=alpha)
        .encode(
            x=alt.X(
                f"{x_col}:Q",
                title="Earnings Growth (1980–2022)",
                axis=alt.Axis(format=".1%", labelFontSize=11, titleFontSize=13)
            ),
            y=alt.Y(
                f"{y_col}:Q",
                title="Healthcare Employment Growth (1980–2022)",
                axis=alt.Axis(format=".1%", labelFontSize=11, titleFontSize=13)
            ),
            size=alt.Size(
                f"{size_col}:Q",
                title="Population (ln, 2022)",
                scale=alt.Scale(range=POINT_SIZE_RANGE)
            ),
            color=alt.value(scatter_color),
            tooltip=[
                alt.Tooltip(f"{label_col}:N", title="MSA"),
                alt.Tooltip(f"{x_col}:Q", title="Earnings Growth (%)", format=".2%"),
                alt.Tooltip(f"{y_col}:Q", title="Healthcare Growth (%)", format=".2%"),
                alt.Tooltip(f"{size_col}:Q", title="Log(Population 2022)")
            ]
        )
    )

    # Regression line (computed by the Vega regression transform)
    regression = (
        alt.Chart(df)
        .transform_regression(x_col, y_col, method="linear", as_=["x", "y"])
        .mark_line(color=regression_color, strokeWidth=2)
        .encode(x="x:Q", y="y:Q")
    )

    # Compute regression statistics for the annotation. Rows are dropped when
    # EITHER column is missing; dropping NaNs per column and then re-aligning
    # with .loc raised KeyError whenever y was missing on a row where x was not.
    try:
        paired = df[[x_col, y_col]].dropna()
        if len(paired) < 2:
            raise ValueError(f"need at least 2 complete observations, got {len(paired)}")
        reg = LinearRegression().fit(paired[[x_col]], paired[y_col])
        r2 = reg.score(paired[[x_col]], paired[y_col])
        slope = reg.coef_[0]
        stats_label = f"Slope = {slope:.2f}, R² = {r2:.2f}"
        logging.info(f"Regression computed: {stats_label}")
    except Exception as e:
        # Best effort: the chart is still useful without the annotation.
        stats_label = "Regression unavailable"
        logging.warning(f"Could not compute regression stats: {e}")

    # Stats annotation placed at fixed pixel offsets (x=10, y=15).
    stats_text = (
        alt.Chart(pd.DataFrame({"text": [stats_label]}))
        .mark_text(align="left", x=10, y=15, fontSize=12, font="Lato", color="black")
        .encode(text="text:N")
    )

    # Combine all layers
    chart = (
        (scatter + regression + stats_text)
        .properties(
            title="Earnings Growth vs Healthcare Employment Growth\n(Bubble Size = Population, 1980–2022)",
            width=width,
            height=height
        )
        .configure_title(font="Lato", fontSize=18, anchor="start")
        .configure_axis(labelFont="Lato", titleFont="Lato", grid=True)
        .configure_view(strokeWidth=0)
        .configure_legend(titleFont="Lato", labelFont="Lato", orient="right")
    )

    logging.info("Bubble scatterplot created successfully.")
    return chart


# Build and render the earnings-vs-healthcare bubble chart from the cleaned
# frame created in the read/clean cell above.
chart_bubble = plot_earnings_vs_healthcare(data_clean)
chart_bubble.display()


2025-11-11 20:28:30,443 [INFO] Starting bubble scatterplot creation: earnings vs healthcare employment growth.
2025-11-11 20:28:30,493 [INFO] Regression computed: Slope = -0.04, R² = 0.11
2025-11-11 20:28:30,531 [INFO] Bubble scatterplot created successfully.


In [None]:

# ------------------------------------------------------
# Configuration & Logging
# ------------------------------------------------------
# NOTE: logging.basicConfig is a no-op once the root logger has handlers,
# which is the case after the first configuration cell has run.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# NOTE: rebinds DATA_PATH, which the first config cell defined as a Path built
# from PROJECT_ROOT. Here it is a plain string relative to the kernel's
# working directory.
DATA_PATH: str = "../data/the_rise_of_healthcare_jobs_disclosed_data_by_msa.csv"

# Three-colour palettes. make_colored_reg_chart uses entry 0 for the points
# and entry 1 for the regression line; entry 2 is not read by it.
PALETTES: List[List[str]] = [
    ["#800000", "#C5050C", "#FF5F05"],  # UChicago tones
    ["#005C99", "#0099CC", "#66CCFF"],  # Blues
    ["#1B5E20", "#43A047", "#A5D6A7"],  # Greens
]

# Variable relationships: (x, y, x_label, y_label, palette)
# Each tuple is unpacked positionally into make_colored_reg_chart(df, *tuple),
# so the field order must match that function's parameters.
RELATIONSHIPS: List[Tuple[str, str, str, str, List[str]]] = [
    ("change_ln_population", "hc_emp_share_prime_change",
     "Change in Population (log)", "Change in Healthcare Employment Share", PALETTES[0]),
    
    ("change_earnings", "hc_emp_share_prime_change",
     "Change in Earnings", "Change in Healthcare Employment Share", PALETTES[0]),
    
    ("change_college", "hc_emp_share_prime_change",
     "Change in College Share", "Change in Healthcare Employment Share", PALETTES[1]),
    
    ("manu_share_prime_change", "hc_emp_share_prime_change",
     "Change in Manufacturing Share", "Change in Healthcare Employment Share", PALETTES[1]),
    
    ("change_medicare_share", "hc_emp_share_prime_change",
     "Change in Medicare Share", "Change in Healthcare Employment Share", PALETTES[2]),
    
    ("change_non_hc_share_lbfr", "hc_emp_share_prime_change",
     "Change in Non-Healthcare Labor Force Participation Rate (LFPR)",
     "Change in Healthcare Employment Share", PALETTES[2]),
]


# ------------------------------------------------------
# Helper Function: Regression Stats
# ------------------------------------------------------
def compute_regression_stats(df: pd.DataFrame, x: str, y: str) -> str:
    """
    Return slope and R² of a simple linear regression of ``y`` on ``x`` as a
    formatted string.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both columns.
    x, y : str
        Names of the predictor and response columns.

    Returns
    -------
    str
        "Slope=<slope>, R²=<r2>" (two decimals each), or
        "Regression unavailable" when the fit cannot be computed (missing
        column, fewer than two complete observations, fitting error). The
        reason is logged at WARNING level; no exception escapes.
    """
    try:
        # Keep only rows where BOTH values are present. Dropping NaNs per
        # column and re-aligning with .loc raised KeyError whenever y was
        # missing on a row where x was not.
        paired = df[[x, y]].dropna()
        if len(paired) < 2:
            raise ValueError(f"need at least 2 complete observations, got {len(paired)}")

        model = LinearRegression().fit(paired[[x]], paired[y])
        r2 = model.score(paired[[x]], paired[y])
        slope = model.coef_[0]
        return f"Slope={slope:.2f}, R²={r2:.2f}"
    except Exception as e:
        # Best effort: callers only use the string as a chart annotation.
        logging.warning(f"Could not compute regression stats for {x} vs {y}: {e}")
        return "Regression unavailable"


# ------------------------------------------------------
# Core Function: Single Scatter + Regression Chart
# ------------------------------------------------------
def make_colored_reg_chart(
    df: pd.DataFrame,
    x: str,
    y: str,
    x_label: str,
    y_label: str,
    palette: List[str]
) -> alt.Chart:
    """
    Build one dashboard panel: scatter points, a linear regression line and a
    small text annotation with the regression statistics.

    ``palette[0]`` colours the points and ``palette[1]`` the regression line.
    No ``.configure_*()`` call is made here, so the caller can apply global
    styling once after concatenating several panels.
    """
    logging.info(f"Building chart: {y_label} vs {x_label}")

    point_color, line_color = palette[0], palette[1]

    # Shared x/y encodings for both the points and the regression line.
    x_axis = alt.Axis(labelFontSize=11, titleFontSize=12)
    y_axis = alt.Axis(labelFontSize=11, titleFontSize=12)
    base = alt.Chart(df).encode(
        x=alt.X(x, title=x_label, axis=x_axis),
        y=alt.Y(y, title=y_label, axis=y_axis),
    )

    hover = [
        alt.Tooltip("metro_title:N", title="MSA"),
        alt.Tooltip(x, title=x_label, format=".2%"),
        alt.Tooltip(y, title=y_label, format=".2%"),
    ]
    points = base.mark_circle(size=80, opacity=0.7, color=point_color).encode(tooltip=hover)

    fitted = base.transform_regression(x, y, method="linear")
    regression = fitted.mark_line(color=line_color, size=2.5)

    # Annotation with slope / R², pinned at fixed pixel offsets.
    annotation_df = pd.DataFrame({"text": [compute_regression_stats(df, x, y)]})
    stats_text = alt.Chart(annotation_df).mark_text(
        align="left", x=10, y=15, fontSize=10, font="Lato", color="black"
    ).encode(text="text:N")

    layered = points + regression + stats_text
    return layered.properties(
        width=340,
        height=300,
        title=f"{y_label} vs {x_label}",
    )


# ------------------------------------------------------
# Dashboard Builder
# ------------------------------------------------------
def build_dashboard(
    df: pd.DataFrame,
    relationships: List[Tuple],
    columns: int = 3,
) -> alt.VConcatChart:
    """
    Create a grid of scatterplots and apply unified configuration.

    Parameters
    ----------
    df : pd.DataFrame
        Data passed to every panel.
    relationships : List[Tuple]
        One tuple per panel; each is unpacked positionally into
        ``make_colored_reg_chart(df, *relationship)``.
    columns : int, optional
        Number of panels per row, by default 3. With six relationships this
        gives the original 2x3 grid; other counts are laid out row by row,
        the last row holding whatever remains.

    Returns
    -------
    alt.VConcatChart
        Rows of horizontally concatenated panels, stacked vertically, with
        shared title/axis/view configuration.

    Raises
    ------
    ValueError
        If ``relationships`` is empty or ``columns`` is smaller than 1.
    """
    if not relationships:
        raise ValueError("relationships must contain at least one entry")
    if columns < 1:
        raise ValueError(f"columns must be >= 1, got {columns}")

    charts = [make_colored_reg_chart(df, *r) for r in relationships]

    # Arrange charts into a grid: slice the flat list into rows of `columns`
    # panels instead of indexing charts[0..5], which broke for any list that
    # did not contain exactly six relationships.
    rows = [
        alt.hconcat(*charts[start:start + columns])
        for start in range(0, len(charts), columns)
    ]
    dashboard = alt.vconcat(*rows).properties(
        title="MSA-Level Economic Relationships (1980–2022)"
    )

    # Global styling is applied once, on the outermost chart.
    dashboard = (
        dashboard.configure_title(fontSize=18, font="Lato", anchor="middle")
        .configure_axis(labelFont="Lato", titleFont="Lato", grid=True)
        .configure_view(strokeWidth=0)
    )
    logging.info("Dashboard successfully created.")
    return dashboard



# Load through the same read + clean pipeline as the earlier charts. Reading
# the CSV directly kept the first START_ROW_INDEX rows (the rows containing
# states), so they were plotted and fitted in a dashboard titled "MSA-Level".
# NOTE(review): confirm that the state rows are meant to be excluded here too.
df = clean_data(read_data(DATA_PATH), START_ROW_INDEX)
final_dashboard = build_dashboard(df, RELATIONSHIPS)
final_dashboard.display()


2025-11-13 23:16:54,334 [INFO] Building chart: Change in Healthcare Employment Share vs Change in Population (log)
2025-11-13 23:16:54,436 [INFO] Building chart: Change in Healthcare Employment Share vs Change in Earnings
2025-11-13 23:16:54,502 [INFO] Building chart: Change in Healthcare Employment Share vs Change in College Share
2025-11-13 23:16:54,529 [INFO] Building chart: Change in Healthcare Employment Share vs Change in Manufacturing Share
2025-11-13 23:16:54,579 [INFO] Building chart: Change in Healthcare Employment Share vs Change in Medicare Share
2025-11-13 23:16:54,621 [INFO] Building chart: Change in Healthcare Employment Share vs Change in Non-Healthcare Labor Force Participation Rate (LFPR)
2025-11-13 23:16:55,046 [INFO] Dashboard successfully created.


In [None]:
# pandas and altair are already imported in the first cell; only Dict is new here.
import pandas as pd
import altair as alt
from typing import Dict

# ==============================================================
# Configuration
# ==============================================================

# NOTE: rebinds DATA_PATH to a different file (the merged healthcare + GDP
# dataset). Re-running an earlier cell that reads DATA_PATH after this one
# will load this file instead.
DATA_PATH = "../data/merged_healthcare_jobs_with_gdp.csv"

# UChicago-inspired color palette
# Keys are looked up by build_gdp_relationship_dashboard to colour each panel.
COLOR_PALETTE: Dict[str, str] = {
    "healthcare": "#800000",      # maroon
    "population": "#005C99",      # deep blue
    "earnings": "#FF5F05",        # orange
    "college": "#1B5E20",         # green
    "manufacturing": "#9C27B0",   # purple
}

# ==============================================================
# 2️⃣ Helper Function: Create Scatterplot with Regression
# ==============================================================

def make_scatter_chart(
    data: pd.DataFrame,
    x: str,
    y: str,
    x_label: str,
    y_label: str,
    color: str
) -> alt.Chart:
    """
    Build a scatterplot of ``y`` against ``x`` with a black regression line
    drawn on top.

    Parameters
    ----------
    data : pd.DataFrame
        Input dataset containing the x and y columns (and 'metro_title',
        which is shown in the tooltip).
    x : str
        Column name for the x-axis.
    y : str
        Column name for the y-axis.
    x_label : str
        Axis title and tooltip title for x.
    y_label : str
        Axis title and tooltip title for y.
    color : str
        Hex color for the scatter points.

    Returns
    -------
    alt.Chart
        A layered Altair chart (scatter + regression), 320x280, titled
        "<y_label> vs <x_label>".
    """
    # Axis styling is identical for both axes.
    x_encoding = alt.X(
        x,
        title=x_label,
        axis=alt.Axis(labelFontSize=11, titleFontSize=12, grid=False),
    )
    y_encoding = alt.Y(
        y,
        title=y_label,
        axis=alt.Axis(labelFontSize=11, titleFontSize=12, grid=False),
    )
    base = alt.Chart(data).encode(x=x_encoding, y=y_encoding)

    hover = [
        alt.Tooltip("metro_title:N", title="MSA"),
        alt.Tooltip(x, title=x_label, format=".2f"),
        alt.Tooltip(y, title=y_label, format=".2f"),
    ]
    scatter = base.mark_circle(size=80, opacity=0.7, color=color).encode(tooltip=hover)

    # Regression line from the Vega regression transform (default method).
    trend = base.transform_regression(x, y).mark_line(color="black", strokeWidth=2)

    layered = scatter + trend
    return layered.properties(
        width=320,
        height=280,
        title=f"{y_label} vs {x_label}",
    )


# ==============================================================
#  Helper Function: Build Dashboard
# ==============================================================

def build_gdp_relationship_dashboard(df: pd.DataFrame) -> alt.VConcatChart:
    """
    Create a dashboard of five scatterplots (three on the top row, two on the
    bottom row), each plotting one indicator against 'gdp_growth_2021_percent'.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset containing 'gdp_growth_2021_percent', 'metro_title' and the
        five indicator columns listed in the function body.

    Returns
    -------
    alt.VConcatChart
        Final dashboard layout combining all charts, with shared title, axis
        font and view configuration.
    """
    # Every panel shares the same x variable and label.
    gdp_field = "gdp_growth_2021_percent"
    gdp_label = "GDP Growth (2021, %)"

    # (y column, y label, COLOR_PALETTE key) for each panel, in display order.
    indicators = [
        ("hc_emp_share_prime_change", "Healthcare Employment Share Change", "healthcare"),
        ("change_ln_population", "Population Growth (log)", "population"),
        ("change_earnings", "Earnings Change", "earnings"),
        ("change_college", "College Share Change", "college"),
        ("manu_share_prime_change", "Manufacturing Share Change", "manufacturing"),
    ]

    panels = [
        make_scatter_chart(df, gdp_field, y_field, gdp_label, y_title, COLOR_PALETTE[palette_key])
        for y_field, y_title, palette_key in indicators
    ]

    # Grid layout (3 top, 2 bottom)
    top_row = panels[0] | panels[1] | panels[2]
    bottom_row = panels[3] | panels[4]
    grid = top_row & bottom_row

    titled = grid.properties(
        title="MSA-Level Relationships: GDP Growth (2021) and Key Economic Indicators"
    )

    # Apply consistent global styling on the outermost chart.
    styled = titled.configure_title(fontSize=18, font="Lato", anchor="middle")
    styled = styled.configure_axis(labelFont="Lato", titleFont="Lato")
    styled = styled.configure_view(strokeWidth=0)
    return styled


# ==============================================================
#  Run Visualization
# ==============================================================

# Runs when the module name is "__main__" (true inside a notebook kernel and
# when executed as a script); skipped if this code is imported as a module.
if __name__ == "__main__":
    # Load dataset (DATA_PATH points at the merged healthcare + GDP CSV here)
    df: pd.DataFrame = pd.read_csv(DATA_PATH)

    # Build dashboard
    final_chart = build_gdp_relationship_dashboard(df)

    # Display chart (for Jupyter, or export)
    final_chart.display()


