<a href="https://colab.research.google.com/github/fdmy2713-dotcom/MSc-in-Data-Science/blob/main/Farah_ADS1_Assessment_2_Clustering_and_Fitting_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Public Transport Usage & Premium Bus Services Analysis
======================================================

Datasets used:
1. monthly_ave_daily_pt_ridership.csv
   - month, mode, ridership

2. PublicTransportUtilisationAveragePublicTransportRidership.csv
   - year, mode, ridership

3. PremiumBusServicesCSV20241125.csv
   - BUS_SERVICE_NAME_TXT, OPR_DESC_TXT, ORIG_DEST_TXT, BUS_DIRCTN_TXT,
     BUS_ROUTE_SEQ_NUM, BUS_STOP_CD, BUS_STOP_DESC_TXT, RD_NAM_TXT,
     LONGTD_TXT, LATTD_TXT, OP_HR_TXT, FARE_TXT

This script creates:
- Categorical graphs (bar charts)
- Relational graphs (line plots)
- Statistical graphs (box plot, heatmap)
- K-means clustering (with elbow & silhouette plots)
- Linear regression fitting (trend over time)
"""



In [28]:
# ======================
# 1. Imports & Settings
# ======================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")
sns.set(style="whitegrid", context="talk")


In [32]:
# ======================
# 2. Data Loading
# ======================

def load_monthly_ridership(path: str) -> pd.DataFrame:
    """
    Load monthly average daily public transport ridership.
    Expected columns: month, mode, ridership
    """
    df = pd.read_csv(path)

    # Ensure expected columns exist
    df = df[["month", "mode", "ridership"]].copy()

    # Clean types
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")
    df["mode"] = df["mode"].astype(str)

    # Cleanup
    df = df.dropna(subset=["month", "mode", "ridership"])
    df = df.drop_duplicates().reset_index(drop=True)
    return df


def load_yearly_utilisation(path: str) -> pd.DataFrame:
    """
    Load yearly public transport utilisation / ridership.
    Expected columns: year, mode, ridership
    """
    df = pd.read_csv(path)

    df = df[["year", "mode", "ridership"]].copy()

    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")
    df["mode"] = df["mode"].astype(str)

    df = df.dropna(subset=["year", "mode", "ridership"])
    df = df.drop_duplicates().reset_index(drop=True)
    return df


def load_premium_bus(path: str) -> pd.DataFrame:
    """
    Load premium bus services.
    Uses service name, operator, fare, coordinates.
    """
    df = pd.read_csv(path)

    df = df[
        [
            "BUS_SERVICE_NAME_TXT",
            "OPR_DESC_TXT",
            "FARE_TXT",
            "LONGTD_TXT",
            "LATTD_TXT",
        ]
    ].copy()

    # Rename to cleaner column names
    df.rename(
        columns={
            "BUS_SERVICE_NAME_TXT": "service",
            "OPR_DESC_TXT": "operator",
            "FARE_TXT": "fare",
            "LONGTD_TXT": "longitude",
            "LATTD_TXT": "latitude",
        },
        inplace=True,
    )

    # Clean numeric fields
    df["fare"] = pd.to_numeric(df["fare"], errors="coerce")
    df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")
    df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")

    # Clean text fields
    df["service"] = df["service"].astype(str)
    df["operator"] = df["operator"].astype(str)

    # Cleanup
    df = df.dropna(subset=["service", "operator"])
    df = df.drop_duplicates().reset_index(drop=True)
    return df


In [31]:
df_monthly = load_monthly_ridership("monthly_ave_daily_pt_ridership.csv")
df_yearly = load_yearly_utilisation("PublicTransportUtilisationAveragePublicTransportRidership.csv")
df_premium = load_premium_bus("PremiumBusServicesCSV20241125.csv")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 8172: invalid start byte

In [27]:
# ======================
# 3. Categorical & Relational Plots
# ======================

def plot_bar_ridership_by_mode(monthly_df: pd.DataFrame) -> None:
    """
    Categorical graph:
    Total monthly ridership by mode (bar chart).
    """
    summary = (
        monthly_df.groupby("mode")["ridership"]
        .sum()
        .sort_values(ascending=False)
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x=summary.index, y=summary.values)
    plt.title("Total Monthly Average Daily Ridership by Mode")
    plt.xlabel("Mode of Transport")
    plt.ylabel("Total Average Daily Ridership")
    plt.tight_layout()
    plt.show()


def plot_bar_premium_by_operator(premium_df: pd.DataFrame) -> None:
    """
    Categorical graph:
    Number of premium bus services by operator.
    """
    counts = (
        premium_df.groupby("operator")["service"]
        .nunique()
        .sort_values(ascending=False)
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts.index, y=counts.values)
    plt.title("Number of Premium Bus Services by Operator")
    plt.xlabel("Operator")
    plt.ylabel("Number of Premium Services")
    plt.tight_layout()
    plt.show()


def plot_line_monthly_trend(monthly_df: pd.DataFrame) -> None:
    """
    Relational / line graph:
    Monthly ridership trend for each mode.
    Assumes 'month' is 1-12 (aggregated over years).
    """
    # Sort by month
    tmp = monthly_df.sort_values("month").copy()

    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=tmp,
        x="month",
        y="ridership",
        hue="mode",
        marker="o",
    )
    plt.title("Monthly Average Daily Ridership Trend by Mode")
    plt.xlabel("Month")
    plt.ylabel("Average Daily Ridership")
    plt.xticks(range(1, 13))
    plt.tight_layout()
    plt.show()


def plot_line_yearly_utilisation(util_df: pd.DataFrame) -> None:
    """
    Relational / line graph:
    Yearly total public transport ridership (all modes).
    """
    yearly = (
        util_df.groupby("year")["ridership"]
        .sum()
        .reset_index()
        .sort_values("year")
    )

    plt.figure(figsize=(10, 6))
    sns.lineplot(data=yearly, x="year", y="ridership", marker="o")
    plt.title("Yearly Total Public Transport Ridership (All Modes)")
    plt.xlabel("Year")
    plt.ylabel("Total Daily Ridership")
    plt.tight_layout()
    plt.show()


In [8]:
# ======================
# 4. Statistical Plots (Box & Heatmap)
# ======================

def plot_box_ridership_by_mode(util_df: pd.DataFrame) -> None:
    """
    Statistical graph:
    Box plot of yearly ridership by mode.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=util_df, x="mode", y="ridership")
    plt.title("Yearly Ridership Distribution by Mode (Box Plot)")
    plt.xlabel("Mode")
    plt.ylabel("Yearly Average Daily Ridership")
    plt.tight_layout()
    plt.show()


def plot_mode_correlation_heatmap(util_df: pd.DataFrame) -> None:
    """
    Statistical graph:
    Correlation heatmap of yearly ridership across modes.
    """
    pivot = (
        util_df.pivot_table(
            index="year",
            columns="mode",
            values="ridership",
            aggfunc="mean",
        )
        .fillna(0.0)
    )

    corr = pivot.corr(method="pearson")

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        corr,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        vmin=-1,
        vmax=1,
        square=True,
    )
    plt.title("Correlation Between Modes (Yearly Ridership)")
    plt.tight_layout()
    plt.show()


In [9]:
# ======================
# 5. K-Means Clustering (Using Monthly Ridership)
# ======================

def prepare_monthly_clustering_features(monthly_df: pd.DataFrame):
    """
    Prepare monthly ridership by mode (wide format) for clustering.
    Index: month, Columns: mode, Values: ridership.
    """
    wide = (
        monthly_df.pivot_table(
            index="month",
            columns="mode",
            values="ridership",
            aggfunc="mean",
        )
        .fillna(0.0)
    )

    feat_df = wide.reset_index()  # month + modes
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feat_df.drop(columns=["month"]))
    return X_scaled, feat_df


def plot_elbow_curve(X_scaled: np.ndarray, max_k: int = 8) -> None:
    """
    Elbow plot: inertia vs k.
    """
    inertias = []
    k_values = range(2, max_k + 1)

    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42)
        km.fit(X_scaled)
        inertias.append(km.inertia_)

    plt.figure(figsize=(8, 5))
    plt.plot(list(k_values), inertias, marker="o")
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Inertia (Within-Cluster SSE)")
    plt.title("Elbow Plot for Monthly Ridership Clustering")
    plt.xticks(list(k_values))
    plt.tight_layout()
    plt.show()


def plot_silhouette_scores(X_scaled: np.ndarray, max_k: int = 8) -> None:
    """
    Silhouette plot: average silhouette score vs k.
    """
    scores = []
    k_values = range(2, max_k + 1)

    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(X_scaled)
        scores.append(silhouette_score(X_scaled, labels))

    plt.figure(figsize=(8, 5))
    plt.plot(list(k_values), scores, marker="o")
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Average Silhouette Score")
    plt.title("Silhouette Scores for Monthly Ridership Clustering")
    plt.xticks(list(k_values))
    plt.tight_layout()
    plt.show()


def perform_kmeans(X_scaled: np.ndarray, feat_df: pd.DataFrame, k: int = 3) -> pd.DataFrame:
    """
    Run K-Means clustering on monthly ridership and attach cluster labels.
    """
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_scaled)

    feat_df = feat_df.copy()
    feat_df["cluster"] = labels

    print("\nCluster sizes (monthly ridership):")
    print(feat_df["cluster"].value_counts().sort_index())
    return feat_df


def plot_cluster_scatter(feat_df: pd.DataFrame) -> None:
    """
    Scatter plot of two mode riderships, coloured by cluster.
    Uses the first two mode columns in the dataframe.
    """
    mode_cols = [c for c in feat_df.columns if c not in ["month", "cluster"]]
    if len(mode_cols) < 2:
        print("Not enough mode columns to plot cluster scatter.")
        return

    x_col, y_col = mode_cols[0], mode_cols[1]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=feat_df,
        x=x_col,
        y=y_col,
        hue="cluster",
        palette="tab10",
        s=100,
    )
    plt.title(f"Monthly Ridership Clusters: {x_col} vs {y_col}")
    plt.xlabel(f"{x_col} Ridership")
    plt.ylabel(f"{y_col} Ridership")
    plt.legend(title="Cluster")
    plt.tight_layout()
    plt.show()


In [10]:
# ======================
# 6. Linear Regression Fitting (Year vs Ridership)
# ======================

def fit_linear_trend(util_df: pd.DataFrame):
    """
    Fit a simple linear regression:
    yearly total ridership (all modes) ~ year.
    """
    yearly = (
        util_df.groupby("year")["ridership"]
        .sum()
        .reset_index()
        .sort_values("year")
    )

    X = yearly[["year"]].values
    y = yearly["ridership"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n=== Linear Regression: Year vs Total Ridership ===")
    print(f"Intercept: {model.intercept_:.3f}")
    print(f"Slope (per year): {model.coef_[0]:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"R² score: {r2:.3f}")

    return model, yearly


def plot_linear_fit(model: LinearRegression, yearly: pd.DataFrame) -> None:
    """
    Plot actual yearly total ridership with fitted regression line.
    """
    X_all = yearly[["year"]].values
    y_all = yearly["ridership"].values
    y_fit = model.predict(X_all)

    plt.figure(figsize=(10, 6))
    plt.scatter(yearly["year"], y_all, label="Actual", alpha=0.7)
    plt.plot(yearly["year"], y_fit, color="red", label="Fitted trend", linewidth=2)
    plt.title("Yearly Total Ridership with Linear Trend Line")
    plt.xlabel("Year")
    plt.ylabel("Total Daily Ridership")
    plt.legend()
    plt.tight_layout()
    plt.show()


In [11]:
# ======================
# 7. MAIN WORKFLOW
# ======================

def main():
    # 1. Load all three datasets from your uploaded CSVs
    monthly_df = load_monthly_ridership("monthly_ave_daily_pt_ridership.csv")
    util_df = load_yearly_utilisation("PublicTransportUtilisationAveragePublicTransportRidership.csv")
    premium_df = load_premium_bus("PremiumBusServicesCSV20241125.csv")

    print("Monthly ridership shape:", monthly_df.shape)
    print("Yearly utilisation shape:", util_df.shape)
    print("Premium bus shape:", premium_df.shape)

    # 2. Categorical graphs
    plot_bar_ridership_by_mode(monthly_df)
    plot_bar_premium_by_operator(premium_df)

    # 3. Relational graphs
    plot_line_monthly_trend(monthly_df)
    plot_line_yearly_utilisation(util_df)

    # 4. Statistical graphs
    plot_box_ridership_by_mode(util_df)
    plot_mode_correlation_heatmap(util_df)

    # 5. Clustering using monthly ridership (k-means)
    X_scaled, feat_df = prepare_monthly_clustering_features(monthly_df)
    plot_elbow_curve(X_scaled, max_k=8)
    plot_silhouette_scores(X_scaled, max_k=8)

    # Choose k based on elbow/silhouette; for demo, use k=3
    clustered_months = perform_kmeans(X_scaled, feat_df, k=3)
    plot_cluster_scatter(clustered_months)

    # 6. Linear regression fitting on yearly utilisation
    model, yearly = fit_linear_trend(util_df)
    plot_linear_fit(model, yearly)

    # Example prediction for next year (for your “prediction” slide)
    future_year = yearly["year"].max() + 1
    future_pred = model.predict(np.array([[future_year]]))[0]
    print(
        f"\nPredicted total daily ridership for year {future_year}: "
        f"{future_pred:,.0f}"
    )


if __name__ == "__main__":
    main()


KeyError: "['month'] not in index"