<a href="https://colab.research.google.com/github/crogers-cardiffmet/Data-Analysis/blob/main/AirData_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem; the project folder used by
# the following cells lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Work from the project folder on Drive: later cells refer to scripts and
# artefacts (data_loader.py, rf_model.pkl, X_test.csv, ...) by relative path.
%cd '/content/drive/MyDrive/Air_Data/'

/content/drive/MyDrive/Air_Data


In [3]:
!pip install streamlit



In [12]:
# Execute data_loader.py as a script from the working directory. The
# Streamlit pages written below import `load_data` from this same module.
!python data_loader.py



In [19]:
%%writefile data_overview.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_data

def app():
    """Render the "Data Overview" page of the Streamlit app.

    Loads the dataset through ``load_data`` and shows, in order: a sample of
    the frame with its shape, an optional date-range filter preview,
    missing-value counts, column dtypes, station and wind-direction counts,
    the correlation of every numeric column with PM2.5, and the PM2.5
    histogram.

    Only the date-filter preview uses the filtered rows; every other section
    describes the full dataset.
    """
    st.title("Data Overview")
    df = load_data()

    # Integer-typed columns that contain nulls are converted to float64.
    # astype() builds a new frame, so the object handed out by load_data()
    # (which may be shared between reruns -- not visible from here) is never
    # mutated.
    ext_ints = [
        c for c in df.columns
        if pd.api.types.is_integer_dtype(df[c].dtype) and df[c].isnull().any()
    ]
    if ext_ints:
        df = df.astype({col: "float64" for col in ext_ints})

    # Basic info (rendered once; the original showed this section twice)
    st.subheader("Sample & Shape")
    st.dataframe(df.head())
    st.write(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

    # Date filter
    if 'date' in df.columns:
        st.subheader("Filter by Date")
        picked = st.date_input(
            "Date range",
            value=(df['date'].min().date(), df['date'].max().date())
        )
        # While the user is still choosing, a range date_input returns fewer
        # than two dates; unpacking it blindly would raise ValueError.
        if isinstance(picked, (tuple, list)) and len(picked) == 2:
            min_date, max_date = picked
            mask = (df['date'].dt.date >= min_date) & (df['date'].dt.date <= max_date)
            df_filtered = df.loc[mask]
            st.write(f"Showing {df_filtered.shape[0]} rows between {min_date} and {max_date}")
            st.dataframe(df_filtered.head())
        else:
            st.info("Pick both a start and an end date to apply the filter.")

    # Show missing values (only columns that actually have some)
    st.subheader("Missing Values")
    missing = pd.DataFrame({
        'count': df.isna().sum(),
        '%': 100*df.isna().mean()
    }).query("count>0")
    st.dataframe(missing.style.background_gradient("Reds"))
    st.subheader("Data types")
    st.write(df.dtypes)

    # Station and wind direction counts
    st.subheader("Station Counts")
    st.bar_chart(df['station'].value_counts())
    st.subheader("Wind Direction Counts")
    st.bar_chart(df['wd'].value_counts())

    # Correlation of each numeric column with PM2.5, strongest first
    st.subheader("Correlation Matrix")
    numeric_df = df.select_dtypes(include=[np.number])
    corr_with_pm25 = numeric_df.corr()['PM2.5'].sort_values(ascending=False).drop('PM2.5')
    st.table(corr_with_pm25.to_frame(name="Correlation"))

    # PM2.5 distribution
    st.subheader("PM2.5 Distribution")
    fig, ax = plt.subplots(figsize=(6,3))
    sns.histplot(df['PM2.5'], bins=30, kde=True, ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script reruns on
    # each interaction, so release the figure once it has been rendered.
    plt.close(fig)


Overwriting data_overview.py


In [20]:
%%writefile eda.py

import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from data_loader import load_data

# The 16 compass points, listed clockwise starting at north, kept in one
# place so wind-direction categories can be presented in a consistent order.
wind_dir_order = [
    'N', 'NNE', 'NE', 'ENE',
    'E', 'ESE', 'SE', 'SSE',
    'S', 'SSW', 'SW', 'WSW',
    'W', 'WNW', 'NW', 'NNW',
]

def app():
    """Render the "Exploratory Data Analysis" page of the Streamlit app.

    The user picks a date range; every chart below is drawn from the rows of
    ``load_data()`` that fall inside it: a resampled time series,
    histograms, boxplots, a correlation heatmap, an optional pairplot, a
    per-season boxplot and a custom scatter plot of the six pollutant columns.

    Rendering stops early with a message when the date range is incomplete or
    selects no rows.
    """
    st.title("Exploratory Data Analysis")

    df = load_data()
    numeric = ['PM2.5','PM10','SO2','NO2','CO','O3']

    # Time range & granularity
    st.subheader("Time Range & Granularity")
    picked = st.date_input(
        "Select a date range",
        value=(df['date'].min().date(), df['date'].max().date())
    )
    # While the user is still choosing, a range date_input returns fewer than
    # two dates; unpacking it blindly would raise ValueError.
    if not (isinstance(picked, (tuple, list)) and len(picked) == 2):
        st.info("Pick both a start and an end date to continue.")
        return
    min_date, max_date = picked
    mask = (df['date'].dt.date >= min_date) & (df['date'].dt.date <= max_date)
    df_slice = df.loc[mask]
    if df_slice.empty:
        st.warning("No rows fall inside the selected date range.")
        return

    # NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; the value
    # is kept as-is so the page still runs on older pandas releases.
    gran = st.radio("Resample frequency", ['D','W','M'], index=2, format_func=lambda x: {'D':'Daily','W':'Weekly','M':'Monthly'}[x])
    ts = df_slice.set_index('date')[numeric].resample(gran).mean()
    st.line_chart(ts)

    # Distributions
    st.subheader("Distributions")
    fig, axes = plt.subplots(2, 3, figsize=(15,6))
    for ax, col in zip(axes.flatten(), numeric):
        sns.histplot(df_slice[col].dropna(), kde=True, ax=ax)
        ax.set_title(col)
    st.pyplot(fig)
    # pyplot keeps figures alive until closed and the script reruns on every
    # interaction, so each figure is released right after it is rendered.
    plt.close(fig)

    # Boxplots
    st.subheader("Boxplots")
    fig, axes = plt.subplots(2, 3, figsize=(15,6))
    for ax, col in zip(axes.flatten(), numeric):
        sns.boxplot(x=df_slice[col].dropna(), ax=ax, color='orange')
        ax.set_title(col)
    st.pyplot(fig)
    plt.close(fig)

    # Correlation heatmap
    st.subheader("Correlation Heatmap")
    corr = df_slice[numeric].corr()
    fig, ax = plt.subplots(figsize=(6,5))
    sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # Pairwise scatter matrix
    st.subheader("Pairwise Scatter Matrix")
    cols = st.multiselect("Pick features for pairplot", numeric, default=numeric[:4])
    if len(cols) >= 2:
        pair_data = df_slice[cols].dropna()
        if pair_data.empty:
            st.info("No complete rows are available for the selected features.")
        else:
            # Size the sample from the rows that survive dropna(): asking for
            # more rows than exist makes DataFrame.sample raise ValueError.
            sample = pair_data.sample(min(len(pair_data), 2000), random_state=42)
            grid = sns.pairplot(sample, corner=True, plot_kws={'alpha':0.3})
            st.pyplot(grid)
            plt.close(grid.figure)
    else:
        st.info("Select at least two features to see a pairplot.")

    # Seasonal distributions
    st.subheader("Seasonal Distributions")
    seasons = {12:'Winter',1:'Winter',2:'Winter',
               3:'Spring',4:'Spring',5:'Spring',
               6:'Summer',7:'Summer',8:'Summer',
               9:'Fall',10:'Fall',11:'Fall'}
    # assign() returns a new frame, so the slice (and the loader's frame behind
    # it) is not written to and no SettingWithCopyWarning is triggered.
    seasonal = df_slice.assign(season=df_slice['month'].map(seasons))
    sel_pollutant = st.selectbox("Pick a pollutant for season boxplot", numeric)
    fig, ax = plt.subplots(figsize=(8,4))
    sns.boxplot(x='season', y=sel_pollutant, data=seasonal,
                order=['Winter','Spring','Summer','Fall'], ax=ax)
    ax.set_title(f"{sel_pollutant} by Season")
    st.pyplot(fig)
    plt.close(fig)

    # Custom scatter plot
    st.subheader("Custom Scatter Plot")
    xcol = st.selectbox("X axis", numeric, index=0)
    ycol = st.selectbox("Y axis", numeric, index=1)
    hue = st.selectbox("Color by", [None] + numeric)
    fig, ax = plt.subplots(figsize=(6,4))
    if hue:
        sc = ax.scatter(df_slice[xcol], df_slice[ycol], c=df_slice[hue], cmap='viridis', alpha=0.5)
        plt.colorbar(sc, ax=ax, label=hue)
    else:
        ax.scatter(df_slice[xcol], df_slice[ycol], alpha=0.5)
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    ax.set_title(f"{ycol} vs {xcol}")
    st.pyplot(fig)
    plt.close(fig)

Overwriting eda.py


In [21]:
# Run the training script from the working directory; its log below reports
# a 3-fold cross-validated fit. The rf_model.pkl / scaler.pkl / X_test.csv /
# y_test.csv files read by modeling_prediction.py are presumably written by
# this script -- TODO confirm against train_and_save.py.
!python train_and_save.py

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time= 3.4min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time= 3.4min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time= 2.1min


In [22]:
%%writefile modeling_prediction.py

import streamlit as st
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score

from data_loader import load_data

@st.cache_resource
def load_model_and_scaler():
    """Load the persisted model and scaler from the working directory.

    Returns:
        tuple: ``(rf, scaler)`` -- the objects deserialised by joblib from
        ``rf_model.pkl`` and ``scaler.pkl``, in that order.
    """
    # joblib.load unpickles its input, which can execute arbitrary code: only
    # point these paths at files produced by a trusted source.
    model_path, scaler_path = "rf_model.pkl", "scaler.pkl"
    return joblib.load(model_path), joblib.load(scaler_path)

@st.cache_data
def load_test_set():
    """Read the held-out test set from CSV files in the working directory.

    Returns:
        tuple: ``(X_test, y_test)`` where ``X_test`` is the frame read from
        ``X_test.csv`` and ``y_test`` is the content of ``y_test.csv`` with
        its column axis squeezed (a Series when the file has one column).
    """
    X_test = pd.read_csv("X_test.csv")
    # Squeeze the column axis only: a bare squeeze() would collapse a
    # one-row, one-column file to a scalar instead of a length-1 Series.
    y_test = pd.read_csv("y_test.csv").squeeze("columns")
    return X_test, y_test

def app():
    """Render the "Modeling & Prediction" page of the Streamlit app.

    Shows a sample of the dataset, scores the persisted model on the stored
    test set (MSE, RMSE, R²), plots feature importances, actual-vs-predicted
    values and the residual distribution, and lets the user enter feature
    values to obtain a single PM2.5 prediction.
    """
    st.title("Modeling & Prediction")

    df = load_data()

    # Integer-typed columns that contain nulls are converted to float64.
    # astype() builds a new frame, so the object handed out by load_data()
    # (which may be shared between reruns -- not visible from here) is never
    # mutated.
    ext_ints = [
        c for c in df.columns
        if pd.api.types.is_integer_dtype(df[c].dtype) and df[c].isnull().any()
    ]
    if ext_ints:
        df = df.astype({col: "float64" for col in ext_ints})

    st.subheader("Sample & Shape")
    st.dataframe(df.head())
    st.write(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    rf, sc  = load_model_and_scaler()
    X_test, y_test = load_test_set()

    X_test_s = sc.transform(X_test)
    y_pred   = rf.predict(X_test_s)

    # Metrics
    mse  = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    st.subheader("Model Performance")
    st.write(f"- **MSE:**  {mse:.2f}")
    st.write(f"- **RMSE:** {rmse:.2f}")
    st.write(f"- **R²:**   {r2:.3f}")

    # Feature importance
    # NOTE(review): this list is assumed to match the columns (and their
    # order) that the model and scaler were fitted on -- confirm against
    # train_and_save.py.
    features = ['PM10','SO2','NO2','CO','O3']
    imp = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
    st.subheader("Feature Importance")
    st.bar_chart(imp)

    # Actual vs predicted chart
    st.subheader("Actual vs Predicted PM2.5")
    comp = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    st.line_chart(comp.reset_index(drop=True))

    # Residuals distribution
    st.subheader("Residuals Distribution")
    resid = y_test - y_pred
    fig, ax = plt.subplots()
    sns.histplot(resid, kde=True, ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script reruns on
    # each interaction, so release the figure once it has been rendered.
    plt.close(fig)

    # User prediction: one numeric input per feature, bounded by the observed
    # min/max in the dataset and pre-filled with the median.
    st.subheader("Make Your Own Prediction")
    user = {}
    for f in features:
        user[f] = st.number_input(f, float(df[f].min()), float(df[f].max()), float(df[f].median()))
    if st.button("Predict"):
        # Fix the column order explicitly instead of relying on dict
        # insertion order, so the row always lines up with `features`.
        X_new = sc.transform(pd.DataFrame([user], columns=features))
        out   = rf.predict(X_new)[0]
        st.success(f"Predicted PM2.5: {out:.2f} µg/m³")


Overwriting modeling_prediction.py
