## Imports
Load core scientific Python libraries and scikit-learn utilities.

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
import ipywidgets as widgets
from IPython.display import display, clear_output

## Load data
Read the input CSV and inspect available columns.

In [None]:
# Path to the model input table, relative to the notebook's working directory.
file = "Model Input/ML_DL_input.csv"
df = pd.read_csv(file)

# List the loaded column names so the feature groups below can be checked.
print(list(df.columns))

## Parse time fields
Convert Hour and Date columns to datetime and extract hour/month.

## Cyclical time encoding
Encode hour-of-day and month-of-year using sine/cosine transforms.

In [None]:
# Parse the clock-time ("HH:MM") and calendar-date ("DD/MM/YYYY") strings.
df['Hour'] = pd.to_datetime(df['Hour'], format='%H:%M')
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Integer hour of day and month of year extracted from the parsed values.
df['hr'] = df['Hour'].dt.hour
df['mnth'] = df['Date'].dt.month
print(df['hr'].unique(), df['mnth'].unique())

# Angles on a 24-hour / 12-month circle; months are shifted to start at 0.
hour_angle = df['hr'] * (2. * np.pi / 24)
month_angle = (df['mnth'] - 1) * (2. * np.pi / 12)

# The sine/cosine pair keeps the ends of each cycle adjacent
# (e.g. the angle for hour 23 sits next to the angle for hour 0).
df['hr_sin'] = np.sin(hour_angle)
df['hr_cos'] = np.cos(hour_angle)
df['mnth_sin'] = np.sin(month_angle)
df['mnth_cos'] = np.cos(month_angle)

## Variable groups
Define feature groups for photo channels, meteorology, engineered terms, and time features.


In [None]:
# Mean R/G/B channel columns: the "_S_" columns first, then the "_G_" columns.
Va_Phtoto = [
    f"{channel}_{region}_M"
    for region in ("S", "G")
    for channel in ("R", "G", "B")
]

# Target pollutant column.
Va_AQI = ['PM25']

# Meteorological columns without a pressure-level suffix.
Va_Met_Ground = [
    'T2M',
    'BLH',
    'U_10',
    'V_10',
    'TP',
    'SP',
]

# T/U/V columns at the 500 and 850 levels, grouped by level.
Va_Met_Levels = [
    f"{field}_{level}"
    for level in (500, 850)
    for field in ("T", "U", "V")
]

# Engineered energy terms and cyclical time encodings.
Va_TE = ['GE_500', 'KE_850']
Va_Time = ['hr_sin', 'hr_cos', 'mnth_sin', 'mnth_cos']

## Density + energy engineering
Estimate air density at 500 and 850 hPa from temperature using the ideal gas law, derive the KE/GE terms, and append them to df.


In [None]:
# Upper-air fields used below, pulled out of the frame once.
T_500, T_850 = df['T_500'], df['T_850']
U_500, V_500 = df['U_500'], df['V_500']
U_850, V_850 = df['U_850'], df['V_850']
GP_500, GP_850 = df['GP_500'], df['GP_850']

# Density from the ideal gas law, rho = p * M / (R * T), with p = 50000 / 85000,
# M = 29 and R = 8314. Temperatures are shifted by 273.15 before use.
# NOTE(review): the shift implies T_500/T_850 are in degrees Celsius - confirm.
Ro_500 = 50000*29/(8314*(T_500+273.15))
Ro_850 = 85000*29/(8314*(T_850+273.15))

# Kinetic-energy term: 0.5 * rho * (u^2 + v^2) at each level.
wind_sq_500 = U_500**2+V_500**2
wind_sq_850 = U_850**2+V_850**2
KE_500 = 0.5*Ro_500*wind_sq_500
KE_850 = 0.5*Ro_850*wind_sq_850

# GE term: density multiplied by the GP_* column at each level.
GE_500 = Ro_500*GP_500
GE_850 = Ro_850*GP_850

# Append the engineered columns (same order as before: GE/KE at 500, then 850).
df['GE_500'] = GE_500
df['KE_500'] = KE_500
df['GE_850'] = GE_850
df['KE_850'] = KE_850

## Day/night split functions
Create helper functions to subset daytime and nighttime samples by comparing R_S_M against a threshold of 100.

In [None]:
def getdata_day(df, threshold=100):
    """Return the daytime rows of ``df``.

    A row counts as daytime when its ``R_S_M`` value is strictly greater than
    ``threshold``. Rows whose ``R_S_M`` is NaN are never selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``R_S_M`` column.
    threshold : float, default 100
        Cut-off applied to ``R_S_M``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows, all columns kept.
    """
    return df.loc[df['R_S_M'] > threshold].copy()


def getdata_night(df, threshold=100):
    """Return the nighttime rows of ``df``.

    A row counts as nighttime when its ``R_S_M`` value is less than or equal
    to ``threshold``. The boundary value is included here so that, together
    with ``getdata_day``, every row with a valid ``R_S_M`` lands in exactly one
    subset; previously rows equal to the threshold were dropped from both.
    Rows whose ``R_S_M`` is NaN are never selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``R_S_M`` column.
    threshold : float, default 100
        Cut-off applied to ``R_S_M``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows, all columns kept.
    """
    return df.loc[df['R_S_M'] <= threshold].copy()

In [None]:
# Red/blue ratio and summed R+G+B brightness for the "_S_" (sky) columns.
# A zero blue channel makes the ratio +/-inf. dropna() does not remove inf and
# scikit-learn estimators refuse it, so such values are turned into NaN; the
# affected rows can then be discarded by a later dropna().
R_B_Sky = (df['R_S_M']/df['B_S_M']).replace([np.inf, -np.inf], np.nan)
RGB_Sky = df['R_S_M']+df['G_S_M']+df['B_S_M']

# Same two features for the "_G_" (ground) columns.
R_B_Ground = (df['R_G_M']/df['B_G_M']).replace([np.inf, -np.inf], np.nan)
RGB_Ground = df['R_G_M']+df['G_G_M']+df['B_G_M']

df['R_B_Sky'] = R_B_Sky
df['RGB_Sky'] = RGB_Sky

df['R_B_Ground'] = R_B_Ground
df['RGB_Ground'] = RGB_Ground

# Convenience handles on the full (unsplit) data. They are not referenced in
# the cells visible here; kept because later cells may rely on them.
PM25 = np.log(df['PM25'])

GE_500 = df['GE_500']
BLH = df['BLH']

# Split into day and night subsets and report their sizes.
dfs1_day = getdata_day(df)
dfs1_night = getdata_night(df)
print(dfs1_day.shape, dfs1_night.shape)

## Interactive subset selector
Use ipywidgets to switch between day and night subsets.


In [None]:
# Day/night subsets keyed by the label shown in the dropdown.
dfs = {"day": dfs1_day, "night": dfs1_night}

# Dropdown offering the subset labels; "day" is selected initially.
subset_dd = widgets.Dropdown(
    options=list(dfs),
    value="day",
    description="Subset:",
)

# Output area that shows which subset is currently active.
out = widgets.Output()


def set_df(change=None):
    """Point the notebook-global ``df`` at a copy of the selected subset.

    Used both as the dropdown's observer callback (``change`` carries the
    widget event and is ignored) and as a plain call to apply the initial
    selection. The active label and the new ``df`` shape are written to ``out``.
    """
    global df
    selected = subset_dd.value
    df = dfs[selected].copy()
    with out:
        # Replace the previous status line instead of stacking messages.
        clear_output(wait=True)
        print(f"Selected: {selected}  |  df shape: {df.shape}")


# Re-run set_df whenever the dropdown value changes, then show the controls.
subset_dd.observe(set_df, names="value")
display(subset_dd, out)

# Apply the default selection immediately.
set_df()

In [None]:
# Target column; kept as a list, so y below is a single-column DataFrame.
y_cols = ['PM25']

# Predictors: engineered energy and colour features, followed by the cyclical
# time encodings and the ground-level meteorology columns.
engineered_cols = [
    'KE_850',
    'GE_500',
    'R_B_Sky',
    'R_B_Ground',
    'RGB_Sky',
    'RGB_Ground',
]
x_cols = engineered_cols + Va_Time + Va_Met_Ground

# Discard every row that has a missing value in any column.
df = df.dropna()

X = df[x_cols]
# The model is trained on the natural log of the target.
y = np.log(df[y_cols])
print(math.pow(np.e, 5))

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=91,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## Depth sweep utility
Train DecisionTreeRegressor with different max_depth values and plot train vs test scores.

In [None]:
def test_DecisionTreeRegressor_depth(
    *data,
    maxdepth,
    title="Daytime Decision Tree Regression",
    random_state=0,
):
    """Plot train and test scores of a decision tree for each depth up to ``maxdepth``.

    One ``DecisionTreeRegressor`` is fitted per depth in ``1..maxdepth``
    (inclusive) and its ``score`` on the training and test data is plotted
    against the depth. The figure is shown; nothing is returned.

    Parameters
    ----------
    *data
        Exactly four positional values: ``X_train, X_test, y_train, y_test``.
    maxdepth : int
        Largest ``max_depth`` to evaluate. The sweep now includes this value;
        it previously stopped one short of it.
    title : str, default "Daytime Decision Tree Regression"
        Plot title, so the figure can be labelled for the subset in use.
    random_state : int or None, default 0
        Seed passed to every tree so repeated runs give the same curves.
    """
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth + 1)
    training_scores = []
    testing_scores = []
    for depth in depths:
        regr = DecisionTreeRegressor(max_depth=depth, random_state=random_state)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))

    fig = plt.figure(dpi=300)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label="training score")
    ax.plot(depths, testing_scores, label="testing score")
    ax.set_xlabel("maxdepth")
    ax.set_ylabel("score")
    ax.set_title(title)
    ax.legend(framealpha=0.5)
    plt.show()


# Label the plot after the subset chosen in the dropdown ("Daytime" / "Nighttime")
# instead of always calling it daytime.
test_DecisionTreeRegressor_depth(
    X_train,
    X_test,
    y_train,
    y_test,
    maxdepth=12,
    title=f"{subset_dd.value.capitalize()}time Decision Tree Regression",
)

## Fit final tree + RMSE
Fit a DecisionTreeRegressor (max_depth=8), compute RMSE on the train and test sets in log(PM25) units, and plot observed vs. predicted values.


In [None]:
# Final model: depth-8 tree with a fixed seed for reproducibility.
regr = DecisionTreeRegressor(max_depth=8, random_state=0)
regr.fit(X_train, y_train)

y_train_pred = regr.predict(X_train)
y_test_pred = regr.predict(X_test)

# y_train / y_test are single-column DataFrames. Flatten them to 1-D arrays so
# the metrics and axis limits below work on plain numbers: on older pandas
# releases np.min/np.max of a DataFrame reduce per column and return a Series,
# which the built-in min()/max() cannot compare.
y_train_obs = np.ravel(y_train)
y_test_obs = np.ravel(y_test)

# RMSE is in the units of the (log-transformed) target.
rmse_train = math.sqrt(mean_squared_error(y_train_obs, y_train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test_obs, y_test_pred))
print(f"RMSE train: {rmse_train:.3f}, RMSE test: {rmse_test:.3f}")

plt.figure(figsize=(8, 6), dpi=150)
plt.scatter(y_test_obs, y_test_pred, s=40, alpha=0.5, label="Test")
plt.scatter(y_train_obs, y_train_pred, s=40, alpha=0.5, label="Train")

# Shared axis limits spanning the observed targets, plus the 1:1 reference line.
lims = [
    min(y_train_obs.min(), y_test_obs.min()),
    max(y_train_obs.max(), y_test_obs.max()),
]
plt.plot(lims, lims, linewidth=2, alpha=0.6)
plt.xlim(lims)
plt.ylim(lims)

plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.title("Decision Tree: Observed vs Predicted")
plt.legend()
plt.show()