In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, PCA
from scipy.sparse import hstack

In [20]:
# Read the combined dataset from disk. The column at position 1 is forced to
# `str` (presumably to avoid mixed-type inference on that column -- TODO confirm
# which column this is and consider keying the dtype by name instead).
DATA_PATH = 'data/final.csv'
COLUMN_DTYPES = {1: str}
data = pd.read_csv(DATA_PATH, dtype=COLUMN_DTYPES)

In [24]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(947751, 55)

In [26]:
# Per-state PCA of the feature matrix.
#
# For every state in `states` this cell:
#   1. keeps that state's rows and drops any row containing a NaN,
#   2. one-hot encodes `unit_id` and compresses it with truncated SVD,
#   3. dummy-encodes `mode` (first level dropped),
#   4. standardises every column not listed in `exclude_cols`,
#   5. fits a full PCA on the concatenation of (2), (3) and (4),
#   6. stores the projected data, explained-variance table, loadings and the
#      PCA input matrix in `results[state]`.
states = ['CHI', 'NYC']
results = {}

# Upper bound on the number of SVD components kept for the unit_id encoding.
# The actual number is capped by the number of distinct unit_ids in a state,
# because TruncatedSVD rejects n_components > n_features.
N_UNIT_SVD_COMPONENTS = 20

# Columns that are never passed to the scaler: `unit_id` and `mode` are encoded
# separately below; `date`, `state` and `daily_ridership` are left out of the
# PCA input entirely. Loop-invariant, so defined once.
exclude_cols = ['date', 'state', 'daily_ridership', 'unit_id', 'mode']

for state in states:
    # Check each state individually. Boolean indexing and dropna() each return
    # a new frame, so no explicit .copy() is needed. Note that dropna() looks at
    # *all* columns, including the excluded ones.
    df_state = data[data['state'] == state].dropna().reset_index(drop=True)
    if df_state.empty:
        # Fail early with a readable message instead of an opaque error from
        # inside scikit-learn.
        raise ValueError(f"No complete rows available for state {state!r}; cannot run PCA.")

    # Everything that is not excluded is treated as a numeric feature to scale.
    feature_cols = [col for col in df_state.columns if col not in exclude_cols]

    # Truncated SVD on unit_id
    ohe_unit = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    unit_sparse = ohe_unit.fit_transform(df_state[['unit_id']])
    n_unit_components = min(N_UNIT_SVD_COMPONENTS, unit_sparse.shape[1])
    svd = TruncatedSVD(n_components=n_unit_components, random_state=42)
    unit_svd = svd.fit_transform(unit_sparse)
    # Label columns from the actual output width so the names can never get out
    # of sync with the number of components.
    unit_svd_df = pd.DataFrame(
        unit_svd,
        index=df_state.index,
        columns=[f'unit_svd_{i+1}' for i in range(unit_svd.shape[1])]
    )

    # One-hot encode mode. get_dummies keeps df_state's (already reset) index,
    # so the result aligns with the other pieces without a further reset.
    mode_dummies = pd.get_dummies(df_state['mode'], prefix='mode', drop_first=True)

    # Scale the remaining feature columns to zero mean / unit variance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state[feature_cols])
    scaled_df = pd.DataFrame(X_scaled, index=df_state.index, columns=feature_cols)

    # Combine unit SVD, one-hot, and scaled features.
    # NOTE(review): only `scaled_df` is standardised; the SVD components and the
    # mode dummies enter PCA on their original scale -- confirm this is intended.
    X_final = pd.concat([unit_svd_df, mode_dummies, scaled_df], axis=1)

    # Run PCA with all components retained.
    pca = PCA()
    X_pca = pca.fit_transform(X_final)

    component_names = [f'PC{i+1}' for i in range(pca.n_components_)]

    explained_var = pd.DataFrame({
        'Component': component_names,
        'Explained Variance Ratio': pca.explained_variance_ratio_
    })

    # Rows are input features, columns are principal components.
    loadings = pd.DataFrame(
        pca.components_.T,
        index=X_final.columns,
        columns=component_names
    )

    # Store results
    results[state] = {
        'X_pca': X_pca,
        'explained_var': explained_var,
        'loadings': loadings,
        'features': X_final
    }

In [36]:
# First ten principal components for CHI with the share of variance each explains.
results['CHI']['explained_var'].head(10)

Unnamed: 0,Component,Explained Variance Ratio
0,PC1,0.335401
1,PC2,0.142762
2,PC3,0.068325
3,PC4,0.065639
4,PC5,0.037092
5,PC6,0.034656
6,PC7,0.031899
7,PC8,0.030005
8,PC9,0.028676
9,PC10,0.026019


In [34]:
# First ten principal components for NYC with the share of variance each explains.
results['NYC']['explained_var'].head(10)

Unnamed: 0,Component,Explained Variance Ratio
0,PC1,0.33078
1,PC2,0.166766
2,PC3,0.071536
3,PC4,0.060635
4,PC5,0.037826
5,PC6,0.035283
6,PC7,0.032673
7,PC8,0.030552
8,PC9,0.028386
9,PC10,0.021351


In [30]:
# For each state, print the features with the largest loadings on the leading
# principal components.
pcs = ['PC1', 'PC2', 'PC3']
states = ['CHI', 'NYC']

for state in states:
    state_loadings = results[state]['loadings']

    # Feature names containing 'mode_' (the dummy columns built from `mode`).
    mode_columns = []
    for feat in state_loadings.index:
        if 'mode_' in feat:
            mode_columns.append(feat)
    print(mode_columns)

    for pc in pcs:
        print(state, pc)
        loadings = state_loadings[pc]
        # Ranked by magnitude only: the sign of each loading is discarded here.
        by_magnitude = loadings.abs().sort_values(ascending=False)
        top_features = by_magnitude.head(20)
        print(top_features)

['mode_train']
CHI PC1
us_aqi_mean                 0.231976
us_aqi_pm2_5_mean           0.225158
us_aqi_pm10_mean            0.224002
us_aqi_min                  0.221788
us_aqi_pm2_5_min            0.220645
us_aqi_pm10_min             0.219973
us_aqi_pm2_5_max            0.217094
us_aqi_pm10_max             0.217057
us_aqi_max                  0.215148
us_aqi_max_bin              0.203800
us_aqi_mean_bin             0.200004
us_aqi_mean_lag             0.199215
us_aqi_min_lag              0.194802
us_aqi_min_bin              0.190103
us_aqi_max_lag              0.188579
us_aqi_max_bin_lag          0.177977
us_aqi_mean_bin_lag         0.173289
us_aqi_min_bin_lag          0.171400
us_aqi_ozone_max            0.155299
apparent_temperature_max    0.145934
Name: PC1, dtype: float64
CHI PC2
us_aqi_nitrogen_dioxide_mean    0.300705
us_aqi_ozone_min                0.294836
us_aqi_nitrogen_dioxide_min     0.284489
us_aqi_ozone_mean               0.268657
us_aqi_sulphur_dioxide_mean     0.24507