In [50]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, PCA
from scipy.sparse import hstack

In [7]:
# Load the combined dataset from disk.
# The integer key in `dtype` forces one column to be parsed as text —
# presumably the column at position 1 (an ID-like field); TODO confirm.
data = pd.read_csv(
    filepath_or_buffer='data/final.csv',
    dtype={1: str},
)

In [62]:
states = ['CHI', 'NYC']
results = {}

# Upper bound on the number of TruncatedSVD components kept for unit_id.
# The actual number used per state is capped at the one-hot width so that a
# state with fewer distinct units does not make TruncatedSVD raise.
max_unit_svd_components = 20

# Columns that are not standardised as numeric features; unit_id and mode
# are encoded separately below.
exclude_cols = ['date', 'state', 'daily_ridership', 'unit_id', 'mode']

for state in states:
    # Work on one state at a time. Note that dropna() removes any row with a
    # missing value in ANY column, including the excluded ones.
    df_state = data[data['state'] == state].copy().dropna().reset_index(drop=True)
    if df_state.empty:
        # Fail early with a readable message instead of an opaque error from
        # the encoder further down.
        raise ValueError(f"No complete rows found for state {state!r}")

    # Everything that is not explicitly excluded is treated as a feature
    feature_cols = [col for col in df_state.columns if col not in exclude_cols]

    # Truncated SVD on one-hot encoded unit_id
    ohe_unit = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    unit_sparse = ohe_unit.fit_transform(df_state[['unit_id']])
    n_unit_components = min(max_unit_svd_components, unit_sparse.shape[1])
    svd = TruncatedSVD(n_components=n_unit_components, random_state=42)
    unit_svd = svd.fit_transform(unit_sparse)
    unit_svd_df = pd.DataFrame(
        unit_svd,
        index=df_state.index,
        # Name columns from the actual output width, not a repeated literal
        columns=[f'unit_svd_{i+1}' for i in range(unit_svd.shape[1])],
    )

    # One-hot encode mode (first level dropped as the reference category).
    # The result already carries df_state's freshly reset index.
    mode_dummies = pd.get_dummies(df_state['mode'], prefix='mode', drop_first=True)

    # Standardise the remaining feature columns
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state[feature_cols])
    scaled_df = pd.DataFrame(X_scaled, index=df_state.index, columns=feature_cols)

    # Combine unit SVD, one-hot, and scaled features (all share df_state.index)
    # NOTE(review): the SVD components and mode dummies enter PCA unscaled
    # while the other features are standardised — confirm this is intended.
    X_final = pd.concat([unit_svd_df, mode_dummies, scaled_df], axis=1)

    # Run PCA keeping every component
    pca = PCA()
    X_pca = pca.fit_transform(X_final)

    pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]

    explained_var = pd.DataFrame({
        'Component': pc_labels,
        'Explained Variance Ratio': pca.explained_variance_ratio_
    })

    # Rows are input features, columns are principal components
    loadings = pd.DataFrame(
        pca.components_.T,
        index=X_final.columns,
        columns=pc_labels
    )

    # Store results
    results[state] = {
        'X_pca': X_pca,
        'explained_var': explained_var,
        'loadings': loadings,
        'features': X_final
    }

unit_svd_1                     0
unit_svd_2                     0
unit_svd_3                     0
unit_svd_4                     0
unit_svd_5                     0
                              ..
us_aqi_sulphur_dioxide_mean    0
us_aqi_min_lag                 0
us_aqi_max_lag                 0
us_aqi_mean_lag                0
is_weekend                     0
Length: 63, dtype: int64
unit_svd_1                     0
unit_svd_2                     0
unit_svd_3                     0
unit_svd_4                     0
unit_svd_5                     0
                              ..
us_aqi_sulphur_dioxide_mean    0
us_aqi_min_lag                 0
us_aqi_max_lag                 0
us_aqi_mean_lag                0
is_weekend                     0
Length: 63, dtype: int64


In [72]:
# Preview the first ten rows of the explained-variance table stored for 'CHI'
results['CHI']['explained_var'].iloc[:10]

Unnamed: 0,Component,Explained Variance Ratio
0,PC1,0.298562
1,PC2,0.163064
2,PC3,0.079444
3,PC4,0.072997
4,PC5,0.041838
5,PC6,0.040887
6,PC7,0.035113
7,PC8,0.03441
8,PC9,0.028542
9,PC10,0.023702


In [84]:
pcs = ['PC1', 'PC2', 'PC3']
states = ['CHI', 'NYC']

# For every (state, component) pair, print the 20 features whose loadings
# have the largest absolute value. The sign of each loading is discarded
# by .abs(), so only magnitudes are shown.
for state, pc in ((s, p) for s in states for p in pcs):
    print(f"{state} {pc}")
    loadings = results[state]['loadings'][pc]
    top_features = loadings.abs().sort_values(ascending=False).iloc[:20]
    print(top_features)

CHI PC1
us_aqi_pm2_5_mean               0.260497
us_aqi_pm10_mean                0.258135
us_aqi_pm2_5_min                0.254400
us_aqi_pm10_min                 0.252696
us_aqi_pm2_5_max                0.249275
us_aqi_pm10_max                 0.248251
us_aqi_mean                     0.231401
us_aqi_max                      0.229205
us_aqi_min                      0.218041
us_aqi_max_lag                  0.187678
us_aqi_min_lag                  0.185465
us_aqi_mean_lag                 0.184984
us_aqi_carbon_monoxide_mean     0.167343
apparent_temperature_max        0.164678
us_aqi_ozone_max                0.164581
us_aqi_carbon_monoxide_max      0.157733
apparent_temperature_min        0.157426
apparent_temperature_max_lag    0.150351
wind_speed_10m_min              0.142728
us_aqi_carbon_monoxide_min      0.138680
Name: PC1, dtype: float64
CHI PC2
us_aqi_ozone_min                0.306225
us_aqi_ozone_mean               0.293748
us_aqi_nitrogen_dioxide_mean    0.288402
us_aqi_nitrogen