In [55]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from scipy.sparse import hstack

In [43]:
# Load the cleaned dataset; the column at position 1 is forced to string dtype
# so pandas does not infer a numeric type for it.
data = pd.read_csv(
    'data/final_cleaned.csv',
    dtype={1: str},
)

In [57]:
# Per-state feature engineering + PCA.
#
# For every state in `states` this cell:
#   1. compresses one-hot encoded `unit_id` with TruncatedSVD,
#   2. one-hot encodes `mode` (first level dropped),
#   3. standard-scales every remaining (non-excluded) column,
#   4. fits a PCA on the combined matrix and stores the outputs in `results[state]`,
#   5. writes the engineered features plus the excluded columns to data/{state}_trunc.csv.
n_components = 10        # principal components kept by the PCA
n_unit_components = 20   # SVD dimensions kept for the one-hot encoded unit_id
states = ['CHI', 'NYC']
results = {}

# Identifier / target columns: passed through to the CSV untouched, never scaled.
# Loop-invariant, so defined once.
exclude_cols = ['date', 'state', 'daily_ridership', 'unit_id', 'mode']

for state in states:
    # Check each state individually. Rows with any missing value are dropped and
    # the index is reset so every derived frame below shares the same RangeIndex.
    df_state = data[data['state'] == state].dropna().reset_index(drop=True)

    # Define columns
    excluded_df = df_state[exclude_cols]
    feature_cols = [col for col in df_state.columns if col not in exclude_cols]

    # Truncated SVD on unit_id
    ohe_unit = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    unit_sparse = ohe_unit.fit_transform(df_state[['unit_id']])
    svd = TruncatedSVD(n_components=n_unit_components, random_state=42)
    unit_svd = svd.fit_transform(unit_sparse)
    # Label the columns from the fitted output width so names can never drift
    # out of sync with the number of SVD components.
    unit_svd_df = pd.DataFrame(
        unit_svd,
        index=df_state.index,
        columns=[f'unit_svd_{i+1}' for i in range(unit_svd.shape[1])],
    )

    # One-hot encode mode (drop_first=True drops the first level)
    mode_dummies = pd.get_dummies(df_state['mode'], prefix='mode', drop_first=True)

    # Scale every non-excluded column (all of feature_cols, not only weather/AQI)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state[feature_cols])
    scaled_df = pd.DataFrame(X_scaled, index=df_state.index, columns=feature_cols)

    # Combine unit SVD, one-hot, and scaled features
    X_final = pd.concat([unit_svd_df, mode_dummies, scaled_df], axis=1)

    # Run PCA
    pca = PCA(n_components=n_components, random_state=27)
    X_pca = pca.fit_transform(X_final)

    pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]

    explained_var = pd.DataFrame({
        'Component': pc_labels,
        'Explained Variance Ratio': pca.explained_variance_ratio_
    })

    # Rows are input features, columns are principal components.
    loadings = pd.DataFrame(
        pca.components_.T,
        index=X_final.columns,
        columns=pc_labels
    )

    # Store results
    results[state] = {
        'X_pca': X_pca,
        'explained_var': explained_var,
        'loadings': loadings,
        'features': X_final,
    }

    # Persist the engineered features together with the excluded columns.
    X_final_comp = pd.concat([excluded_df, unit_svd_df, mode_dummies, scaled_df], axis=1)
    X_final_comp.to_csv(f"data/{state}_trunc.csv", index=False)



In [11]:
# Explained-variance table for the NYC fit, limited to the first ten rows.
nyc_results = results['NYC']
nyc_results['explained_var'].head(10)

Unnamed: 0,Component,Explained Variance Ratio
0,PC1,0.330406
1,PC2,0.162519
2,PC3,0.072507
3,PC4,0.061531
4,PC5,0.037898
5,PC6,0.035362
6,PC7,0.032776
7,PC8,0.030804
8,PC9,0.028617
9,PC10,0.021893


In [49]:
# Column names of the feature matrix stored for the CHI run.
chi_features = results['CHI']['features']
chi_features.columns

Index(['unit_svd_1', 'unit_svd_2', 'unit_svd_3', 'unit_svd_4', 'unit_svd_5',
       'unit_svd_6', 'unit_svd_7', 'unit_svd_8', 'unit_svd_9', 'unit_svd_10',
       'unit_svd_11', 'unit_svd_12', 'unit_svd_13', 'unit_svd_14',
       'unit_svd_15', 'unit_svd_16', 'unit_svd_17', 'unit_svd_18',
       'unit_svd_19', 'unit_svd_20', 'mode_train', 'rain_sum', 'rain_max',
       'snowfall_sum', 'snowfall_max', 'relative_humidity_2m_min',
       'relative_humidity_2m_max', 'relative_humidity_2m_mean',
       'apparent_temperature_min', 'apparent_temperature_max',
       'wind_speed_10m_min', 'wind_speed_10m_max', 'wind_speed_10m_mean',
       'wind_direction_10m_min', 'wind_direction_10m_max',
       'wind_direction_10m_mean', 'apparent_temperature_min_lag',
       'apparent_temperature_max_lag', 'us_aqi_pm2_5_min', 'us_aqi_pm2_5_max',
       'us_aqi_pm2_5_mean', 'us_aqi_min', 'us_aqi_max', 'us_aqi_mean',
       'us_aqi_pm10_min', 'us_aqi_pm10_max', 'us_aqi_pm10_mean',
       'us_aqi_nitrogen_diox

In [15]:
# For each state, print the mode dummy columns present in the loadings table,
# then the 20 largest absolute loadings for each of the first four components.
pcs = ['PC1', 'PC2', 'PC3', 'PC4']
states = ['CHI', 'NYC']

for state in states:
    state_loadings = results[state]['loadings']
    mode_features = [name for name in state_loadings.index if 'mode_' in name]
    print(mode_features)

    for pc in pcs:
        print(state, pc)
        loadings = state_loadings[pc]
        # Rank by magnitude; the printed values are absolute, so sign is not shown.
        by_magnitude = loadings.abs().sort_values(ascending=False)
        top_features = by_magnitude.head(20)
        print(top_features)

['mode_train']
CHI PC1
us_aqi_mean                 0.231976
us_aqi_pm2_5_mean           0.225158
us_aqi_pm10_mean            0.224002
us_aqi_min                  0.221788
us_aqi_pm2_5_min            0.220645
us_aqi_pm10_min             0.219973
us_aqi_pm2_5_max            0.217094
us_aqi_pm10_max             0.217057
us_aqi_max                  0.215148
us_aqi_max_bin              0.203800
us_aqi_mean_bin             0.200004
us_aqi_mean_lag             0.199215
us_aqi_min_lag              0.194802
us_aqi_min_bin              0.190103
us_aqi_max_lag              0.188579
us_aqi_max_bin_lag          0.177977
us_aqi_mean_bin_lag         0.173289
us_aqi_min_bin_lag          0.171400
us_aqi_ozone_max            0.155299
apparent_temperature_max    0.145934
Name: PC1, dtype: float64
CHI PC2
us_aqi_nitrogen_dioxide_mean    0.300705
us_aqi_ozone_min                0.294836
us_aqi_nitrogen_dioxide_min     0.284489
us_aqi_ozone_mean               0.268657
us_aqi_sulphur_dioxide_mean     0.24507

In [67]:
# Columns retained in the trimmed per-state files (the state-specific mode
# dummy is appended separately). Order matters: it is the output column order.
cols_to_keep_base = (
    # unit_svd_1 ... unit_svd_20
    [f'unit_svd_{i}' for i in range(1, 21)]
    # us_aqi_* mean columns
    + [
        'us_aqi_mean',
        'us_aqi_pm2_5_mean',
        'us_aqi_pm10_mean',
        'us_aqi_nitrogen_dioxide_mean',
        'us_aqi_ozone_mean',
        'us_aqi_sulphur_dioxide_mean',
        'us_aqi_carbon_monoxide_mean',
    ]
    # temperature, humidity, precipitation and wind columns
    + [
        'apparent_temperature_max',
        'apparent_temperature_min',
        'apparent_temperature_max_lag',
        'apparent_temperature_min_lag',
        'relative_humidity_2m_mean',
        'rain_sum',
        'rain_max',
        'snowfall_sum',
        'snowfall_max',
        'wind_speed_10m_mean',
        'wind_speed_10m_max',
        'wind_direction_10m_mean',
    ]
    # is_* indicator columns
    + ['is_weekend', 'is_holiday', 'is_holiday_adjacent']
    # columns excluded from scaling/PCA but carried through
    + ['date', 'state', 'daily_ridership', 'unit_id', 'mode']
)


In [69]:
# Trim each per-state feature file down to the selected columns plus that
# state's mode dummy column, overwriting the file in place.
files = {
    "data/CHI_trunc.csv": "mode_train",
    "data/NYC_trunc.csv": "mode_subway"
}

for filepath, mode_col in files.items():
    # Read unit_id as text: without an explicit dtype pandas re-infers it, and
    # numeric-looking identifiers would be rewritten (e.g. leading zeros lost)
    # when the frame is saved back to the same file.
    df = pd.read_csv(filepath, low_memory=False, dtype={'unit_id': str})

    # Ensure the correct mode column exists
    if mode_col not in df.columns:
        raise ValueError(f"Expected column '{mode_col}' not found in {filepath}")

    cols_to_keep = cols_to_keep_base + [mode_col]

    # Filter to desired columns (raises KeyError if any base column is missing)
    df_trimmed = df[cols_to_keep]

    # Save back to the same file
    df_trimmed.to_csv(filepath, index=False)