In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from scipy.sparse import hstack

In [2]:
# Load the cleaned dataset used by every cell below.
# NOTE(review): the integer key in `dtype` presumably selects the column at
# position 1 and forces it to be parsed as a string — confirm which column
# that is and that positional keys are intended.
data = pd.read_csv(
    filepath_or_buffer='data/final_cleaned.csv',
    dtype={1: str},
)

In [3]:
# Sanity check: (n_rows, n_columns) of the frame that was just loaded.
data.shape

(947751, 55)

In [4]:
# Per-state PCA pipeline.
#
# For every state in `states` this cell:
#   1. keeps only complete rows for that state,
#   2. compresses the one-hot encoded `unit_id` with TruncatedSVD,
#   3. one-hot encodes `mode` (first level dropped),
#   4. standardizes every remaining column,
#   5. fits a PCA on the concatenation of (2), (3) and (4).
#
# Output: `results[state]` is a dict with keys
#   'X_pca'         -> ndarray of PCA scores (rows x n_components)
#   'explained_var' -> DataFrame with 'Component' / 'Explained Variance Ratio'
#   'loadings'      -> DataFrame of PCA loadings (features x components)
#   'features'      -> DataFrame that was fed to the PCA
n_components = 10        # principal components kept per state
n_unit_components = 20   # requested width of the unit_id SVD embedding
states = ['CHI', 'NYC']
results = {}

# Identifier / target columns and the two categoricals that get their own
# encoding below; every other column is standardized. This list does not
# depend on the state, so it is built once outside the loop.
exclude_cols = ['date', 'state', 'daily_ridership', 'unit_id', 'mode']

for state in states:
    # Boolean indexing and dropna() both return new frames, so no explicit
    # copy is needed. The index is renumbered 0..n-1 so that every frame
    # derived from df_state below aligns row-for-row in the final concat.
    df_state = data[data['state'] == state].dropna().reset_index(drop=True)
    if df_state.empty:
        # Fail early with a clear message instead of an opaque sklearn error.
        raise ValueError(f"No complete rows for state {state!r}; cannot run PCA.")

    feature_cols = [col for col in df_state.columns if col not in exclude_cols]

    # Truncated SVD on the sparse one-hot encoding of unit_id.
    ohe_unit = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    unit_sparse = ohe_unit.fit_transform(df_state[['unit_id']])
    # TruncatedSVD rejects n_components larger than the number of one-hot
    # columns, so clamp the requested width for states with few units.
    n_unit_svd = min(n_unit_components, unit_sparse.shape[1])
    svd = TruncatedSVD(n_components=n_unit_svd, random_state=42)
    unit_svd = svd.fit_transform(unit_sparse)
    unit_svd_df = pd.DataFrame(
        unit_svd,
        index=df_state.index,
        # Name columns from the actual output width, not the requested one.
        columns=[f'unit_svd_{i+1}' for i in range(unit_svd.shape[1])],
    )

    # One-hot encode mode; drop_first removes one level to avoid a redundant
    # column. The result already carries df_state's index.
    mode_dummies = pd.get_dummies(df_state['mode'], prefix='mode', drop_first=True)

    # Standardize all remaining columns (described in the original notebook
    # as the weather + AQI features — NOTE(review): confirm they are all numeric).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state[feature_cols])
    scaled_df = pd.DataFrame(X_scaled, index=df_state.index, columns=feature_cols)

    # Combine unit SVD, one-hot, and scaled features column-wise.
    X_final = pd.concat([unit_svd_df, mode_dummies, scaled_df], axis=1)

    # Run PCA
    pca = PCA(n_components=n_components, random_state=27)
    X_pca = pca.fit_transform(X_final)

    # Shared labels for the explained-variance table and the loadings table.
    pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]

    explained_var = pd.DataFrame({
        'Component': pc_labels,
        'Explained Variance Ratio': pca.explained_variance_ratio_
    })

    # components_ is (components x features); transpose so rows are features.
    loadings = pd.DataFrame(
        pca.components_.T,
        index=X_final.columns,
        columns=pc_labels
    )

    # Store results
    results[state] = {
        'X_pca': X_pca,
        'explained_var': explained_var,
        'loadings': loadings,
        'features': X_final
    }



In [None]:
# Explained-variance ratio per principal component for the 'CHI' subset
# (first 10 rows of the table built in the PCA cell).
results['CHI']['explained_var'].head(10)

In [None]:
# Explained-variance ratio per principal component for the 'NYC' subset.
# This cell was an exact duplicate of the preceding 'CHI' cell, so the NYC
# results were never displayed; it now shows the other state.
results['NYC']['explained_var'].head(10)

In [None]:
# For each state, print the mode dummy columns present in the PCA input and
# then, for each of the first three components, the features with the
# largest absolute loadings.
pcs = ['PC1', 'PC2', 'PC3']
states = ['CHI', 'NYC']
top_n = 20  # number of features listed per component

for state in states:
    # Use a distinct name so the notebook-level `loadings` DataFrame created
    # by the PCA cell is not overwritten with a single-component Series.
    state_loadings = results[state]['loadings']

    # Which one-hot 'mode_' columns ended up in the feature matrix.
    print([feat for feat in state_loadings.index if 'mode_' in feat])

    for pc in pcs:
        print(state,pc)
        # Ranked by magnitude; note the printed values are absolute, so the
        # sign of each loading is not shown.
        top_features = state_loadings[pc].abs().sort_values(ascending=False).head(top_n)
        print(top_features)

MemoryError: Unable to allocate 264. GiB for an array with shape (188117, 188117) and data type float64