# Milestone II PCA Analysis

### Primary Component Analysis 

In [21]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from scipy.sparse import hstack

In [3]:
data = pd.read_csv('data/final_cleaned.csv', dtype={1: str})

In [19]:
n_components = 10
states = ['CHI', 'NYC']
results = {}

for state in states:
    # Check each state individually
    df_state = data[data['state'] == state].copy().dropna().reset_index(drop=True)

    # Define columns
    exclude_cols = ['date', 'state', 'daily_ridership', 'unit_id', 'mode']
    feature_cols = [col for col in df_state.columns if col not in exclude_cols]

    # Truncated SVD on unit_id
    ohe_unit = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    unit_sparse = ohe_unit.fit_transform(df_state[['unit_id']])
    svd = TruncatedSVD(n_components=20, random_state=42)
    unit_svd = svd.fit_transform(unit_sparse)
    unit_svd_df = pd.DataFrame(unit_svd, index=df_state.index, columns=[f'unit_svd_{i+1}' for i in range(20)])

    # One-hot encode mode
    mode_dummies = pd.get_dummies(df_state['mode'], prefix='mode', drop_first=True)

    # Scale weather + AQI features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state[feature_cols])
    scaled_df = pd.DataFrame(X_scaled, index=df_state.index, columns=feature_cols)

    # Combine unit SVD, one-hot, and scaled features
    X_final = pd.concat([unit_svd_df, mode_dummies.reset_index(drop=True), scaled_df], axis=1)
    #X_final = X_final.sample(100)
    
    # Run PCA
    Kpca = KernelPCA(n_components=n_components)
    X_train_pca = Kpca.fit_transform(X_final)

    
    # Get eigenvalues for features, set up for feature importance
    eigenvalues = Kpca.eigenvalues_
    feature_loadings = np.zeros((X_final.shape[1], n_components))
    
    # Get correlations for each component
    for i in range(n_components):
        pc_values = X_train_pca[:,i]
        for j, feature_name in enumerate(X_final.columns):
            feature_values = X_final.iloc[:,j].values
            correlation = np.corrcoef(pc_values, feature_values)[0,1]
            feature_loadings[j, i] = correlation if not np.isnan(correlation) else 0
    
    # Get top features, create df with them
    feature_importance_scores = np.sum(np.abs(feature_loadings), axis=1)

    feature_importance_df = pd.DataFrame({
        'feature': X_final.columns,
        'importance_score': feature_importance_scores
    }).sort_values('importance_score', ascending=False)
    
    top_features = feature_importance_df
    
    # Add individual component loadings to top features
    loading_cols = [f'PC{i+1}_loading' for i in range(n_components)]
    for i, col in enumerate(loading_cols):
        top_features[col] = [feature_loadings[X_final.columns.get_loc(feat), i] 
                            for feat in top_features['feature']]

    top_features.to_csv("top_features.csv", index=False)
    
    print("Kernel PCA Results:")
    print(f"Eigenvalues: {eigenvalues}")
    print(f"\nTop {top_features} features:")
    print(top_features.to_string(index=False))


    # Save PCA transformed data and model output
    joblib.dump(X_train_pca, "X_train_pca.pkl")
    joblib.dump(kpca.eigenvalues_, "eigenvalues.pkl")
    joblib.dump(feature_loadings, "feature_loadings.pkl")
    feature_importance_df.to_csv("top_features.csv", index=False)

    results = (X_train_pca, eigenvalues, top_features, feature_loadings)

MemoryError: Unable to allocate 264. GiB for an array with shape (188117, 188117) and data type float64