In [3]:
# PCR ANALYSIS
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load Merged DF
Merged_Data = pd.read_excel(
    r"C:/Users/tejas/Downloads/Merged_Data.xlsx",
    sheet_name="Sheet1"
)

# Walkability predictor
walkability_cols = [
    "pct_low_wage_emp", "pct_med_wage_emp", "pct_hi_wage_emp",
    "pct_low_wage_wrk", "pct_med_wage_wrk", "pct_hi_wage_wrk",
    "0_autos_pct", "1_autos_pct", "2_autos_pct",
    "wtd_WrkAge_pop_pct", "wtd_avg_walk_index"
]

# Data Cleaning
df = Merged_Data[walkability_cols + ["Obesity among adults"]].copy()

df["Obesity_clean"] = (
    df["Obesity among adults"]
      .astype(str)        # ensure string
      .str.strip()        # remove spaces
      .str.rstrip("%")    # drop % sign if present
)

df["Obesity_clean"] = pd.to_numeric(df["Obesity_clean"], errors="coerce")
df = df.dropna(subset=walkability_cols + ["Obesity_clean"])

X = df[walkability_cols]
y = df["Obesity_clean"]     

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA Analysis

n_components = min(5, X_scaled.shape[1])
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

print("Explained variance for selected components:")
for i, ev in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ev:.4f}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nPCR Model Results:")
print(f"R-squared: {r2:.4f}")
print(f"MSE: {mse:.4f}")

pc_names = [f"PC{i+1}" for i in range(n_components)]
coeffs = pd.Series(model.coef_, index=pc_names)

print("\nRegression coefficients on PCs:")
print(coeffs)

loading_matrix = pd.DataFrame(
    pca.components_.T,          # predictors x components
    index=walkability_cols,
    columns=pc_names
)

contrib = loading_matrix.mul(coeffs.values, axis=1).sum(axis=1)
contrib = contrib.sort_values(ascending=False)

print("\nFeature contributions to Obesity (PCR interpretation):")
print(contrib)

Explained variance for selected components:
PC1: 0.4509
PC2: 0.1777
PC3: 0.0918
PC4: 0.0833
PC5: 0.0741

PCR Model Results:
R-squared: 0.0746
MSE: 7.0882

Regression coefficients on PCs:
PC1   -0.659696
PC2   -0.406627
PC3   -0.405835
PC4   -0.119786
PC5   -0.685819
dtype: float64

Feature contributions to Obesity (PCR interpretation):
1_autos_pct           0.628149
pct_low_wage_wrk      0.288955
pct_med_wage_wrk      0.214399
0_autos_pct           0.153670
pct_hi_wage_emp       0.073366
pct_med_wage_emp      0.038303
wtd_WrkAge_pop_pct   -0.034723
pct_low_wage_emp     -0.163211
pct_hi_wage_wrk      -0.281106
wtd_avg_walk_index   -0.515249
2_autos_pct          -0.568418
dtype: float64
