In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# pandas display option: set "display.max_columns" to 85.
pd.set_option("display.max_columns", 85)

# seaborn theme: "paper" context, fonts scaled by 1.5, "ticks" style, axes grid on.
sns.set_theme(
    context="paper",
    font_scale=1.5,
    style="ticks",
    rc={"axes.grid": True},
)


# Why does PCA at 100% explained variance behave so differently?

***
## Load the data

In [None]:
# Load the data from CSV (new data, not scaled); the first CSV column is the index.
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv", index_col=0)

# Positional range of the columns from "ACC" through "UVM" (both inclusive).
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)

# One chained clean-up instead of a series of in-place mutations:
df = (
    df.drop(columns=df.columns[cols])  # ignore everything from ACC to UVM
    .drop(columns=["TTT_freq", "oldest_phylostratum_factor", "gc_cds"])  # gc_cds is the new exclusion
    .dropna()  # discard rows containing NaNs
)

# Split the frame into the label, the two leading features and the rest.
resp = df["response"]
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"]
conf = df.drop(columns=["response", "occ_total_sum", "oldest_phylostratum"])

# Feature table: occ_total_sum, oldest_phylostratum, then every remaining column.
features_df = pd.concat([occ, age, conf], axis=1)

# Plain NumPy views of the features and labels for scikit-learn.
X = features_df.to_numpy()
y = resp.to_numpy()

features_df.head(5)

## Custom PCA

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Confounder variables (highly collinear) occupy every feature column from
# position `conf_index` to the end of the feature matrix.
conf_index = 2
conf_cols = np.arange(conf_index, X.shape[1])


class ConfounderPCA(BaseEstimator, TransformerMixin):
    """Apply PCA to a subset of columns and pass the other columns through.

    Custom transformer for this dataset: only the (many, collinear)
    confounder columns selected by ``cols`` are replaced by their principal
    components. Every column *not* in ``cols`` is kept unchanged and placed
    in front of the components, in its original order.

    Parameters
    ----------
    cols : array-like of int, slice or boolean mask
        Columns of ``X`` to which PCA is applied.
    n_components : int, float or None, default=None
        Forwarded unchanged to ``sklearn.decomposition.PCA``. If
        ``0 < n_components < 1``, the number of components is chosen such
        that the explained variance exceeds that fraction. ``None`` keeps
        all components.
    apply_PCA : bool, default=True
        If false, ``fit`` does nothing and ``transform`` returns ``X``
        untransformed.

    Attributes
    ----------
    pca_ : sklearn.decomposition.PCA
        The PCA fitted on ``X[:, cols]``. Only set by ``fit`` when
        ``apply_PCA`` is true. Also reachable through the read-only alias
        ``pca``.
    passthrough_cols_ : ndarray of int
        Positions of the columns that are not in ``cols``.
    """

    def __init__(self, cols, n_components=None, apply_PCA=True):
        # Only store the hyper-parameters here. The inner PCA is built in
        # fit() so that set_params(n_components=...) -- which is what
        # GridSearchCV does after clone() -- actually takes effect instead
        # of leaving a stale PCA created with the constructor-time value.
        self.n_components = n_components
        self.apply_PCA = apply_PCA
        self.cols = cols

    @property
    def pca(self):
        """Backward-compatible alias for ``pca_`` (``None`` before fitting)."""
        return getattr(self, "pca_", None)

    def fit(self, X, y=None):
        """Fit the inner PCA on the selected columns of ``X``; returns ``self``."""
        if self.apply_PCA:
            # Complement of `cols`: works for index arrays, slices and masks.
            keep = np.ones(X.shape[1], dtype=bool)
            keep[self.cols] = False
            self.passthrough_cols_ = np.flatnonzero(keep)

            # Built from the *current* n_components on every fit.
            self.pca_ = PCA(n_components=self.n_components).fit(X[:, self.cols])
        return self

    def transform(self, X, y=None):
        """Return ``[pass-through columns | principal components]``.

        Raises ``NotFittedError`` if ``apply_PCA`` is true and ``fit`` has
        not been called. Returns ``X`` itself when ``apply_PCA`` is false.
        """
        if not self.apply_PCA:
            return X

        # Local import: keeps this class self-contained in its notebook cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "pca_")
        X_pca = self.pca_.transform(X[:, self.cols])
        # Previously hard-coded as X[:, :2]; now derived from `cols`.
        return np.c_[X[:, self.passthrough_cols_], X_pca]


# Sanity check: standardise all features, run PCA (all components kept) on
# the confounder columns only, and plot the correlation matrix of the result.
X_scaled = StandardScaler().fit_transform(X)
X_decorrelated = ConfounderPCA(cols=conf_cols).fit_transform(X_scaled)

sns.heatmap(pd.DataFrame(X_decorrelated).corr())
plt.title("Correlation Matrix after PCA")
plt.show()

# Summary of the feature layout used by ConfounderPCA.
print(X.shape[1], "total features.")
print("Confounder columns start from index", conf_index, "of feature matrix.")
print("Non-confounders:", features_df.iloc[:, :conf_index].columns.tolist())

features_df

## Evaluate

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

# One seed for every run so the curves differ only by the PCA setting.
# (Previously the fractional settings used int(i*5), which truncates to 4 for
# all of them, while the 100% run used 42.)
SEED = 42

# X_train / y_train were used below but never defined in this notebook, so a
# fresh "Restart & Run All" failed with NameError.
# NOTE(review): the original split settings are not visible here; an 80/20
# stratified split is an assumption -- confirm against the intended protocol.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

fig, ax = plt.subplots(figsize=(12, 5))

# Explained-variance targets for the confounder PCA; None keeps all components.
for i in [0.9, 0.95, 0.99, 0.999, None]:

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", ConfounderPCA(cols=conf_cols, n_components=i)),
        ("lr", LogisticRegression(max_iter=2000)),
    ])
    clf.fit(X_train, y_train)

    # Permutation importance of each *input* feature of the whole pipeline,
    # measured as the change in ROC-AUC on the training data.
    imp = permutation_importance(
        clf,
        X_train,
        y_train,
        n_repeats=10,
        random_state=SEED,
        scoring="roc_auc",
        n_jobs=-1,
    )

    # Rank features by absolute mean importance, largest first.
    imp_df = pd.DataFrame(imp.importances_mean, columns=["Score"])
    imp_scores = (
        imp_df.sort_values(by="Score", key=abs, ascending=False).to_numpy().flatten()
    )

    label = "100% EV" if i is None else f"{i*100}% EV"
    # x-axis length follows the data instead of a hard-coded 83 features.
    sns.lineplot(x=np.arange(len(imp_scores)), y=imp_scores, label=label, ax=ax)

# Labels are loop-invariant, so set them once.
ax.set_title("LR Feature Importance by Absolute Effect on ROC-AUC Score")
ax.set_ylabel("ROC-AUC Score Impact")
ax.set_xlabel("Feature Rank")

plt.show()