Condense each feature set into X, y and run a simple cross-validated logistic regression on each one, reporting the test accuracy and plotting the ROC curves

In [None]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import sys
import umap
from sklearn.linear_model import LogisticRegressionCV, LassoCV
import torch
from torchmetrics import ROC, AUROC
sys.path.append("../src")
from datasets import *
from models import *

# Path to the UCSF-PDGM metadata CSV that is handed to condense_dataset.
metadata_path = "../Data/UCSF-PDGM-metadata_v2.csv"


def y_fn(metadata, ID):
    """Label function: forwards to alive_fn with a fixed third argument of 365.

    NOTE(review): 365 is presumably a survival threshold in days -- confirm
    against alive_fn's definition.
    """
    return alive_fn(metadata, ID, 365)


# Alternative label function, kept as a manual toggle:
#y_fn = days_alive_fn

In [None]:
# Fit one cross-validated logistic regression per feature set, evaluate it on
# a shared held-out split, and collect the results in `data`.

# `pickle` is not imported explicitly in the visible part of this notebook
# (presumably it arrived through a star import); import it here so this cell
# does not depend on that.
import pickle

np.random.seed(0)  # fixed seed so the shuffled split below is reproducible
perc = 0.9  # Fraction (not percent) of the samples used for training
perm_train = np.array([])  # shuffled row indices of the training split
perm_test = np.array([])  # shuffled row indices of the test split

features = ["alpha_total", "connectedcomponents", "cubical_total", "d2", "shapehist", "shapehistpca", "shapeshellhist", "spinimages"]
# Per-feature results, keyed by feature name. Each entry holds "X_train",
# "y_train", "roc", "clf" and "score".
data = {}
roc = ROC(task="binary")
for feature in features:
    X, y = condense_dataset("../preprocessed/{}".format(feature), metadata_path, y_fn)
    if perm_train.size == 0:
        # Build the train/test split once, from the first feature set, and
        # reuse the same row indices for every later feature.
        # NOTE(review): this assumes every feature set has the same rows in
        # the same order -- confirm against condense_dataset.
        N = int(perc * X.shape[0])
        perm = np.random.permutation(X.shape[0])
        perm_train = perm[:N]
        perm_test = perm[N:]

    X_train, y_train = X[perm_train, :], y[perm_train]
    X_test, y_test = X[perm_test, :], y[perm_test]
    data[feature] = {"X_train": X_train, "y_train": y_train}

    # Cache the full condensed dataset. The context manager closes the file
    # even if pickling fails part-way through.
    with open("../{}.pkl".format(feature), "wb") as pkl_file:
        pickle.dump({"X": X, "y": y}, pkl_file)

    clf = LogisticRegressionCV(cv=5, random_state=0).fit(X_train, y_train)
    score = clf.score(X_test, y_test)  # accuracy on the held-out rows

    # torchmetrics works on tensors: column 1 of predict_proba is the
    # predicted probability of the second class in clf.classes_.
    pred = torch.from_numpy(np.array(clf.predict_proba(X_test)[:, 1]))
    target = torch.from_numpy(np.array(y_test, dtype=int))

    # Calling the metric returns a 3-tuple for this batch; a later cell
    # unpacks it as (fp, tp, _).
    data[feature]["roc"] = roc(pred, target)
    data[feature]["clf"] = clf
    data[feature]["score"] = score
    print(feature, score)

In [None]:
# Build an HTML summary table (feature name, dimensionality, test accuracy,
# ROC plot) from `data`, show it inline, and save it to results.html.
s = "<table><tr><td><h1>Feature</h1></td><td><h1>Num Dimensions</h1></td><td><center><h1>Accuracy</h1></center></td><td><center><h1>ROC</h1></center></td></tr>"
rows = []
for key, result in data.items():
    dim = result["X_train"].shape[1]  # number of feature columns
    (fp, tp, _) = result["roc"]
    # NOTE(review): get_roc_image_html presumably returns an HTML snippet
    # embedding the ROC plot, with (4, 4) as the figure size -- confirm
    # against its definition.
    roc_img = get_roc_image_html(fp, tp, key, (4, 4))
    score = result["score"]
    rows.append("<tr><td><center><h3>{}</h3></center></td><td><center><h3>{}</h3></center></td><td><center><h3>{:.3f}</h3></center></td><td>{}</td></tr>\n".format(key, dim, score, roc_img))
# Join once instead of growing the string inside the loop.
s += "".join(rows)
s += "</table>"
display(HTML(s))
# The context manager closes the file even if the write raises.
with open("results.html", "w") as fout:
    fout.write(s)

In [None]:
# For each feature, print the mean over axis 0 of the per-row maximum of the
# classifier's cross-validation score grid.
for key, entry in data.items():
    clf = entry["clf"]
    # scores_ is a dict keyed by class label; True selects the positive class.
    # NOTE(review): the grid is presumably (n_folds, n_Cs), so this is the
    # best score per fold averaged over folds -- confirm in the sklearn docs.
    scores = clf.scores_[True]
    best_per_row = np.max(scores, axis=1)
    print(key, np.mean(best_per_row))

In [None]:
# Show the shape of the coefficient array of whichever classifier `clf` was
# bound to last.
np.shape(clf.coef_)

In [None]:
# Disabled exploratory cell: the code below is wrapped in a string literal, so
# none of it executes. If re-enabled it would project X to 2D, first with PCA
# and then with UMAP, and scatter-plot each embedding coloured by y.
"""
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
z = pca.fit_transform(X)
plt.figure()
plt.scatter(z[:, 0], z[:, 1], c=y)


reducer = umap.UMAP(random_state=42)
reducer.fit(X)
z = reducer.transform(X)
plt.figure()
plt.scatter(z[:, 0], z[:, 1], c=y)
"""