In [1]:
import numpy as np
from numpy import linalg as LA
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics.pairwise import euclidean_distances
from scipy import stats
from sklearn.decomposition import PCA
import os
from sklearn.decomposition import PCA
from evaluation import *

In [2]:
# Goal: choose the alpha such that RT-zOGF (using ms_df) is highest.
# Read the accuracy table and the mu/sigma table from the current working directory.
cwd = os.getcwd()
df = pd.read_csv(f"{cwd}/7-29_acc_df.csv")
mu_sig_df = pd.read_csv(f"{cwd}/mu_sig_df.csv")
# Keep only the rows whose 'algorithm' is one of the four baseline methods.
baseline_df = df.loc[df['algorithm'].isin(['pca', 'tsne', 'pacmap', 'umap'])]

In [None]:
# Z-score the 'rt' and 'knn' columns of every row in `df` against the baseline
# rows in `baseline_df`, then average the two z-scores into RT-zOGF ('ogf').
metrics = ['rt', 'knn']
# metric -> (center, spread). Despite the name, the center stored here is the
# baseline *median* (not the mean); the spread is the baseline standard deviation.
mean_and_std = dict()

# `zdf` was previously assigned into without ever being created, which raises
# NameError on a fresh kernel. Start from a copy of `df` so the z-scored rows keep
# their other columns and `df` itself is left untouched.
# NOTE(review): confirm a full copy (rather than an empty frame) is the intended shape.
zdf = df.copy()

# metric way
for metric in metrics:
    metric_df = baseline_df[metric]
    mu = metric_df.median() # use median
    sigma = metric_df.std()
    print(metric, "mu =", mu, "sigma =", sigma)
    print("median =", mu)
    mean_and_std[metric] = (mu, sigma)

    metric_zscores = (df[metric] - mu) / sigma # compute z-scores across all methods
    zdf[metric] = metric_zscores

# NOTE: after this loop `mu`/`sigma` hold the values for the last metric ('knn');
# any later cell that reads `mu`/`sigma` must recompute them first.
zdf['ogf'] = 0.5*zdf['rt'] + 0.5*zdf['knn'] # compute RT-zOGF

In [19]:
# Center (median) and spread (standard deviation) of the summed kNN + RT score,
# taken over the baseline rows only.
rt_knn_df = baseline_df['knn'].add(baseline_df['rt'])
mu, sigma = rt_knn_df.median(), rt_knn_df.std()

In [3]:
# for each dataset:
# load PCA, UMAP embeddings of each dataset
# concatenate accordingly
# find RT, kNN of each embedding
# store RT, kNN, RT-kNN, and OGF in dataframe
# print out table showing alpha star versus other methods


# once we have all the datasets:
# print out table showing ideal alpha for each dataset
# print out new table

# need dataframe storing: 
# algorithm (alpha size), dataset, RT, kNN, RT-kNN, OGF

### just to check, mini Muraro example ###

In [None]:
# Sweep alpha over [0, 1]: for each dataset, blend the saved PCA and UMAP
# embeddings as concat((1 - alpha) * PCA, alpha * UMAP), reduce the blend with
# `reducer`, and score the result with random-triplet accuracy (RT), kNN
# accuracy, their sum, and the z-scored sum ('ogf').
#
# Names this cell reads from earlier cells: `cwd`, `mu`, `sigma` (must be the
# rt-plus-knn values, not the per-metric ones), `reducer`, `random_triplet_eval`
# and `knn_eval`.
# NOTE(review): `reducer` is not defined in any visible cell; define it (an object
# with a `fit_transform` method) before running this cell.

ALPHA_DECIMALS = 1   # rounding for a 0.1 step; use 2 for a 0.05 step
N_ALPHAS = 11        # number of alpha values in [0, 1]; use 21 for a 0.05 step
N_RT_REPEATS = 5     # random-triplet accuracy is averaged over this many runs
alpha_list = [str(np.round(item, ALPHA_DECIMALS)) for item in np.linspace(0, 1, N_ALPHAS)]
#datasets = ['CAFs', 'CellMix', 'Duo4eq', 'Duo8eq','FMNIST','Kang','MNIST','Muraro','TMLung','TMPanc']
datasets = ['Muraro']
results = []

# Datasets stored as one CSV whose last column is the label.
_LABELED_CSVS = {
    'CellMix': 'pcadata_CellMix.csv',
    'CellMixLA': 'pcadata_CellMix_WithLocAvg.csv',
    'TMLung': 'pcadata_TMLung.csv',
    'TMLungLA': 'pcadata_TMLung_WithLocAvg.csv',
    'TMPanc': 'pcadata_TMPanc.csv',
    'TMPancLA': 'pcadata_TMPanc_WithLocAvg.csv',
}

# Datasets stored as a (features, labels) pair of .npy files.
_NPY_PAIRS = {
    'Duo4eq': ('4eq_log_pca.npy', '4eq_labels.npy'),
    'Duo8eq': ('8eq_log_pca.npy', '8eq_labels.npy'),
    'Kang': ('kang_log_pca.npy', 'kang_labels.npy'),
    'Muraro': ('muraro_log_pca.npy', 'muraro_labels.npy'),
}


def _load_dataset(dataset, data_dir):
    """Return ``(X_train, y_train)`` for the named dataset.

    Parameters
    ----------
    dataset : str
        One of the dataset names handled below.
    data_dir : str
        Directory that holds the raw data files.

    Raises
    ------
    ValueError
        If ``dataset`` is not a known name. Previously an unknown name fell
        through silently and reused whatever ``X_train``/``y_train`` were left
        over from an earlier iteration.
    """
    if dataset in _NPY_PAIRS:
        x_file, y_file = _NPY_PAIRS[dataset]
        return np.load(f"{data_dir}/{x_file}"), np.load(f"{data_dir}/{y_file}")

    if dataset in _LABELED_CSVS:
        table = pd.read_csv(f'{data_dir}/{_LABELED_CSVS[dataset]}')
        return table.values[:, :-1], table.values[:, -1]

    if dataset == 'CAFs':
        # Tab-separated; last column is the label, features are log(x + 1) transformed.
        expr = pd.read_csv(f'{data_dir}/CAFs.txt', sep='\t')
        return np.log(expr.values[:, :-1] + 1), expr.values[:, -1]

    if dataset == 'FMNIST':
        source_df = pd.read_csv(f"{data_dir}/fashion-mnist.csv")
        # Drop the label column from the features: the old `iloc[:, :]` kept
        # "class" inside X_train, leaking the labels into the input data.
        features = source_df.drop(columns=["class"]).values.astype(np.float32)
        return features, source_df["class"].values

    if dataset == 'MNIST':
        # Imported here because no visible cell imports sklearn.datasets
        # (the old code referenced an undefined alias `ds`).
        from sklearn.datasets import fetch_openml
        mnist = fetch_openml('mnist_784', version=1, as_frame=False)
        return mnist.data, mnist.target.astype(int)

    raise ValueError(f"unknown dataset: {dataset!r}")


for dataset in datasets:
    # load PCA, UMAP embeddings
    emb_dir = f'{cwd}/embeddings/{dataset}'
    pcaM = np.load(f'{emb_dir}/{dataset}_pca.npy')
    umapM = np.load(f'{emb_dir}/{dataset}_umap.npy')

    # load Xtrain, ytrain (through a helper so loader temporaries no longer
    # overwrite the notebook-level `df` while the sweep is running)
    X_train, y_train = _load_dataset(dataset, f'{cwd}/data')

    for alpha in alpha_list:
        a = float(alpha) # alpha as float

        # scale UMAP, PCA embeddings by alpha, (1-alpha) respectively
        umapM_sc = umapM * (a) # sc for scaled
        pcaM_sc = pcaM * (1-a)
        # concatenate
        concat_emb = np.concatenate((pcaM_sc, umapM_sc), axis=1)
        # reduce
        y = reducer.fit_transform(concat_emb)
        y = y / LA.norm(y) # normalize scale to 1
        # Score the embedding computed for THIS alpha. The old code evaluated an
        # `embedding` variable that this loop never assigned.
        embedding = y

        rt_acc = np.zeros(N_RT_REPEATS,)
        for i in range(N_RT_REPEATS):
            rt_acc[i] = random_triplet_eval(X_train, embedding, y_train) #X_train, embedding, labels
        rt = np.mean(rt_acc)

        knn = knn_eval(embedding, y_train, n_neighbors = 5)

        results.append({
            'dataset': dataset,
            'alpha': alpha,
            'rt': rt,
            'knn': knn,
            'rt-knn': (rt + knn),  # /2 possibly
            'ogf': (rt + knn - mu)/sigma,
        })

# NOTE(review): this rebinds `df`, replacing the accuracy table loaded earlier;
# re-run the loading cell before re-running any cell that needs that table.
df = pd.DataFrame(results)
df