In [1]:
import numpy as np
from numpy import linalg as LA
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics.pairwise import euclidean_distances
from scipy import stats
from sklearn.decomposition import PCA
import os
from sklearn import datasets as ds
from evaluation import *


In [7]:
from sklearn import datasets as ds

In [2]:
# goal: choose alpha ST RT-zOGF using ms_df is highest
# load dfs
cwd = os.getcwd()
mu_sig_df = pd.read_csv(f"{cwd}/mu_sig_df.csv")

In [3]:
rt_mu = mu_sig_df[mu_sig_df['metric']=='rt']['mean'].values[0]
rt_sig = mu_sig_df[mu_sig_df['metric']=='rt']['std'].values[0]
knn_mu = mu_sig_df[mu_sig_df['metric']=='knn']['mean'].values[0]
knn_sig = mu_sig_df[mu_sig_df['metric']=='knn']['std'].values[0]


In [3]:
# for each dataset:
# load PCA, UMAP embeddings of each dataset
# concatenate accordingly
# find RT, kNN of each embedding
# store RT, kNN, RT-kNN, and OGF in dataframe
# print out table showing alpha star versus other methods


# once have all the datasets:
# print out table showing ideal alpha for each dataset
# print out new table

# need dataframe storing: 
# algorithm (alpha size), dataset, RT, kNN, RT-kNN, OGF

### just to check, mini Muraro example ###

In [4]:
reducer = PCA(n_components=2, random_state=42)

In [8]:
alpha_list = [str(np.round(item,2)) for item in np.linspace(0,1,21)] # this rounding is for step size 0.1 -- for step size 0.05, change it to 2!!!
datasets = ['CAFs', 'CellMix', 'Duo4eq', 'Duo8eq','FMNIST','Kang','MNIST','Muraro','TMLung','TMPanc']
#datasets = ['Muraro']
results = []

for dataset in datasets:
    # load PCA, UMAP embeddings
    pcaM = umapM = np.load(f'{cwd}/embeddings/'+ dataset + '/'+ dataset + '_pca.npy')
    umapM = np.load(f'{cwd}/embeddings/'+ dataset + '/'+ dataset + '_umap.npy')

    # load Xtrain, ytrain
    if dataset == 'CAFs':
        expr = pd.read_csv(f'{cwd}/data/CAFs.txt', sep='\t')
        X_train = expr.values[:,0:(expr.shape[1]-1)]
        X_train = np.log(X_train + 1)
        y_train = expr.values[:,expr.shape[1]-1]
    
    elif dataset =='CellMix':
        df = pd.read_csv(f'{cwd}/data/pcadata_CellMix.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]

    elif dataset =='CellMixLA':
        df = pd.read_csv(f'{cwd}/data/pcadata_CellMix_WithLocAvg.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]
    
    elif dataset =='Duo4eq':
        X_train = np.load(f"{cwd}/data/4eq_log_pca.npy")
        y_train = np.load(f"{cwd}/data/4eq_labels.npy")

    elif dataset =='Duo8eq':
        X_train = np.load(f"{cwd}/data/8eq_log_pca.npy")
        y_train = np.load(f"{cwd}/data/8eq_labels.npy")

    elif dataset == 'FMNIST':
        source_df = pd.read_csv(f"{cwd}/data/fashion-mnist.csv")
        X_train = source_df.iloc[:,:].values.astype(np.float32)
        y_train = source_df["class"].values 

    elif dataset =='Kang':
        X_train = np.load(f"{cwd}/data/kang_log_pca.npy")
        y_train = np.load(f"{cwd}/data/kang_labels.npy")

    elif dataset =='MNIST':
        mnist = ds.fetch_openml('mnist_784', version=1, as_frame=False)
        X_train = mnist.data
        y_train = mnist.target.astype(int)
    
    elif dataset == 'Muraro':
        X_train = np.load(f"{cwd}/data/muraro_log_pca.npy")
        y_train = np.load(f"{cwd}/data/muraro_labels.npy")

    elif dataset =='TMLung':
        df = pd.read_csv(f'{cwd}/data/pcadata_TMLung.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]
    
    elif dataset =='TMLungLA':
        df = pd.read_csv(f'{cwd}/data/pcadata_TMLung_WithLocAvg.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]

    elif dataset =='TMPanc':
        df = pd.read_csv(f'{cwd}/data/pcadata_TMPanc.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]
    
    elif dataset =='TMPancLA':
        df = pd.read_csv(f'{cwd}/data/pcadata_TMPanc_WithLocAvg.csv')
        X_train = df.values[:,0:(df.shape[1]-1)]
        y_train = df.values[:,df.shape[1]-1]
            
    # get and plot embeddings
    for alpha in alpha_list:
        a = float(alpha) # alpha as float

        # scale UMAP, PCA embeddings by alpha, (1-alpha) respectively
        umapM_sc = umapM * (a) # sc for scaled
        pcaM_sc = pcaM * (1-a)
        # concatenate
        concat_emb = np.concatenate((pcaM_sc,umapM_sc), axis=1)
        # reduce
        y = reducer.fit_transform(concat_emb)
        embedding = y / LA.norm(y) # normalize scale to 1   

        result = dict()
        result['dataset'] = dataset
        result['alpha'] = alpha

        # RT
        rt_acc = np.zeros(5,)
        for i in range(5):
            rt_acc[i] = random_triplet_eval(X_train, embedding, y_train) #X_train, embedding, labels
        rt = np.mean(rt_acc)
        result['rt'] = rt

        # kNN
        knn = knn_eval(embedding, y_train, n_neighbors = 5)
        result['knn'] = knn

        # OGF
        zrt = (rt - rt_mu)/rt_sig
        zknn = (knn - knn_mu)/knn_sig
        result['ogf'] = 0.5*zrt + 0.5*zknn

        results.append(result)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,dataset,alpha,rt,knn,ogf
0,CAFs,0.0,0.789106,0.932961,0.619356
1,CAFs,0.05,0.780838,0.932961,0.553979
2,CAFs,0.1,0.785140,0.934358,0.592533
3,CAFs,0.15,0.782570,0.935754,0.576754
4,CAFs,0.2,0.785810,0.938547,0.611456
...,...,...,...,...,...
205,TMPanc,0.8,0.724598,0.971607,0.234926
206,TMPanc,0.85,0.720388,0.974377,0.210638
207,TMPanc,0.9,0.716842,0.975069,0.184854
208,TMPanc,0.95,0.708449,0.974377,0.116235


In [9]:
results_df.to_csv(f"{cwd}/big_alpha_df.csv",index=False)

### Latex Results ###

In [10]:
# make a table for each dataset
for d in datasets:
    tab = results_df[results_df['dataset'] == d]

    # Identify max values
    max_rt = tab['rt'].max()
    max_knn = tab['knn'].max()
    max_ogf = tab['ogf'].max()

    # Format values: bold max ones
    latex_rows = []
    for _, row in tab.iterrows():
        rt_val = f"\\textbf{{{row['rt']:.4f}}}" if row['rt'] == max_rt else f"{row['rt']:.4f}"
        knn_val = f"\\textbf{{{row['knn']:.4f}}}" if row['knn'] == max_knn else f"{row['knn']:.4f}"
        ogf_val = f"\\cellcolor{{hilite}}{row['ogf']:.4f}" if row['ogf'] == max_ogf else f"{row['ogf']:.4f}"
        latex_rows.append(f"& {row['alpha']} & {rt_val} & {knn_val} & {ogf_val} \\\\")

    # Combine into LaTeX tabular
    latex_table = "\\begin{tabular}{lccccc}\n \\textbf{{\\large " + d + "}} & $\\alpha$ & RT & kNN & OGF \\\\\n\\hline\n" + \
                "\n".join(latex_rows) + \
                "\n\\end{tabular}\\\\~\\\\"

    print(latex_table, '\n')

\begin{tabular}{lccccc}
 \textbf{{\large CAFs}} & $\alpha$ & RT & kNN & OGF \\
\hline
& 0.0 & 0.7891 & 0.9330 & 0.6194 \\
& 0.05 & 0.7808 & 0.9330 & 0.5540 \\
& 0.1 & 0.7851 & 0.9344 & 0.5925 \\
& 0.15 & 0.7826 & 0.9358 & 0.5768 \\
& 0.2 & 0.7858 & 0.9385 & 0.6115 \\
& 0.25 & 0.7909 & 0.9372 & 0.6476 \\
& 0.3 & 0.7880 & 0.9344 & 0.6155 \\
& 0.35 & 0.7933 & 0.9316 & 0.6479 \\
& 0.4 & 0.7885 & 0.9316 & 0.6100 \\
& 0.45 & 0.7920 & 0.9413 & 0.6691 \\
& 0.5 & 0.7854 & 0.9637 & 0.6901 \\
& 0.55 & 0.7927 & 0.9832 & 0.8111 \\
& 0.6 & 0.7915 & 0.9916 & 0.8291 \\
& 0.65 & 0.7930 & 0.9944 & 0.8496 \\
& 0.7 & \textbf{0.7944} & 0.9958 & \cellcolor{hilite}0.8656 \\
& 0.75 & 0.7850 & \textbf{0.9972} & 0.7960 \\
& 0.8 & 0.7882 & \textbf{0.9972} & 0.8207 \\
& 0.85 & 0.7859 & \textbf{0.9972} & 0.8030 \\
& 0.9 & 0.7827 & \textbf{0.9972} & 0.7774 \\
& 0.95 & 0.7757 & \textbf{0.9972} & 0.7222 \\
& 1.0 & 0.7761 & \textbf{0.9972} & 0.7253 \\
\end{tabular}\\~\\ 

\begin{tabular}{lccccc}
 \textbf{{\large CellM

In [41]:

#star_df = results_df.loc[[results_df['ogf'].idxmax()]]

In [11]:
# make df of alpha stars
star_df = pd.DataFrame()
for d in datasets:
    tab = results_df[results_df['dataset'] == d]
    star_df = pd.concat([star_df, tab.loc[[tab['ogf'].idxmax()]]], ignore_index=True)


In [12]:
star_df

Unnamed: 0,dataset,alpha,rt,knn,ogf
0,CAFs,0.7,0.794413,0.99581,0.865645
1,CellMix,0.05,0.815856,0.961277,0.922922
2,Duo4eq,0.0,0.787842,0.556585,-0.614251
3,Duo8eq,0.45,0.786169,0.615173,-0.437004
4,FMNIST,0.45,0.856462,0.563143,-0.050346
5,Kang,0.55,0.776075,0.908136,0.435616
6,MNIST,0.75,0.619818,0.961657,-0.625925
7,Muraro,0.45,0.773672,0.923313,0.465953
8,TMLung,0.1,0.838146,0.900662,0.902112
9,TMPanc,0.5,0.830055,0.894737,0.818878


In [13]:
star_df.to_csv(f"{cwd}/alpha_star_only_df.csv",index=False)

In [37]:
# table showing alpha star for each dataset -- can't get this one to work
latex_rows=[]
for d in datasets:
    tab = star_df[star_df['dataset'] == d]
    rt = tab['rt'].values[0]
    knn = tab['knn'].values[0]
    ogf = tab['ogf'].values[0]
    latex_rows.append(f" & {d} & {tab['alpha']} & {rt:.4f} & {knn:.4f} & {ogf:.4f} \\\\")

latex_table = "\\begin{tabular}{lccccc}\n & Dataset & $\\alpha*$ & RT & kNN & OGF \\\\\n\\hline\n" + \
            "\n".join(latex_rows) + \
            "\n\\end{tabular}\\\\~\\\\"

print(latex_table, '\n')

\begin{tabular}{lccccc}
 & Dataset & $\alpha*$ & RT & kNN & OGF \\
\hline
 & CAFs & 0    0.7
Name: alpha, dtype: object & 0.7944 & 0.9958 & 0.8656 \\
 & CellMix & 1    0.05
Name: alpha, dtype: object & 0.8159 & 0.9613 & 0.9229 \\
 & Duo4eq & 2    0.0
Name: alpha, dtype: object & 0.7878 & 0.5566 & -0.6143 \\
 & Duo8eq & 3    0.45
Name: alpha, dtype: object & 0.7862 & 0.6152 & -0.4370 \\
 & FMNIST & 4    0.45
Name: alpha, dtype: object & 0.8565 & 0.5631 & -0.0503 \\
 & Kang & 5    0.55
Name: alpha, dtype: object & 0.7761 & 0.9081 & 0.4356 \\
 & MNIST & 6    0.75
Name: alpha, dtype: object & 0.6198 & 0.9617 & -0.6259 \\
 & Muraro & 7    0.45
Name: alpha, dtype: object & 0.7737 & 0.9233 & 0.4660 \\
 & TMLung & 8    0.1
Name: alpha, dtype: object & 0.8381 & 0.9007 & 0.9021 \\
 & TMPanc & 9    0.5
Name: alpha, dtype: object & 0.8301 & 0.8947 & 0.8189 \\
\end{tabular}\\~\\ 



In [39]:
# table showing alpha star for each dataset -- this one works
latex_rows = []

for d in datasets:
    tab = results_df[results_df['dataset'] == d]
    max_ogf = tab['ogf'].max()
    alpha_star = tab.loc[tab['ogf'].idxmax(), 'alpha']
    
    latex_rows.append(f"& {d} & {alpha_star} & {max_ogf:.4f} \\\\")

# Combine into LaTeX tabular
latex_table = "\\begin{tabular}{lccccc}\n & Dataset & $\\alpha*$ & OGF \\\\\n\\hline\n" + \
            "\n".join(latex_rows) + \
            "\n\\end{tabular}\\\\~\\\\"

print(latex_table, '\n')

\begin{tabular}{lccccc}
 & Dataset & $\alpha*$ & OGF \\
\hline
& CAFs & 0.7 & 0.8656 \\
& CellMix & 0.05 & 0.9229 \\
& Duo4eq & 0.0 & -0.6143 \\
& Duo8eq & 0.45 & -0.4370 \\
& FMNIST & 0.45 & -0.0503 \\
& Kang & 0.55 & 0.4356 \\
& MNIST & 0.75 & -0.6259 \\
& Muraro & 0.45 & 0.4660 \\
& TMLung & 0.1 & 0.9021 \\
& TMPanc & 0.5 & 0.8189 \\
\end{tabular}\\~\\ 



In [42]:
zdf = pd.read_csv(f"{cwd}/8-7_RT-zOGF_df.csv")

In [46]:
baseline_zdf = zdf[zdf['algorithm'].isin(['pca', 'tsne', 'pacmap', 'umap'])]
baseline_zdf = baseline_zdf[['algorithm', 'dataset', 'ogf']]
star_df_algs = star_df.rename(columns={'alpha': 'algorithm'})
star_df_algs = star_df_algs[['algorithm', 'dataset', 'ogf']]

In [51]:
star_df_algs['algorithm'] = '$\\alpha$* = ' + star_df_algs['algorithm']

In [52]:
base_and_star = pd.concat([baseline_zdf,star_df_algs],ignore_index=True)

In [54]:
base_and_star.to_csv(f"{cwd}/baseline_and_alpha_star_df.csv",index=False)

In [68]:
base_and_star

Unnamed: 0,algorithm,dataset,ogf
0,pca,CAFs,0.610963
1,pca,CellMix,0.90468
2,pca,Duo4eq,-0.616468
3,pca,Duo8eq,-0.581712
4,pca,FMNIST,-0.146652
5,pca,Kang,-0.018019
6,pca,MNIST,-2.009746
7,pca,Muraro,-0.543417
8,pca,TMLung,0.885355
9,pca,TMPanc,0.67768


In [57]:
tab['ogf']

9     0.677680
19    0.266822
29    0.140986
39    0.141338
49    0.818878
Name: ogf, dtype: float64

In [74]:
print(tab['algorithm'])

0                 pca
10               tsne
20               umap
30             pacmap
40    $\alpha$* = 0.7
Name: algorithm, dtype: object


In [79]:
latex_rows = []
for d in datasets:
    tab = base_and_star[base_and_star['dataset'] == d]
    best_alg = tab.loc[tab['ogf'].idxmax(), 'algorithm']
    
    pca = tab.loc[tab['algorithm'] == 'pca', 'ogf'].item()
    umap = tab.loc[tab['algorithm'] == 'umap', 'ogf'].item()
    tsne = tab.loc[tab['algorithm'] == 'tsne', 'ogf'].item()
    pacmap = tab.loc[tab['algorithm'] == 'pacmap', 'ogf'].item()
    hybrid = tab.loc[tab['algorithm'].apply(lambda alg: alg.startswith("$")), 'ogf'].item()

    pca = f"\\cellcolor{{hilite}}{pca}" if best_alg=='pca' else f"{pca:.4f}"
    

    latex_rows.append(f"{d} & {pca} & {umap:.4f} & {tsne:.4f}  & {pacmap:.4f} & {hybrid:.4f}" )

latex_table = "\\begin{tabular}{lcccccc}\n Dataset & PCA & UMAP & t-SNE & PaCMAP & $\\alpha$* \\\\\n\\hline\n" + \
                "\n".join(latex_rows) + \
                "\n\\end{tabular}\\\\~\\\\"

print(latex_table, '\n')

\begin{tabular}{lcccccc}
 Dataset & PCA & UMAP & t-SNE & PaCMAP & $\alpha$* \\
\hline
CAFs & 0.6110 & 0.7191 & 0.7004  & 0.8215 & 0.8656
CellMix & 0.9047 & -0.0250 & 0.4995  & 0.8934 & 0.9229
Duo4eq & -0.6165 & -0.8070 & -0.7344  & -0.0559 & -0.6143
Duo8eq & -0.5817 & -0.9571 & -0.6280  & 0.0310 & -0.4370
FMNIST & -0.1467 & -0.3839 & -0.0324  & -0.3945 & -0.0503
Kang & -0.0180 & 0.1223 & 0.4096  & 0.6821 & 0.4356
MNIST & -2.0097 & -0.6585 & -0.5660  & -0.6887 & -0.6259
Muraro & -0.5434 & -0.5091 & 0.0426  & 0.3380 & 0.4660
TMLung & 0.8854 & 0.2952 & 0.6904  & 0.4835 & 0.9021
TMPanc & 0.6777 & 0.1410 & 0.2668  & 0.1413 & 0.8189
\end{tabular}\\~\\ 



In [None]:
# table showing OGF of alpha star compared to baselines
for d in datasets:
    tab = base_and_star[base_and_star['dataset'] == d]

    # Identify max values
    max_ogf = tab['ogf'].max()

    # Format values: bold max ones
    latex_rows = []
    for _, row in tab.iterrows():
        ogf_val = f"\\cellcolor{{hilite}}{row['ogf']:.4f}" if row['ogf'] == max_ogf else f"{row['ogf']:.4f}"
        latex_rows.append(f"& {row['algorithm']} & {rt_val} & {centr_val} & {knn_val} & {ogf_val} \\\\")

    # Combine into LaTeX tabular
    latex_table = "\\begin{tabular}{lccccc}\n \\textbf{{\\large " + d + "}} & Algorithm & RT & CT & kNN & OGF \\\\\n\\hline\n" + \
                "\n".join(latex_rows) + \
                "\n\\end{tabular}\\\\~\\\\"

    print(latex_table, '\n')