In [4]:
import pandas as pd
import numpy as np
import os

from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from tqdm import tqdm, trange

In [5]:
test_path = "./agren_40_long/emb_test.csv"
train_path = "./agren_40_long/emb.csv"

save_path = "./agren_40_long/emb_class_acc.csv"
n = 40

In [6]:
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(train_path)

labels = sorted(df_train['label'].unique())

df_train['label_n'] = df_train['label'].map(labels.index)
df_test['label_n'] = df_test['label'].map(labels.index)

classes = sorted(df_test['label_n'].unique())

In [7]:
df_train['label'].value_counts()

uterus,_post-menopause_-_glandular_cells_2      200
breast_-_adipocytes_2                           200
cerebral_cortex_-_neuropil_2                    200
breast_-_myoepithelial_cells_2                  200
ovary_-_ovarian_stroma_cells_2                  200
bone_marrow_-_hematopoietic_cells_2             200
kidney_-_cells_in_glomeruli_2                   200
fallopian_tube_-_glandular_cells_2              200
lung_-_pneumocytes_2                            200
small_intestine_-_glandular_cells_2             200
spleen_-_cells_in_white_pulp_2                  200
hippocampus_-_glial_cells_2                     200
seminal_vesicle_-_glandular_cells_2             200
bronchus_-_respiratory_epithelial_cells_2       200
uterus,_pre-menopause_-_glandular_cells_2       200
stomach,_lower_-_glandular_cells_2              200
kidney_-_cells_in_tubules_2                     200
urinary_bladder_-_urothelial_cells_2            200
nasopharynx_-_respiratory_epithelial_cells_2    200
vulva-anal_s

In [8]:
X_cols = sorted(filter(lambda c: "emb" in c, df_train.columns))
y_col = "label_n"

In [9]:
X_train = np.array(df_train[X_cols].values, dtype=float)
y_train = np.array(df_train[y_col].values, dtype=int)

X_test = np.array(df_test[X_cols].values, dtype=float)
y_test = np.array(df_test[y_col].values, dtype=int)

In [10]:
def get_classifier(c_type, max_depth=2):
    match c_type.lower():
        case "mlp":
            classifier = MLPClassifier()
        case "rf":
            classifier = RandomForestClassifier(max_depth=max_depth)
        case "nc":
            classifier = NearestCentroid()
        case other:
            raise NotImplemented
        
    classifier.fit(X_train, y_train)
    return classifier

def get_preds(c_type, max_depth=2, n=n):
    preds = []
    for _ in trange(n, desc=c_type):
        classifier = get_classifier(c_type, max_depth)
        y_pred = classifier.predict(X_test)
        preds.append(y_pred)
    return preds

In [11]:
c_preds = {
    "NC"   : get_preds("nc"),
    "MLP" : get_preds("mlp"),
    "RF-4"  : get_preds("rf", max_depth=4),
    "RF-2"  : get_preds("rf", max_depth=2),
}

print("Done!")

nc: 100%|██████████| 40/40 [00:01<00:00, 32.99it/s]
mlp:  72%|███████▎  | 29/40 [04:16<01:39,  9.06s/it]

In [None]:
acc_data = {
    c : []
    for c in c_preds
}
acc_data['label'] = []
acc_data['classifier_n'] = []

for i in trange(n):
    for label in labels:
        index = df_test['label'].values == label

        for c in c_preds:
            acc_data[c].append(np.mean(np.array(c_preds[c][i][index]) == y_test[index]))
        acc_data["label"].append(label)
        acc_data["classifier_n"] = i

acc_df = pd.DataFrame(acc_data)
acc_df

100%|██████████| 40/40 [00:00<00:00, 128.73it/s]


Unnamed: 0,NC,MLP,RF-4,RF-2,label,classifier_n
0,1.0,1.0,0.970588,0.823529,bone_marrow_-_hematopoietic_cells_2,39
1,1.0,1.0,1.000000,0.995098,breast_-_adipocytes_2,39
2,1.0,1.0,0.970588,0.931373,breast_-_myoepithelial_cells_2,39
3,1.0,1.0,0.941176,0.759804,bronchus_-_respiratory_epithelial_cells_2,39
4,1.0,1.0,1.000000,0.955882,cerebellum_-_cells_in_molecular_layer_2,39
...,...,...,...,...,...,...
1595,1.0,1.0,0.985294,0.980392,urinary_bladder_-_urothelial_cells_2,39
1596,1.0,1.0,0.985294,0.946078,"uterus,_post-menopause_-_glandular_cells_2",39
1597,1.0,1.0,0.950980,0.887255,"uterus,_pre-menopause_-_glandular_cells_2",39
1598,1.0,1.0,0.985294,0.843137,vagina_-_squamous_epithelial_cells_2,39


In [None]:
acc_df.to_csv(save_path, index=False)

In [None]:
print(labels)

['bone_marrow_-_hematopoietic_cells_2', 'breast_-_adipocytes_2', 'breast_-_myoepithelial_cells_2', 'bronchus_-_respiratory_epithelial_cells_2', 'cerebellum_-_cells_in_molecular_layer_2', 'cerebral_cortex_-_neuropil_2', 'cervix,_uterine_-_glandular_cells_2', 'colon_-_peripheral_nerve-ganglion_2', 'esophagus_-_squamous_epithelial_cells_2', 'fallopian_tube_-_glandular_cells_2', 'gallbladder_-_glandular_cells_2', 'heart_muscle_-_myocytes_2', 'hippocampus_-_glial_cells_2', 'kidney_-_cells_in_glomeruli_2', 'kidney_-_cells_in_tubules_2', 'lateral_ventricle_-_glial_cells_2', 'lateral_ventricle_-_neuronal_cells_2', 'liver_-_hepatocytes_2', 'lung_-_pneumocytes_2', 'nasopharynx_-_respiratory_epithelial_cells_2', 'ovary_-_follicle_cells_2', 'ovary_-_ovarian_stroma_cells_2', 'rectum_-_glandular_cells_2', 'salivary_gland_-_glandular_cells_2', 'seminal_vesicle_-_glandular_cells_2', 'skin_-_melanocytes_2', 'small_intestine_-_glandular_cells_2', 'soft_tissue_2_-_adipocytes_2', 'soft_tissue_2_-_fibrobla