In [62]:
from Bio import SeqIO
def records(path):
    """Parse a FASTA file and return all of its records as a list.

    :param path: value handed unchanged to ``SeqIO.parse`` as the input source.
    :return: list of everything ``SeqIO.parse(path, "fasta")`` yields.
    """
    # Materialise the parser's iterator so callers can iterate it repeatedly.
    return [entry for entry in SeqIO.parse(path, "fasta")]

In [63]:
def ids_from_gopredsim_annotation(pth):
    """Collect the distinct identifiers found in an annotation file.

    The identifier of a line is its first whitespace-separated token.
    Blank or whitespace-only lines are skipped (indexing their empty token
    list would raise ``IndexError``).  The number of distinct identifiers is
    printed as a side effect.

    :param pth: path of the text file to read.
    :return: set of identifier strings.
    """
    ids = set()
    with open(pth, 'r') as opened:
        # Stream line by line instead of loading the whole file at once.
        for line in opened:
            fields = line.split()
            if fields:
                ids.add(fields[0])
    print(len(ids))
    return ids

# Determine the sets of train and test ids

This is the training set originally provided in the CAFA3 challenge, although one could also use other information.

In [64]:
# Ids of every record in the CAFA3 training FASTA, as plain strings.
orig_cafa3_file = "uniprot_sprot_exp.fasta"
orig_cafa3_ids_set = {str(record.id) for record in records(orig_cafa3_file)}
len(orig_cafa3_ids_set)

66841

This is the training set prepared by goPredSim developers,
closely matching the CAFA3 set, with a corresponding temporal cutoff

In [65]:
# Ids taken from the first column of the annotation file
# (ids_from_gopredsim_annotation also prints how many distinct ids it found).
gopredsim_file = "goa_annotations_exp_2017.txt"
gopredsim_ids_set = ids_from_gopredsim_annotation(gopredsim_file)

68038


We take their intersection

In [66]:
# Training ids = ids present in both the CAFA3 FASTA and the annotation file.
train_ids = orig_cafa3_ids_set & gopredsim_ids_set

In [67]:
len(train_ids)

62626

For the test set, we take the set of proteins which got new annotations in the CAFA3 review period

In [68]:
# Test ids = ids of every record in the CAFA3 targets FASTA.
test_ids = {str(record.id) for record in records("cafa3_targets.fasta")}

In [69]:
len(test_ids)

3328

# Prepare the sets of embeddings in format required by goPredSim

In [70]:
# Directory holding the input embedding files read by prepare_separate and
# prepare_from_h5.
embeddings_folder = "prepared_embeddings"

In [71]:
import numpy as np
import os
def get_dict_from_separate_files(embeddings_file, ids_file):
    """Build an ``id -> embedding`` dict from two parallel ``.npy`` files.

    :param embeddings_file: path of the ``.npy`` file holding the embeddings.
    :param ids_file: path of the ``.npy`` file holding the ids; entry ``k``
        is used as the key for entry ``k`` of ``embeddings_file``.
    :return: dict mapping each id to its embedding.
    :raises ValueError: if the two files hold a different number of entries
        (``zip`` would otherwise silently drop the surplus ones).
    """
    # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code -- only load files from a trusted source.
    embeddings = np.load(embeddings_file, allow_pickle=True)
    ids = np.load(ids_file, allow_pickle=True)
    if len(ids) != len(embeddings):
        raise ValueError(
            f"{ids_file} holds {len(ids)} ids but {embeddings_file} holds "
            f"{len(embeddings)} embeddings"
        )
    return dict(zip(ids, embeddings))

In [72]:
def get_subset(source, ids, allow_missing=False):
    """Return the entries of ``source`` whose keys are listed in ``ids``.

    Every id that is absent from ``source`` is reported on stdout.

    :param source: mapping to pick entries from.
    :param ids: iterable of keys to look up.
    :param allow_missing: when true, absent ids are skipped after being
        reported; when false (default) the ``KeyError`` is re-raised.
    :return: new dict with the entries that were found.
    :raises KeyError: for the first absent id when ``allow_missing`` is false.
    """
    subset = {}
    for key in ids:
        try:
            value = source[key]
        except KeyError:
            print(f"missing embedding for {key}")
            if allow_missing:
                continue
            raise
        subset[key] = value
    return subset

In [73]:
import pickle
def save_dict(di, name):
    """Pickle ``di`` into the file ``<name>.pkl``.

    :param di: object to serialise.
    :param name: output path without the ``.pkl`` suffix, which is appended here.
    """
    target = f"{name}.pkl"
    with open(target, 'wb') as handle:
        pickle.dump(di, handle)

In [74]:
def prepare_separate(model, split):
    """Pickle the train/test subset of a model's embeddings stored as two .npy files.

    Reads ``<model>_<split>_embeddings.npy`` and ``<model>_<split>_ids.npy``
    from ``embeddings_folder``, keeps the ids of the global ``train_ids`` or
    ``test_ids`` set (depending on ``split``) and writes the result to
    ``<model>.<split>.pkl``.

    :param model: model name used in the input and output file names.
    :param split: ``"train"`` or ``"test"``.
    :raises AssertionError: if ``split`` is neither of the two.
    :raises KeyError: if a wanted id has no embedding (``get_subset`` is called
        with its default ``allow_missing=False``).
    """
    assert split in ("train", "test")
    ids_path = os.path.join(embeddings_folder, f"{model}_{split}_ids.npy")
    embeddings_path = os.path.join(embeddings_folder, f"{model}_{split}_embeddings.npy")
    all_embeddings = get_dict_from_separate_files(embeddings_path, ids_path)
    if split == "train":
        wanted_ids = train_ids
    else:
        wanted_ids = test_ids
    selected = get_subset(all_embeddings, wanted_ids)
    save_dict(selected, f"{model}.{split}")

In [75]:
import h5py
import numpy as np
import tqdm

# Code based on that of goPredSim
def read_h5_embeddings(embeddings_in):
    """
    Read embeddings from h5 file generated by bio_embeddings pipeline.

    Each dataset in the file becomes one dict entry: the key is the dataset's
    ``original_id`` attribute (not its name inside the file) and the value is
    the dataset converted with ``np.array``.

    :param embeddings_in: path of the h5 file to read
    :return: dict mapping ``original_id`` to the embedding array
    """
    with h5py.File(embeddings_in, 'r') as h5_file:
        # tqdm only wraps the iteration to show a progress bar.
        return {
            dataset.attrs['original_id']: np.array(dataset)
            for _name, dataset in tqdm.tqdm(h5_file.items())
        }

In [76]:
def prepare_from_h5(model, split, allow_missing=False):
    """Pickle the train/test subset of a model's embeddings stored in an h5 file.

    The input is ``<model>_goa_2017.h5`` for the train split and
    ``<model>_cafa3_targets.h5`` for the test split, both inside
    ``embeddings_folder``.  Entries whose id is in the global ``train_ids`` /
    ``test_ids`` set are written to ``<model>.<split>.pkl``.

    :param model: model name used in the input and output file names.
    :param split: ``"train"`` or ``"test"``.
    :param allow_missing: forwarded to ``get_subset``; when true, ids without
        an embedding are reported and skipped instead of raising ``KeyError``.
    :raises AssertionError: if ``split`` is neither ``"train"`` nor ``"test"``.
    """
    assert split in ("train", "test")
    is_train = split == "train"
    h5_name = f"{model}_goa_2017.h5" if is_train else f"{model}_cafa3_targets.h5"
    data_path = os.path.join(embeddings_folder, h5_name)

    all_embeddings = read_h5_embeddings(data_path)
    wanted_ids = train_ids if is_train else test_ids
    selected = get_subset(all_embeddings, wanted_ids, allow_missing=allow_missing)
    save_dict(selected, f"{model}.{split}")

proteinbert

In [48]:
# Writes proteinbert.train.pkl and proteinbert.test.pkl to the working directory.
prepare_separate("proteinbert", "train")
prepare_separate("proteinbert", "test")

protbert

In [49]:
# Writes protbert.train.pkl and protbert.test.pkl to the working directory.
prepare_separate("protbert", "train")
prepare_separate("protbert", "test")

esm2

In [13]:
import pickle
def esm2_filter(file, layer=33):
    """Rewrite a pickled dict in place, keeping one element of every value.

    The pickle at ``file`` must hold a dict; each value ``o`` is replaced by
    ``o[layer]`` and the resulting dict is written back to the same path.

    The rewrite is destructive and not idempotent: calling this a second time
    on the same file indexes the already-reduced values again.

    :param file: path of the pickle file to read and overwrite.
    :param layer: key/index selected from every value.  Defaults to 33, the
        value that used to be hard-coded here.
        NOTE(review): presumably the ESM-2 representation layer -- confirm.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only use this on pickles produced by this project.
    with open(file, 'rb') as opened:
        di = pickle.load(opened)
    out = {i: o[layer] for i, o in di.items()}
    with open(file, 'wb') as fp:
        pickle.dump(out, fp)

In [50]:
# Writes esm2.train.pkl and esm2.test.pkl to the working directory.
prepare_separate("esm2", "train")
prepare_separate("esm2", "test")

In [25]:
import pickle
def esm2_filter2(file):
    """Rewrite a pickled dict in place, converting every value with ``.numpy()``.

    The pickle at ``file`` must hold a dict whose values expose a ``numpy()``
    method (presumably torch tensors -- TODO confirm).  The converted dict is
    written back to the same path, so the original content is lost.

    :param file: path of the pickle file to read and overwrite.
    """
    with open(file, 'rb') as source:
        tensors = pickle.load(source)
    arrays = {}
    for identifier, tensor in tensors.items():
        arrays[identifier] = tensor.numpy()
    with open(file, 'wb') as sink:
        pickle.dump(arrays, sink)

In [17]:
# In-place rewrite of both esm2 pickles: every value v becomes v[33].
# Not idempotent -- a second run would index the reduced values again.
esm2_filter("esm2.train.pkl")
esm2_filter("esm2.test.pkl")

In [None]:
# In-place rewrite of both esm2 pickles: every value v becomes v.numpy().
# Not idempotent -- the rewritten values are whatever .numpy() returned.
esm2_filter2("esm2.train.pkl")
esm2_filter2("esm2.test.pkl")

prott5

In [76]:
# allow_missing=True: train ids without an embedding in the h5 file are
# reported ("missing embedding for ...") and skipped instead of raising.
prepare_from_h5("prott5", "train", allow_missing=True)

100%|██████████| 307278/307278 [01:27<00:00, 3501.78it/s]


missing embedding for A2ASS6
missing embedding for G4SLH0
missing embedding for Q9I7U4
missing embedding for Q8WZ42
missing embedding for Q8WXI7
missing embedding for Q09165


In [77]:
# Default allow_missing=False: a test id without an embedding raises KeyError.
prepare_from_h5("prott5", "test")

100%|██████████| 3328/3328 [00:00<00:00, 3546.41it/s]


seqvec

In [79]:
# Train split tolerates ids without an embedding (reported and skipped);
# the test split uses the strict default and raises KeyError on a missing id.
prepare_from_h5("seqvec", "train", allow_missing=True)
prepare_from_h5("seqvec", "test")

100%|██████████| 307278/307278 [01:32<00:00, 3330.10it/s]


missing embedding for A2ASS6
missing embedding for G4SLH0
missing embedding for Q9I7U4
missing embedding for Q8WZ42
missing embedding for Q8WXI7
missing embedding for Q09165


100%|██████████| 3328/3328 [00:01<00:00, 2725.63it/s]


preexisting protbert

In [77]:
# Train split tolerates ids without an embedding (reported and skipped);
# the test split uses the strict default and raises KeyError on a missing id.
prepare_from_h5("theirprotbert", "train", allow_missing=True)
prepare_from_h5("theirprotbert", "test")

100%|██████████| 307278/307278 [01:35<00:00, 3212.55it/s]


missing embedding for Q09165
missing embedding for Q9I7U4
missing embedding for Q8WZ42
missing embedding for G4SLH0
missing embedding for A2ASS6
missing embedding for Q8WXI7


100%|██████████| 3328/3328 [00:01<00:00, 3231.66it/s]


In [63]:
import pickle
def load(file):
    """Unpickle and return the object stored at ``file``.

    :param file: path of the pickle file to read.
    :return: the unpickled object.
    """
    # NOTE: a later cell runs `from yaml import load, dump`, which rebinds the
    # name `load`; re-run this cell before calling it after that import.
    with open(file, 'rb') as handle:
        contents = pickle.load(handle)
    return contents

In [64]:
# save_dict writes "<model>.<split>.pkl" into the working directory, so this
# assumes the pickles were moved into project/ beforehand -- TODO confirm.
x = load("project/theirprotbert.test.pkl")

In [66]:
import numpy as np
# Norm (np.linalg.norm with default arguments) of every stored embedding.
lens = [np.linalg.norm(arr) for arr in x.values()]

In [67]:
lens

[1.9333407,
 1.7303805,
 1.4600827,
 1.5873901,
 1.3586987,
 1.4717962,
 1.6300994,
 1.6050678,
 1.7198772,
 1.7099538,
 1.551347,
 1.7179743,
 1.5142007,
 2.2130406,
 1.169941,
 1.6037604,
 1.7348386,
 2.2893074,
 1.6679951,
 1.8446026,
 1.8347366,
 1.7651032,
 1.6979343,
 2.1512818,
 1.8470639,
 1.8877944,
 1.9986861,
 1.7125064,
 1.4372009,
 1.9364086,
 1.998567,
 1.5888938,
 1.9102566,
 1.7622368,
 1.2822216,
 1.6654811,
 1.4809779,
 3.051137,
 1.3258262,
 3.2046516,
 1.4510617,
 2.0072434,
 2.419381,
 1.4125732,
 2.0263689,
 1.7718647,
 1.9581747,
 1.3034966,
 1.6556896,
 2.0884104,
 1.9208376,
 1.6679215,
 1.7957593,
 1.5060405,
 1.8769077,
 2.1537368,
 2.0325866,
 2.0905266,
 1.9344875,
 1.5757715,
 1.3635778,
 1.7496706,
 1.5321747,
 1.8782963,
 2.0566115,
 1.6598191,
 2.0862577,
 1.7946155,
 2.394747,
 1.4561296,
 1.2275468,
 2.257842,
 1.9462242,
 2.1134107,
 1.3143905,
 1.2325059,
 1.3569176,
 1.6312116,
 1.7050123,
 1.5559263,
 2.4675303,
 1.7414445,
 1.8890586,
 1.4374515,

In [92]:
next(iter(load("project/theirprotbert.test.pkl").values())).shape

(1024,)

In [93]:
next(iter(load("project/protbert.test.pkl").values())).shape

(1024,)

In [80]:
# Prepare the set of annotations for our train set

In [83]:
def filter_annotations(annotations_file, id_set, output_file):
    """Copy the lines of ``annotations_file`` whose id is in ``id_set``.

    The id of a line is its first whitespace-separated token.  Matching lines
    are written unchanged to ``output_file`` (which is overwritten).  Blank or
    whitespace-only lines have no id and are skipped; indexing their empty
    token list would raise ``IndexError``.

    :param annotations_file: path of the input text file.
    :param id_set: container of ids to keep (membership is tested with ``in``).
    :param output_file: path of the file to write.
    """
    with open(annotations_file, 'r') as in_file, open(output_file, 'w') as out_file:
        # Stream the input instead of loading it whole with readlines().
        for line in in_file:
            fields = line.split()
            if fields and fields[0] in id_set:
                out_file.write(line)  # line already carries its own newline

In [84]:
# Keep only the annotation lines whose first column is one of train_ids.
filter_annotations("goa_annotations_exp_2017.txt", train_ids, "project_annotations.txt")

In [83]:
def produce_configs(models=("theirprotbert",), config_dir="configs"):
    """Write one ``<model>_config.txt`` file per model into ``config_dir``.

    Each file holds the fixed ``key: value`` lines below, with the model name
    substituted into the lookup-set, targets and output paths.

    :param models: model names to generate configs for; a single string is
        treated as one model.  The default matches the previously hard-coded
        tuple; other names used in this notebook are "prott5", "seqvec",
        "esm2", "protbert" and "proteinbert".
    :param config_dir: directory to write into.  It is created when missing,
        so the function no longer fails with ``FileNotFoundError`` on a
        fresh checkout.
    """
    if isinstance(models, str):
        # Guard against iterating over the characters of a single name.
        models = (models,)
    os.makedirs(config_dir, exist_ok=True)
    for model in models:
        lines = [
            "go: data/GO/go_cafa3.obo",
            f"lookup_set: project/{model}.train.pkl",
            "annotations: project_annotations.txt",
            f"targets: project/{model}.test.pkl",
            "onto: all",
            "thresh: 1",
            "modus: num",
            f"output: results/{model}",
        ]
        with open(os.path.join(config_dir, f"{model}_config.txt"), 'w') as config_file:
            config_file.write("\n".join(lines) + "\n")

In [84]:
# Writes configs/theirprotbert_config.txt (the default model of produce_configs).
produce_configs()

In [33]:
from yaml import load, dump

In [40]:
import os
import yaml

In [85]:
def produce_assess(models=("theirprotbert",), config_dir="assess_configs"):
    """Write one YAML assessment config per (model, ontology) pair.

    For every model and each of the ontologies BPO, MFO and CCO a file
    ``<config_dir>/<model>_<onto>.yaml`` is written.  It contains an
    ``assess`` mapping (prediction file, obo file, benchmark folder, results
    folder) and an empty ``plot`` mapping, serialised with ``yaml.dump``.

    :param models: model names to generate configs for; a single string is
        treated as one model.  The default matches the previously hard-coded
        tuple; other names used in this notebook are "prott5", "seqvec",
        "esm2", "protbert" and "proteinbert".
    :param config_dir: directory to write into.  It is created when missing,
        so the function no longer fails with ``FileNotFoundError`` on a
        fresh checkout.
    """
    if isinstance(models, str):
        # Guard against iterating over the characters of a single name.
        models = (models,)
    os.makedirs(config_dir, exist_ok=True)
    for model in models:
        for onto in ("BPO", "MFO", "CCO"):
            d = {
                "file": f"predictions/{model}-Tch_1_all_go_{onto}.txt",
                "obo": "./precrec/go_cafa3.obo",
                "benchmark": "./precrec/benchmark/CAFA3_benchmarks/",
                "results": "./results"
            }
            conf = yaml.dump({"assess": d, "plot": {}})
            with open(os.path.join(config_dir, f"{model}_{onto}.yaml"), 'w') as opened:
                opened.write(conf)

In [86]:
# Writes assess_configs/theirprotbert_{BPO,MFO,CCO}.yaml (default model only).
produce_assess()

In [8]:
#this cell has been created with chatgpt
import os
import pandas as pd

def parse_result_file(file_path):
    """Extract the full-mode fmax / threshold values from one assessment file.

    The model name and the ontology are derived from ``file_path`` itself:
    the model is the second ``_``-separated piece of the *whole path*
    (directory included) and the ontology is the last ``_``-separated piece
    up to its first ``.``, upper-cased.

    Only the sections of the file whose ``ontology:`` line matches that
    ontology are read.  Inside them, ``mode:`` and ``benchmark type:`` lines
    update the current state, and ``fmax:`` / ``threshold giving fmax:`` lines
    are recorded while the current mode is ``full``.

    :param file_path: path of the result file.
    :return: dict mapping ``(benchmark_type, ontology, model)`` to a dict with
        the keys ``'fmax'`` and/or ``'threshold'``.
    """
    path_pieces = file_path.split("_")
    model = path_pieces[1]
    ontology = path_pieces[-1].split(".")[0].upper()

    with open(file_path, 'r') as handle:
        stripped_lines = [raw.strip() for raw in handle]

    def value_of(text):
        # Text between the first and second ':' of the line, trimmed.
        return text.split(":")[1].strip()

    parsed = {}
    in_target_section = False
    mode = None
    benchmark_type = None

    for text in stripped_lines:
        if text.startswith("ontology:"):
            in_target_section = value_of(text).upper() == ontology
            continue
        if not in_target_section:
            continue

        if text.startswith("mode:"):
            mode = value_of(text)
        elif text.startswith("benchmark type:"):
            benchmark_type = value_of(text)
        elif mode == "full" and text.startswith("fmax:"):
            entry = parsed.setdefault((benchmark_type, ontology, model), {})
            entry['fmax'] = float(value_of(text))
        elif mode == "full" and text.startswith("threshold giving fmax:"):
            entry = parsed.setdefault((benchmark_type, ontology, model), {})
            entry['threshold'] = float(value_of(text))

    return parsed


# Iterate over all files in the output directory
output_dir = "assessment_output-euclidean"
fmax_results = []
threshold_results = []

for file_name in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file_name)

    # os.listdir also returns sub-directories (e.g. checkpoint folders);
    # only regular files can be parsed.
    if not os.path.isfile(file_path):
        continue

    # Parse the result file.  The model label it returns is cut out of the
    # whole path, so it includes part of the directory name
    # (e.g. "output-euclidean/esm2").
    result_data = parse_result_file(file_path)

    # Keep each metric only for the keys that actually carry it.  The fmax
    # filter used to test for 'threshold', which raised KeyError whenever a
    # threshold had been recorded without an fmax.
    fmax_results.extend(
        (key, data['fmax']) for key, data in result_data.items() if 'fmax' in data
    )
    threshold_results.extend(
        (key, data['threshold']) for key, data in result_data.items() if 'threshold' in data
    )

# Create a DataFrame for fmax values; the tuple key is expanded into columns
fmax_df = pd.DataFrame(fmax_results, columns=['benchmark_type_ontology_model', 'results'])
fmax_df[['benchmark_type', 'ontology', 'model']] = pd.DataFrame(fmax_df['benchmark_type_ontology_model'].tolist(), index=fmax_df.index)
fmax_df = fmax_df.drop('benchmark_type_ontology_model', axis=1)

# Pivot the fmax DataFrame: one row per model, one column per (benchmark, ontology)
fmax_pivot = fmax_df.pivot(index='model', columns=['benchmark_type', 'ontology'], values='results')

# Create a DataFrame for thresholds
threshold_df = pd.DataFrame(threshold_results, columns=['benchmark_type_ontology_model', 'threshold'])
threshold_df[['benchmark_type', 'ontology', 'model']] = pd.DataFrame(threshold_df['benchmark_type_ontology_model'].tolist(), index=threshold_df.index)
threshold_df = threshold_df.drop('benchmark_type_ontology_model', axis=1)

# Pivot the threshold DataFrame
threshold_pivot = threshold_df.pivot(index='model', columns=['benchmark_type', 'ontology'], values='threshold')

# Print the fmax DataFrame
print("Fmax DataFrame:")
print(fmax_pivot)

# Print the threshold DataFrame
print("\nThreshold DataFrame:")
print(threshold_pivot)

# Keep the euclidean table under its own name: the cosine cell reuses fmax_pivot.
euclidean_df = fmax_pivot

Fmax DataFrame:
benchmark_type                        NK        LK        NK        LK  \
ontology                             BPO       BPO       CCO       CCO   
model                                                                    
output-euclidean/esm2           0.320937  0.340213  0.575237  0.570138   
output-euclidean/protbert       0.260052  0.315406  0.538194  0.537135   
output-euclidean/proteinbert    0.311029  0.362275  0.571419  0.543442   
output-euclidean/prott5         0.324303  0.331968  0.588554  0.566115   
output-euclidean/seqvec         0.305571  0.292032  0.561114  0.540299   
output-euclidean/theirprotbert  0.297858  0.330947  0.567013  0.538607   

benchmark_type                        NK        LK  
ontology                             MFO       MFO  
model                                               
output-euclidean/esm2           0.504384  0.441647  
output-euclidean/protbert       0.398696  0.348051  
output-euclidean/proteinbert    0.511744  0.451957  

In [9]:
#this cell has been created with chatgpt
import os
import pandas as pd

def parse_result_file(file_path):
    result_data = {}
    ontology_matched = False
    model = file_path.split("_")[1]
    ontology = file_path.split("_")[-1].split(".")[0].upper()

    with open(file_path, 'r') as file:
        lines = file.readlines()

    mode = None
    benchmark_type = None

    for line in lines:
        line = line.strip()
        if line.startswith("ontology:"):
            current_ontology = line.split(":")[1].strip().upper()
            if current_ontology == ontology:
                ontology_matched = True
            else:
                ontology_matched = False
        elif ontology_matched:
            if line.startswith("mode:"):
                mode = line.split(":")[1].strip()
            elif line.startswith("benchmark type:"):
                benchmark_type = line.split(":")[1].strip()
            elif line.startswith("fmax:") and mode == "full":
                fmax = float(line.split(":")[1].strip())
                key = (benchmark_type, ontology, model)
                result_data.setdefault(key, {})['fmax'] = fmax
            elif line.startswith("threshold giving fmax:") and mode == "full":
                threshold = float(line.split(":")[1].strip())
                key = (benchmark_type, ontology, model)
                result_data.setdefault(key, {})['threshold'] = threshold

    return result_data


# Iterate over all files in the output directory
output_dir = "assessment_output-cosine"
fmax_results = []
threshold_results = []

for file_name in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file_name)

    # os.listdir also returns sub-directories (e.g. checkpoint folders);
    # only regular files can be parsed.
    if not os.path.isfile(file_path):
        continue

    # Parse the result file.  The model label it returns is cut out of the
    # whole path, so it includes part of the directory name
    # (e.g. "output-cosine/esm2").
    result_data = parse_result_file(file_path)

    # Keep each metric only for the keys that actually carry it.  The fmax
    # filter used to test for 'threshold', which raised KeyError whenever a
    # threshold had been recorded without an fmax.
    fmax_results.extend(
        (key, data['fmax']) for key, data in result_data.items() if 'fmax' in data
    )
    threshold_results.extend(
        (key, data['threshold']) for key, data in result_data.items() if 'threshold' in data
    )

# Create a DataFrame for fmax values; the tuple key is expanded into columns
fmax_df = pd.DataFrame(fmax_results, columns=['benchmark_type_ontology_model', 'results'])
fmax_df[['benchmark_type', 'ontology', 'model']] = pd.DataFrame(fmax_df['benchmark_type_ontology_model'].tolist(), index=fmax_df.index)
fmax_df = fmax_df.drop('benchmark_type_ontology_model', axis=1)

# Pivot the fmax DataFrame: one row per model, one column per (benchmark, ontology)
fmax_pivot = fmax_df.pivot(index='model', columns=['benchmark_type', 'ontology'], values='results')

# Create a DataFrame for thresholds
threshold_df = pd.DataFrame(threshold_results, columns=['benchmark_type_ontology_model', 'threshold'])
threshold_df[['benchmark_type', 'ontology', 'model']] = pd.DataFrame(threshold_df['benchmark_type_ontology_model'].tolist(), index=threshold_df.index)
threshold_df = threshold_df.drop('benchmark_type_ontology_model', axis=1)

# Pivot the threshold DataFrame
threshold_pivot = threshold_df.pivot(index='model', columns=['benchmark_type', 'ontology'], values='threshold')

# Print the fmax DataFrame
print("Fmax DataFrame:")
print(fmax_pivot)

# Print the threshold DataFrame
print("\nThreshold DataFrame:")
print(threshold_pivot)

# Keep the cosine table under its own name for the comparison cells below.
cosine_df = fmax_pivot

Fmax DataFrame:
benchmark_type                     NK        LK        NK        LK        NK  \
ontology                          BPO       BPO       CCO       CCO       MFO   
model                                                                           
output-cosine/esm2           0.319165  0.343183  0.583947  0.563275  0.510816   
output-cosine/protbert       0.264061  0.314596  0.537240  0.534392  0.397974   
output-cosine/proteinbert    0.308999  0.367301  0.572998  0.536269  0.522469   
output-cosine/prott5         0.321928  0.332805  0.586196  0.562986  0.541229   
output-cosine/seqvec         0.311946  0.298345  0.558143  0.532568  0.521211   
output-cosine/theirprotbert  0.297631  0.329151  0.561851  0.545227  0.470573   

benchmark_type                     LK  
ontology                          MFO  
model                                  
output-cosine/esm2           0.454607  
output-cosine/protbert       0.350704  
output-cosine/proteinbert    0.444916  
output-cosine/

In [10]:
euclidean_df

benchmark_type,NK,LK,NK,LK,NK,LK
ontology,BPO,BPO,CCO,CCO,MFO,MFO
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
output-euclidean/esm2,0.320937,0.340213,0.575237,0.570138,0.504384,0.441647
output-euclidean/protbert,0.260052,0.315406,0.538194,0.537135,0.398696,0.348051
output-euclidean/proteinbert,0.311029,0.362275,0.571419,0.543442,0.511744,0.451957
output-euclidean/prott5,0.324303,0.331968,0.588554,0.566115,0.531228,0.462393
output-euclidean/seqvec,0.305571,0.292032,0.561114,0.540299,0.496337,0.421948
output-euclidean/theirprotbert,0.297858,0.330947,0.567013,0.538607,0.472835,0.431698


In [11]:
cosine_df

benchmark_type,NK,LK,NK,LK,NK,LK
ontology,BPO,BPO,CCO,CCO,MFO,MFO
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
output-cosine/esm2,0.319165,0.343183,0.583947,0.563275,0.510816,0.454607
output-cosine/protbert,0.264061,0.314596,0.53724,0.534392,0.397974,0.350704
output-cosine/proteinbert,0.308999,0.367301,0.572998,0.536269,0.522469,0.444916
output-cosine/prott5,0.321928,0.332805,0.586196,0.562986,0.541229,0.457018
output-cosine/seqvec,0.311946,0.298345,0.558143,0.532568,0.521211,0.428593
output-cosine/theirprotbert,0.297631,0.329151,0.561851,0.545227,0.470573,0.418304


In [45]:
# Stack the two fmax tables row-wise; the row labels still carry the
# "output-euclidean/" and "output-cosine/" prefixes that tell them apart.
combined = pd.concat([euclidean_df, cosine_df])

In [46]:
combined

benchmark_type,NK,LK,NK,LK,NK,LK
ontology,BPO,BPO,CCO,CCO,MFO,MFO
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
output-euclidean/esm2,0.320937,0.340213,0.575237,0.570138,0.504384,0.441647
output-euclidean/protbert,0.260052,0.315406,0.538194,0.537135,0.398696,0.348051
output-euclidean/proteinbert,0.311029,0.362275,0.571419,0.543442,0.511744,0.451957
output-euclidean/prott5,0.324303,0.331968,0.588554,0.566115,0.531228,0.462393
output-euclidean/seqvec,0.305571,0.292032,0.561114,0.540299,0.496337,0.421948
output-euclidean/theirprotbert,0.297858,0.330947,0.567013,0.538607,0.472835,0.431698
output-cosine/esm2,0.319165,0.343183,0.583947,0.563275,0.510816,0.454607
output-cosine/protbert,0.264061,0.314596,0.53724,0.534392,0.397974,0.350704
output-cosine/proteinbert,0.308999,0.367301,0.572998,0.536269,0.522469,0.444916
output-cosine/prott5,0.321928,0.332805,0.586196,0.562986,0.541229,0.457018


In [47]:
combined.columns

MultiIndex([('NK', 'BPO'),
            ('LK', 'BPO'),
            ('NK', 'CCO'),
            ('LK', 'CCO'),
            ('NK', 'MFO'),
            ('LK', 'MFO')],
           names=['benchmark_type', 'ontology'])

In [48]:
# Strip the leading "output-" from every row label.  Labels that no longer
# start with it are mapped to themselves, so re-running this cell is harmless:
# because `combined` is reassigned below, slicing unconditionally would chop
# seven characters off labels that were already renamed.
name_mapping = {
    name: name[len("output-"):] if name.startswith("output-") else name
    for name in combined.index
}
# The two ProtBert variants get explicit display names.
for d in "euclidean", "cosine":
    name_mapping[f"output-{d}/protbert"] = f"{d}/protbert-ur100"
    name_mapping[f"output-{d}/theirprotbert"] = f"{d}/protbert-bfd"
combined = combined.rename(index=name_mapping)

In [49]:
combined

benchmark_type,NK,LK,NK,LK,NK,LK
ontology,BPO,BPO,CCO,CCO,MFO,MFO
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
euclidean/esm2,0.320937,0.340213,0.575237,0.570138,0.504384,0.441647
euclidean/protbert-ur100,0.260052,0.315406,0.538194,0.537135,0.398696,0.348051
euclidean/proteinbert,0.311029,0.362275,0.571419,0.543442,0.511744,0.451957
euclidean/prott5,0.324303,0.331968,0.588554,0.566115,0.531228,0.462393
euclidean/seqvec,0.305571,0.292032,0.561114,0.540299,0.496337,0.421948
euclidean/protbert-bfd,0.297858,0.330947,0.567013,0.538607,0.472835,0.431698
cosine/esm2,0.319165,0.343183,0.583947,0.563275,0.510816,0.454607
cosine/protbert-ur100,0.264061,0.314596,0.53724,0.534392,0.397974,0.350704
cosine/proteinbert,0.308999,0.367301,0.572998,0.536269,0.522469,0.444916
cosine/prott5,0.321928,0.332805,0.586196,0.562986,0.541229,0.457018


In [50]:
# LK columns only; the benchmark_type level is dropped so that the remaining
# column labels are just the ontologies.
is_lk_column = combined.columns.get_level_values(0) == 'LK'
combined_lk = combined.loc[:, is_lk_column].droplevel(0, axis=1)

In [51]:
combined_lk

ontology,BPO,CCO,MFO
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
euclidean/esm2,0.340213,0.570138,0.441647
euclidean/protbert-ur100,0.315406,0.537135,0.348051
euclidean/proteinbert,0.362275,0.543442,0.451957
euclidean/prott5,0.331968,0.566115,0.462393
euclidean/seqvec,0.292032,0.540299,0.421948
euclidean/protbert-bfd,0.330947,0.538607,0.431698
cosine/esm2,0.343183,0.563275,0.454607
cosine/protbert-ur100,0.314596,0.534392,0.350704
cosine/proteinbert,0.367301,0.536269,0.444916
cosine/prott5,0.332805,0.562986,0.457018


In [52]:
# NK columns only; the benchmark_type level is dropped so that the remaining
# column labels are just the ontologies.
is_nk_column = combined.columns.get_level_values(0) == 'NK'
combined_nk = combined.loc[:, is_nk_column].droplevel(0, axis=1)

In [53]:
combined_nk

ontology,BPO,CCO,MFO
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
euclidean/esm2,0.320937,0.575237,0.504384
euclidean/protbert-ur100,0.260052,0.538194,0.398696
euclidean/proteinbert,0.311029,0.571419,0.511744
euclidean/prott5,0.324303,0.588554,0.531228
euclidean/seqvec,0.305571,0.561114,0.496337
euclidean/protbert-bfd,0.297858,0.567013,0.472835
cosine/esm2,0.319165,0.583947,0.510816
cosine/protbert-ur100,0.264061,0.53724,0.397974
cosine/proteinbert,0.308999,0.572998,0.522469
cosine/prott5,0.321928,0.586196,0.541229


In [55]:
print(combined_lk.to_latex())

\begin{tabular}{lrrr}
\toprule
ontology &       BPO &       CCO &       MFO \\
model                    &           &           &           \\
\midrule
euclidean/esm2           &  0.340213 &  0.570138 &  0.441647 \\
euclidean/protbert-ur100 &  0.315406 &  0.537135 &  0.348051 \\
euclidean/proteinbert    &  0.362275 &  0.543442 &  0.451957 \\
euclidean/prott5         &  0.331968 &  0.566115 &  0.462393 \\
euclidean/seqvec         &  0.292032 &  0.540299 &  0.421948 \\
euclidean/protbert-bfd   &  0.330947 &  0.538607 &  0.431698 \\
cosine/esm2              &  0.343183 &  0.563275 &  0.454607 \\
cosine/protbert-ur100    &  0.314596 &  0.534392 &  0.350704 \\
cosine/proteinbert       &  0.367301 &  0.536269 &  0.444916 \\
cosine/prott5            &  0.332805 &  0.562986 &  0.457018 \\
cosine/seqvec            &  0.298345 &  0.532568 &  0.428593 \\
cosine/protbert-bfd      &  0.329151 &  0.545227 &  0.418304 \\
\bottomrule
\end{tabular}



  print(combined_lk.to_latex())


In [56]:
print(combined_nk.to_latex())

\begin{tabular}{lrrr}
\toprule
ontology &       BPO &       CCO &       MFO \\
model                    &           &           &           \\
\midrule
euclidean/esm2           &  0.320937 &  0.575237 &  0.504384 \\
euclidean/protbert-ur100 &  0.260052 &  0.538194 &  0.398696 \\
euclidean/proteinbert    &  0.311029 &  0.571419 &  0.511744 \\
euclidean/prott5         &  0.324303 &  0.588554 &  0.531228 \\
euclidean/seqvec         &  0.305571 &  0.561114 &  0.496337 \\
euclidean/protbert-bfd   &  0.297858 &  0.567013 &  0.472835 \\
cosine/esm2              &  0.319165 &  0.583947 &  0.510816 \\
cosine/protbert-ur100    &  0.264061 &  0.537240 &  0.397974 \\
cosine/proteinbert       &  0.308999 &  0.572998 &  0.522469 \\
cosine/prott5            &  0.321928 &  0.586196 &  0.541229 \\
cosine/seqvec            &  0.311946 &  0.558143 &  0.521211 \\
cosine/protbert-bfd      &  0.297631 &  0.561851 &  0.470573 \\
\bottomrule
\end{tabular}



  print(combined_nk.to_latex())


In [59]:
repr(combined_lk.to_dict())

"{'BPO': {'euclidean/esm2': 0.3402128490521757, 'euclidean/protbert-ur100': 0.31540560610418666, 'euclidean/proteinbert': 0.36227507914950674, 'euclidean/prott5': 0.33196811496519074, 'euclidean/seqvec': 0.2920315408988826, 'euclidean/protbert-bfd': 0.33094730264537603, 'cosine/esm2': 0.3431828174196418, 'cosine/protbert-ur100': 0.3145956336279498, 'cosine/proteinbert': 0.36730118795227124, 'cosine/prott5': 0.3328045729998566, 'cosine/seqvec': 0.2983453828330413, 'cosine/protbert-bfd': 0.3291509340108948}, 'CCO': {'euclidean/esm2': 0.5701380876505326, 'euclidean/protbert-ur100': 0.5371354196185177, 'euclidean/proteinbert': 0.5434421350680555, 'euclidean/prott5': 0.5661153459122396, 'euclidean/seqvec': 0.540298933471445, 'euclidean/protbert-bfd': 0.5386074093409604, 'cosine/esm2': 0.56327549723111, 'cosine/protbert-ur100': 0.5343917945928709, 'cosine/proteinbert': 0.5362689622488322, 'cosine/prott5': 0.5629856555171122, 'cosine/seqvec': 0.5325684029490236, 'cosine/protbert-bfd': 0.54522

In [60]:
repr(combined_nk.to_dict())

"{'BPO': {'euclidean/esm2': 0.320937030946012, 'euclidean/protbert-ur100': 0.26005151322540027, 'euclidean/proteinbert': 0.3110287219554749, 'euclidean/prott5': 0.3243030066201612, 'euclidean/seqvec': 0.3055714646633119, 'euclidean/protbert-bfd': 0.29785826432243917, 'cosine/esm2': 0.3191653498509426, 'cosine/protbert-ur100': 0.26406124274401904, 'cosine/proteinbert': 0.3089987882973226, 'cosine/prott5': 0.3219275394528631, 'cosine/seqvec': 0.31194621384746163, 'cosine/protbert-bfd': 0.2976306478121041}, 'CCO': {'euclidean/esm2': 0.5752366600780419, 'euclidean/protbert-ur100': 0.5381941944811517, 'euclidean/proteinbert': 0.5714186618318124, 'euclidean/prott5': 0.5885544854825134, 'euclidean/seqvec': 0.5611140121392842, 'euclidean/protbert-bfd': 0.5670125831368363, 'cosine/esm2': 0.5839474233111744, 'cosine/protbert-ur100': 0.5372396511878191, 'cosine/proteinbert': 0.5729976347154231, 'cosine/prott5': 0.5861962463960505, 'cosine/seqvec': 0.5581434988264784, 'cosine/protbert-bfd': 0.5618