# BLAST based check for the MF model - biological validation

Comparing the predictions obtained and the blast annotation. 

### 1. Imports and libraries

In [1]:
import pandas as pd

In [2]:
# Loading the BLAST annotation, train set

blast = pd.read_csv('data/test/blast_test_results.tsv', sep = '\t')

train_set = pd.read_csv('data/train/train_set.tsv', sep = "\t")

train_mf = train_set[train_set['aspect'] == 'molecular_function']



In [3]:
# lookup table
# train protein ID to list of known MF GO terms
mf_map = (
    train_mf
    .groupby("Protein_ID")["GO_term"]
    .apply(list)
    .to_dict()
)

blast_mf_preds = []
TOP_N = 5 # number of top blast hits to consider

for pid, hits in blast.groupby("query"):                         # loop over test proteins and their blast hits
    hits = hits.sort_values("bits", ascending=False).head(TOP_N) # sort by blast bit score
    max_bits = hits["bits"].max()                                # normalize according to best hit for the query

    for _, row in hits.iterrows():                               # for each hit take train protein match
        for go in mf_map.get(row["target"], []):                 # retrieve all its MF GO terms
            blast_mf_preds.append(                               # asign MF term to test protein with a normalized confidence score
                (pid, go, row["bits"] / max_bits)
            )

In [4]:
# convert MF preds into a structured df
blast_mf_df = pd.DataFrame(
    blast_mf_preds,
    columns=["Protein_ID", "GO_term", "score"]
)

# keep highest confidence score if MF term is infered multiple times for the same protein
blast_mf_df = blast_mf_df.groupby(
    ["Protein_ID", "GO_term"], as_index=False
)["score"].max()

In [36]:
# loading ML based predictions
mf_predictions = pd.read_csv(
    "metadata/MF/mf_predictions.csv"
)

# combine ml and blast preds
final_mf = pd.concat([mf_predictions, blast_mf_df])

# if same term appears from both, keep the highest confidence term
final_mf = final_mf.groupby(
    ["Protein_ID", "GO_term"], as_index=False
)["score"].max()

In [37]:
# remove zeroes
final_mf = final_mf[final_mf["score"] > 0]

# term sort, desc by conf, per protein, keep top 500 (max terms is way lower)
final_mf = (
    final_mf
    .sort_values(["Protein_ID", "score"], ascending=[True, False])
    .groupby("Protein_ID")
    .head(500)  # MF-only cap
)

In [38]:
# global summary statistics

print("Total MF predictions:", len(final_mf))
final_mf.groupby("Protein_ID").size().describe()

Total MF predictions: 19541


count    1000.000000
mean       19.541000
std        10.031398
min         4.000000
25%        12.000000
50%        17.000000
75%        25.000000
max        65.000000
dtype: float64

In [39]:
# inspecting a single protein part 1
final_mf[final_mf["Protein_ID"] == final_mf["Protein_ID"].iloc[0]].head(10)

Unnamed: 0,Protein_ID,GO_term,score
0,A0A0B4JCV4,GO:0003674,1.0
1,A0A0B4JCV4,GO:0005488,1.0
2,A0A0B4JCV4,GO:0005515,1.0
3,A0A0B4JCV4,GO:0008017,1.0
4,A0A0B4JCV4,GO:0008092,1.0
5,A0A0B4JCV4,GO:0015631,1.0


In [40]:
# convert GO identifiers into biological terms
GO_OBO_PATH = "data/train/go-basic.obo"

go_names = {}

with open(GO_OBO_PATH, "r") as f:
    current_id = None
    for line in f:
        line = line.strip()
        if line.startswith("id: GO:"):
            current_id = line.split("id: ")[1]
        elif line.startswith("name:") and current_id:
            go_names[current_id] = line.split("name: ")[1]
            current_id = None

print("Loaded GO terms:", len(go_names))

Loaded GO terms: 47637


In [41]:
#check
go_names.get("GO:0005634")  # should be "nucleus"

'nucleus'

In [42]:
# inspecting single protein predictions part 2
pid = final_mf["Protein_ID"].iloc[0]

inspect = (
    final_mf[final_mf["Protein_ID"] == pid]
    .sort_values("score", ascending=False)
    .head(10)
)

inspect["GO_name"] = inspect["GO_term"].map(go_names)
inspect
# they should be closely related terms

Unnamed: 0,Protein_ID,GO_term,score,GO_name
0,A0A0B4JCV4,GO:0003674,1.0,molecular_function
1,A0A0B4JCV4,GO:0005488,1.0,binding
2,A0A0B4JCV4,GO:0005515,1.0,protein binding
3,A0A0B4JCV4,GO:0008017,1.0,microtubule binding
4,A0A0B4JCV4,GO:0008092,1.0,cytoskeletal protein binding
5,A0A0B4JCV4,GO:0015631,1.0,tubulin binding


In [43]:
# global GO name distribution, root terms should have all test proteins (1000)
# shpuld decrease steadily at each few levels
final_mf["GO_name"] = final_mf["GO_term"].map(go_names)

final_mf["GO_name"].value_counts().head(15)

molecular_function                         1000
binding                                     970
protein binding                             954
catalytic activity                          512
organic cyclic compound binding             464
nucleic acid binding                        353
identical protein binding                   353
enzyme binding                              267
transferase activity                        256
ion binding                                 244
protein-containing complex binding          223
catalytic activity, acting on a protein     217
hydrolase activity                          207
DNA binding                                 205
RNA binding                                 188
Name: GO_name, dtype: int64

In [44]:
# inspecting single protein term distribution =  they should all be 1, no duplicates
inspect["GO_name"].apply(lambda x: x.lower()).value_counts()

microtubule binding             1
molecular_function              1
cytoskeletal protein binding    1
tubulin binding                 1
binding                         1
protein binding                 1
Name: GO_name, dtype: int64

In [45]:
# MF terms predicted by blast but not the ML model
# for n number of proteins, blast assigned the x term, but the Ml model didn't
# they should be pretty high level terms

blast_only = blast_mf_df.merge(
    mf_predictions,
    on=["Protein_ID", "GO_term"],
    how="left",
    indicator=True
)

blast_only = blast_only[blast_only["_merge"] == "left_only"]

blast_only["GO_name"] = blast_only["GO_term"].map(go_names)

blast_only["GO_name"].value_counts().head(10)

identical protein binding             125
protein-containing complex binding    119
ion binding                            97
protein dimerization activity          95
enzyme binding                         93
organic cyclic compound binding        77
cation binding                         73
protein homodimerization activity      73
kinase binding                         64
protein domain specific binding        59
Name: GO_name, dtype: int64

In [46]:
# global comparison
ml_terms = set(mf_predictions["GO_term"])
blast_terms = set(blast_mf_df["GO_term"])

shared_terms = ml_terms & blast_terms
ml_only_terms = ml_terms - blast_terms
blast_only_terms = blast_terms - ml_terms

print("ML terms total:", len(ml_terms))
print("BLAST terms total:", len(blast_terms))
print("Shared terms:", len(shared_terms))
print("ML-only terms:", len(ml_only_terms))
print("BLAST-only terms:", len(blast_only_terms))

ML terms total: 692
BLAST terms total: 769
Shared terms: 664
ML-only terms: 28
BLAST-only terms: 105


In [47]:
# checking the ML only term

ml_only_terms = ml_terms - blast_terms
len(ml_only_terms), ml_only_terms

(28,
 {'GO:0000295',
  'GO:0003688',
  'GO:0004396',
  'GO:0004549',
  'GO:0004568',
  'GO:0005184',
  'GO:0005346',
  'GO:0005381',
  'GO:0005385',
  'GO:0005484',
  'GO:0008138',
  'GO:0008376',
  'GO:0008527',
  'GO:0015215',
  'GO:0015216',
  'GO:0015252',
  'GO:0015459',
  'GO:0015645',
  'GO:0015929',
  'GO:0015932',
  'GO:0016273',
  'GO:0016274',
  'GO:0016812',
  'GO:0018455',
  'GO:0052745',
  'GO:0071855',
  'GO:0120014',
  'GO:0160041'})

In [50]:
# mapping to names
# expecting specific subcelular structures that BLAST does not transfer and vice-versa
ml_only_df = (
    pd.DataFrame({"GO_term": list(ml_only_terms)})
)
ml_only_df["GO_name"] = ml_only_df["GO_term"].map(go_names)
ml_only_df

Unnamed: 0,GO_term,GO_name
0,GO:0005484,SNAP receptor activity
1,GO:0015932,nucleobase-containing compound transmembrane t...
2,GO:0015459,potassium channel regulator activity
3,GO:0008138,protein tyrosine/serine/threonine phosphatase ...
4,GO:0015929,hexosaminidase activity
5,GO:0015252,proton channel activity
6,GO:0008376,acetylgalactosaminyltransferase activity
7,GO:0015645,fatty acid ligase activity
8,GO:0160041,neuropeptide activity
9,GO:0015215,nucleotide transmembrane transporter activity


In [51]:
# checking the blast only terms

blast_only_df = (
    pd.DataFrame({"GO_term": list(blast_only_terms)})
)
blast_only_df["GO_name"] = blast_only_df["GO_term"].map(go_names)
blast_only_df.sort_values("GO_name")

# found some protein complexes which ML will miss unless all component proteins are present in the training

Unnamed: 0,GO_term,GO_name
28,GO:0071889,14-3-3 protein binding
29,GO:0051117,ATPase binding
3,GO:0016289,CoA hydrolase activity
18,GO:0000217,DNA secondary structure binding
43,GO:0010181,FMN binding
...,...,...
98,GO:0001221,transcription coregulator binding
83,GO:0001222,transcription corepressor binding
6,GO:0005160,transforming growth factor beta receptor binding
73,GO:0032813,tumor necrosis factor receptor superfamily bin...


In [52]:
# terms that only Blast assigned, Blast specific terms and in how many proteins it appears
# extremely rare, specific and specialized terms expected, ML can't pick them up
# many low numbers, most terms should have 1 protein (toggle head to check)

blast_only_counts = (
    blast_mf_df[blast_mf_df["GO_term"].isin(blast_only_terms)]
    .groupby("GO_term")["Protein_ID"]
    .nunique()
    .reset_index(name="protein_count")
    .merge(blast_only_df, on="GO_term")
    .sort_values("protein_count", ascending=False)
)

blast_only_counts.head(10)

Unnamed: 0,GO_term,protein_count,GO_name
7,GO:0001221,14,transcription coregulator binding
89,GO:0051117,14,ATPase binding
71,GO:0042826,13,histone deacetylase binding
104,GO:1990841,12,promoter-specific chromatin binding
67,GO:0035254,12,glutamate receptor binding
21,GO:0005080,11,protein kinase C binding
11,GO:0002020,11,protease binding
78,GO:0043621,10,protein self-association
8,GO:0001222,9,transcription corepressor binding
81,GO:0046332,8,SMAD binding
