# BLAST based check for the CC model - biological validation
This notebook checks CC annotations derived from sequence homology (BLAST) and compares them with the ML model's predictions. BLAST is used neither in the model nor for the final predictions.

In [1]:
import pandas as pd
# BLAST results: test-set proteins searched against the training set (tab-separated file)
blast = pd.read_csv("data/test/blast_test_results.tsv", sep="\t")

In [2]:
# The BLAST score columns are read as strings but are needed as numbers.
# errors="coerce" turns any value that cannot be parsed into NaN instead of raising.
blast = blast.assign(
    **{
        name: pd.to_numeric(blast[name], errors="coerce")
        for name in ("bits", "evalue", "score")
    }
)

# check: the three columns above should now report a numeric dtype
print(blast.dtypes)

query      object
target     object
bits      float64
evalue    float64
score       int64
dtype: object


In [3]:
# Training annotations, restricted to the cellular_component aspect
train_cc = (
    pd.read_csv("data/train/train_set.tsv", sep="\t")
    .loc[lambda annotations: annotations["aspect"] == "cellular_component"]
)

In [4]:
# Lookup table: training protein ID -> list of its known CC GO terms
cc_map = (
    train_cc
    .groupby("Protein_ID")["GO_term"]
    .apply(list)
    .to_dict()
)

TOP_N = 5  # number of top BLAST hits to consider per test protein

# Homology-based annotation transfer: each test protein inherits the CC terms of its
# best training-set hits. Collected as (Protein_ID, GO_term, confidence) tuples.
blast_cc_preds = []

for pid, hits in blast.groupby("query"):  # one group of hits per test protein
    # Hits whose bit score could not be parsed were coerced to NaN earlier; they carry
    # no usable evidence and would only produce NaN confidences, so drop them first.
    hits = (
        hits.dropna(subset=["bits"])
        .sort_values("bits", ascending=False)  # strongest hits first
        .head(TOP_N)
    )
    max_bits = hits["bits"].max()  # best hit of this query, used as the normalizer

    # `not (x > 0)` is also True for NaN (no valid hits left), so this single guard
    # avoids both an empty group and a division by zero.
    if not max_bits > 0:
        continue

    # Plain column iteration replaces DataFrame.iterrows(), which builds a Series per row.
    for target, bits in zip(hits["target"], hits["bits"]):
        confidence = bits / max_bits  # the best hit always scores 1.0
        # Training proteins without CC annotations contribute nothing.
        blast_cc_preds.extend(
            (pid, go, confidence) for go in cc_map.get(target, ())
        )

In [5]:
# Turn the collected (protein, term, confidence) tuples into a table and collapse
# repeated (protein, term) pairs: when several hits transfer the same CC term,
# only the highest confidence is kept.
blast_cc_df = (
    pd.DataFrame(blast_cc_preds, columns=["Protein_ID", "GO_term", "score"])
    .groupby(["Protein_ID", "GO_term"], as_index=False)["score"]
    .max()
)

In [21]:
# ML-based CC predictions
cc_predictions = pd.read_csv("metadata/CC/cc_predictions.csv")

# Stack the ML and BLAST predictions; when both sources propose the same term
# for a protein, keep the higher of the two confidence scores.
final_cc = (
    pd.concat([cc_predictions, blast_cc_df])
    .groupby(["Protein_ID", "GO_term"], as_index=False)["score"]
    .max()
)

In [22]:
# 1. drop predictions whose score is not strictly positive
# 2. order each protein's terms by descending confidence
# 3. keep at most 500 terms per protein (CC-only cap; the real maximum is far lower)
final_cc = (
    final_cc.loc[final_cc["score"].gt(0)]
    .sort_values(by=["Protein_ID", "score"], ascending=[True, False])
    .groupby("Protein_ID")
    .head(500)
)

In [23]:
# Global summary: total number of predictions and the distribution of
# predicted-term counts per protein
print("Total CC predictions:", len(final_cc))

terms_per_protein = final_cc.groupby("Protein_ID").size()
terms_per_protein.describe()

Total CC predictions: 36305


count    1000.000000
mean       36.305000
std        17.163991
min         5.000000
25%        24.000000
50%        33.000000
75%        45.000000
max       110.000000
dtype: float64

In [24]:
# Inspecting a single protein, part 1: first ten rows of the first protein in final_cc
first_protein = final_cc["Protein_ID"].iloc[0]
final_cc.loc[final_cc["Protein_ID"] == first_protein].head(10)

Unnamed: 0,Protein_ID,GO_term,score
0,A0A0B4JCV4,GO:0000235,1.0
5,A0A0B4JCV4,GO:0000922,1.0
6,A0A0B4JCV4,GO:0005575,1.0
7,A0A0B4JCV4,GO:0005622,1.0
10,A0A0B4JCV4,GO:0005737,1.0
11,A0A0B4JCV4,GO:0005813,1.0
12,A0A0B4JCV4,GO:0005815,1.0
13,A0A0B4JCV4,GO:0005818,1.0
14,A0A0B4JCV4,GO:0005819,1.0
15,A0A0B4JCV4,GO:0005856,1.0


In [25]:
# Build a GO identifier -> human-readable name lookup from the OBO file.
GO_OBO_PATH = "data/train/go-basic.obo"

go_names = {}

# Explicit UTF-8: without it the file is decoded with the platform default encoding,
# which can fail or garble text on systems whose default is not UTF-8.
with open(GO_OBO_PATH, "r", encoding="utf-8") as f:
    current_id = None  # GO id of the entry currently being read, until its name is seen
    for line in f:
        line = line.strip()
        if line.startswith("id: GO:"):
            current_id = line.partition("id: ")[2]
        elif current_id and line.startswith("name:"):
            # partition() keeps everything after the first "name:" tag, so a name that
            # itself contains "name: " is not truncated and a missing space after the
            # colon does not raise.
            go_names[current_id] = line.partition("name:")[2].strip()
            current_id = None  # only the first name after an id belongs to that id

print("Loaded GO terms:", len(go_names))

Loaded GO terms: 47637


In [26]:
# sanity check of the id -> name lookup built from the OBO file
go_names.get("GO:0005634")  # should be "nucleus"; .get returns None if the id is missing

'nucleus'

In [27]:
# Inspecting a single protein, part 2: its ten highest-scoring terms with readable names
pid = final_cc["Protein_ID"].iloc[0]

inspect = (
    final_cc.loc[final_cc["Protein_ID"] == pid]
    .sort_values("score", ascending=False)
    .head(10)
    # add the name column as part of the chain instead of assigning into a slice
    .assign(GO_name=lambda top_terms: top_terms["GO_term"].map(go_names))
)

inspect
# the listed terms should be closely related to each other

Unnamed: 0,Protein_ID,GO_term,score,GO_name
0,A0A0B4JCV4,GO:0000235,1.0,astral microtubule
22,A0A0B4JCV4,GO:0015630,1.0,microtubule cytoskeleton
42,A0A0B4JCV4,GO:0110165,1.0,cellular anatomical entity
41,A0A0B4JCV4,GO:0099513,1.0,polymeric cytoskeletal fiber
40,A0A0B4JCV4,GO:0099512,1.0,supramolecular fiber
39,A0A0B4JCV4,GO:0099081,1.0,supramolecular polymer
38,A0A0B4JCV4,GO:0099080,1.0,supramolecular complex
32,A0A0B4JCV4,GO:0043232,1.0,intracellular non-membrane-bounded organelle
30,A0A0B4JCV4,GO:0043229,1.0,intracellular organelle
29,A0A0B4JCV4,GO:0043228,1.0,non-membrane-bounded organelle


In [28]:
# Global GO name distribution: the root terms should be assigned to all test
# proteins (1000), and the counts should decrease steadily every few levels down.
final_cc = final_cc.assign(GO_name=final_cc["GO_term"].map(go_names))

go_name_counts = final_cc["GO_name"].value_counts()
go_name_counts.head(15)

cellular_component                          1000
cellular anatomical entity                  1000
intracellular anatomical structure           984
cytoplasm                                    976
organelle                                    972
membrane-bounded organelle                   960
intracellular organelle                      958
intracellular membrane-bounded organelle     947
nucleus                                      724
cytosol                                      722
membrane                                     716
protein-containing complex                   694
cell periphery                               633
plasma membrane                              586
intracellular organelle lumen                585
Name: GO_name, dtype: int64

In [29]:
# Term distribution for the inspected protein: every count should be 1 (no duplicates).
# The vectorized .str.lower() leaves missing names (GO ids absent from the lookup) as NaN,
# whereas calling x.lower() on each value raises AttributeError on a NaN.
inspect["GO_name"].str.lower().value_counts()

supramolecular fiber                            1
polymeric cytoskeletal fiber                    1
supramolecular complex                          1
intracellular non-membrane-bounded organelle    1
astral microtubule                              1
non-membrane-bounded organelle                  1
supramolecular polymer                          1
cellular anatomical entity                      1
microtubule cytoskeleton                        1
intracellular organelle                         1
Name: GO_name, dtype: int64

In [30]:
# (protein, term) pairs proposed by BLAST but absent from the ML predictions:
# for each term, the number of proteins where BLAST assigned it and the ML model did not.
# They are expected to be fairly high-level terms.
# NOTE(review): a pair counts as "predicted by ML" whenever it has a row in
# cc_predictions, whatever its score. If that file holds zero-score rows (final_cc is
# filtered to score > 0 elsewhere in this notebook), those pairs are not reported here
# - confirm this is intended.
blast_only = (
    blast_cc_df.merge(
        cc_predictions,
        how="left",
        on=["Protein_ID", "GO_term"],
        indicator=True,  # adds the "_merge" column saying which side each row came from
    )
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .assign(GO_name=lambda rows: rows["GO_term"].map(go_names))
)

blast_only["GO_name"].value_counts().head(10)

extracellular region             67
plasma membrane                  66
cell periphery                   63
mitochondrion                    63
nucleoplasm                      62
membrane-enclosed lumen          61
intracellular organelle lumen    61
organelle lumen                  61
nuclear lumen                    61
membrane                         61
Name: GO_name, dtype: int64

In [31]:
# Global comparison of the distinct GO terms produced by each source
# NOTE(review): every term with a row in cc_predictions counts as an ML term, whatever
# its score - confirm that zero-score rows, if any, should be included.
ml_terms = set(cc_predictions["GO_term"])
blast_terms = set(blast_cc_df["GO_term"])

shared_terms = ml_terms.intersection(blast_terms)
ml_only_terms = ml_terms.difference(blast_terms)
blast_only_terms = blast_terms.difference(ml_terms)

for label, terms in (
    ("ML terms total:", ml_terms),
    ("BLAST terms total:", blast_terms),
    ("Shared terms:", shared_terms),
    ("ML-only terms:", ml_only_terms),
    ("BLAST-only terms:", blast_only_terms),
):
    print(label, len(terms))

ML terms total: 614
BLAST terms total: 650
Shared terms: 599
ML-only terms: 15
BLAST-only terms: 51


In [32]:
# Checking the ML-only terms: how many there are, and which ones

ml_only_terms = ml_terms.difference(blast_terms)
(len(ml_only_terms), ml_only_terms)

(15,
 {'GO:0000421',
  'GO:0005732',
  'GO:0005838',
  'GO:0019028',
  'GO:0022624',
  'GO:0031977',
  'GO:0032010',
  'GO:0032391',
  'GO:0035869',
  'GO:0042025',
  'GO:0042644',
  'GO:0042646',
  'GO:0044423',
  'GO:0045239',
  'GO:0071014'})

In [33]:
# Map the ML-only terms to their names.
# Expectation: specific subcellular structures that BLAST does not transfer (and vice versa).
ml_only_df = pd.DataFrame({"GO_term": list(ml_only_terms)}).assign(
    GO_name=lambda terms: terms["GO_term"].map(go_names)
)
ml_only_df

Unnamed: 0,GO_term,GO_name
0,GO:0031977,thylakoid lumen
1,GO:0005838,proteasome regulatory particle
2,GO:0032010,phagolysosome
3,GO:0019028,viral capsid
4,GO:0035869,ciliary transition zone
5,GO:0071014,post-mRNA release spliceosomal complex
6,GO:0022624,proteasome accessory complex
7,GO:0042025,host cell nucleus
8,GO:0032391,photoreceptor connecting cilium
9,GO:0000421,autophagosome membrane


In [34]:
# Checking the BLAST-only terms, listed alphabetically by name

blast_only_df = pd.DataFrame({"GO_term": list(blast_only_terms)}).assign(
    GO_name=lambda terms: terms["GO_term"].map(go_names)
)
blast_only_df.sort_values(by="GO_name")

# Observation: this list includes some protein complexes, which the ML model will miss
# unless all component proteins are present in the training set.

Unnamed: 0,GO_term,GO_name
22,GO:0031082,BLOC complex
36,GO:0016235,aggresome
12,GO:0020007,apical complex
32,GO:0020011,apicoplast
27,GO:0051285,cell cortex of cell tip
0,GO:0009706,chloroplast inner membrane
2,GO:0020016,ciliary pocket
46,GO:0005801,cis-Golgi network
1,GO:0005581,collagen trimer
30,GO:0055028,cortical microtubule


In [35]:
# BLAST-specific terms and the number of distinct proteins each one is assigned to.
# Expectation: extremely rare, specialized terms that the ML model cannot pick up,
# so counts should be low and most terms should cover a single protein
# (add .head() / .tail() to check).

blast_only_counts = (
    blast_cc_df.loc[blast_cc_df["GO_term"].isin(blast_only_terms)]
    .groupby("GO_term")["Protein_ID"]
    .nunique()
    .rename("protein_count")
    .reset_index()
    .merge(blast_only_df, on="GO_term")  # attach the readable GO names
    .sort_values(by="protein_count", ascending=False)
)

blast_only_counts

Unnamed: 0,GO_term,protein_count,GO_name
22,GO:0016363,9,nuclear matrix
20,GO:0016234,9,inclusion body
34,GO:0043073,9,germ cell nucleus
3,GO:0001673,6,male germ cell nucleus
21,GO:0016235,6,aggresome
44,GO:0097225,5,sperm midpiece
32,GO:0032421,5,stereocilium bundle
50,GO:1904724,4,tertiary granule lumen
39,GO:0051285,4,cell cortex of cell tip
6,GO:0005640,4,nuclear outer membrane
