# Functional Evaluations
Haerang Lee

Find out a way to look into functional agreements.

In [1]:
from google.cloud import storage
import argparse
import gzip
import os
import sys
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from utils import gcs_utils as gcs
from utils import model_and_evaluate_cluster as ev
import urllib.parse
import urllib.request

import io 

import importlib
import hdbscan
import networkx as nx 
import pickle

import obonet

In [29]:
importlib.reload(ev)

<module 'utils.model_and_evaluate_cluster' from '/Users/haeranglee/Documents/pss/utils/model_and_evaluate_cluster.py'>

# Tutorial

In [2]:
# Import possible pairs
prefix='model_outputs/no_cluster_size_limit/'
keys = gcs.list_file_paths(prefix)[1:]

In [15]:
goa = pd.read_csv( io.BytesIO(gcs.download_blob("functional_sim/data/goa_human.gaf.gz")), 
                                compression='gzip', 
                                header=None,
                                skiprows=41,    # hard-coded. May be different for other gaf files.
                                sep='\t')

goa.columns=["DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID", "Reference", 
                         "Evidence Code", "With or From", "Aspect", "Name", "Synonym", "Type", 
                         "Taxon", "Date", "Assigned By", "Annotation Extension", "Gene Product Form ID"]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [46]:

for outs in [(protein_combo.split('-')[0],protein_combo, key) for protein_combo, key in protein_combos]:
    model_code, file_name, key = outs 
    print(model_code)

A1
A1EP
A2
A2EP
B1
B1EP
B2
B2EP
B3EP_A
B3EP_B
C1
A3EP
B1EPA
B1EPB
B1EPC
B2EPA
B2EPB
B2EPC
B3EP_C
B3EP_D
B4_A
B4_B
B4_C
B5_A
B5_B
B5_C


In [None]:
# Initiate an array to hold the results 
# all_results = []

In [49]:

for outs in [(protein_combo.split('-')[0],protein_combo, key) for protein_combo, key in protein_combos]:
    model_code, file_name, key = outs 
    print(model_code)
    if model_code in ['A1', 'A2', 'B1', 'B2', 'C1']:  # We decided to not use these 
        print('skipped!')
        continue
    if model_code in [code for code, _, _ in all_results]:
        print('skipped b/c already exists')
        continue 
    # Download data 
    all_protein_combos_per_cluster = gcs.download_parquet(gcs.uri_to_bucket_and_key(key)[1])
    funsim_result = ev.funsim_evaluator(all_protein_combos_per_cluster, goa)
    funsim_result.funsim()
    cluster_funsim, protein_pair_funsim = funsim_result.cluster_funsim, funsim_result.protein_pair_funsim
    
    metric_coverage = sum(cluster_funsim.num_pairs_with_funsim)/sum(cluster_funsim.num_pairs)
    avg_funsim = protein_pair_funsim.funsim.mean()
    results = [model_code, avg_funsim, metric_coverage]
    
    all_results.append(results)


A1
skipped!
A1EP
skipped b/c already exists
A2
skipped!
A2EP
skipped b/c already exists
B1
skipped!
B1EP
skipped b/c already exists
B2
skipped!
B2EP
skipped b/c already exists
B3EP_A
skipped b/c already exists
B3EP_B
skipped b/c already exists
C1
skipped!
A3EP
2021-Dec-02 22:13:01 Total number of proteins in GO annotations: 18240
2021-Dec-02 22:13:01 IC_t created
2021-Dec-02 22:13:02 Dictionary of proteins and their GO terms lookup created
2021-Dec-02 22:13:02 Funsim calculated.
2021-Dec-02 22:13:02 Funsim summary by cluster done.
2021-Dec-02 22:13:02 Get NP Arr of GO terms for each protein
2021-Dec-02 22:13:02 Turn GO terms into dict
2021-Dec-02 22:13:02 Map GO desc...
2021-Dec-02 22:13:03 Mapping GO desc done.
2021-Dec-02 22:13:03 Common GO term sumary per cluster processed.
2021-Dec-02 22:13:03 Merged cluster-level funsim score with GO summary.
B1EPA
2021-Dec-02 22:13:04 Total number of proteins in GO annotations: 18240
2021-Dec-02 22:13:04 IC_t created
2021-Dec-02 22:13:05 Dictiona

2021-Dec-02 22:14:30 Dictionary of proteins and their GO terms lookup created
2021-Dec-02 22:14:40 Funsim calculated.
2021-Dec-02 22:14:40 Funsim summary by cluster done.
2021-Dec-02 22:14:40 Get NP Arr of GO terms for each protein
2021-Dec-02 22:14:40 Turn GO terms into dict
2021-Dec-02 22:14:40 Map GO desc...
2021-Dec-02 22:14:40 Mapping GO desc done.
2021-Dec-02 22:14:40 Common GO term sumary per cluster processed.
2021-Dec-02 22:14:40 Merged cluster-level funsim score with GO summary.
B5_C
2021-Dec-02 22:14:41 Total number of proteins in GO annotations: 18240
2021-Dec-02 22:14:41 IC_t created
2021-Dec-02 22:14:42 Dictionary of proteins and their GO terms lookup created
2021-Dec-02 22:14:47 Funsim calculated.
2021-Dec-02 22:14:47 Funsim summary by cluster done.
2021-Dec-02 22:14:48 Get NP Arr of GO terms for each protein
2021-Dec-02 22:14:48 Turn GO terms into dict
2021-Dec-02 22:14:48 Map GO desc...
2021-Dec-02 22:14:48 Mapping GO desc done.
2021-Dec-02 22:14:48 Common GO term suma

In [57]:
for results in all_results:
    print(*results)

A1EP 0.025689474719644105 0.802058291315003
A2EP 0.3942850149789629 0.9731551317623458
B1EP 0.02426301881073859 0.8042530083094852
B2EP 0.5021110077655506 0.9388061747689945
B3EP_A 0.5143044140935744 0.9404035931598473
B3EP_B 0.4364876394172102 0.9276454823591186
A3EP 0.7391970741249122 0.974443889027743
B1EPA 0.0236463881763495 0.7925450916393154
B1EPB 0.025405000221041408 0.8027933935513437
B1EPC 0.026237588923362674 0.8044042487222828
B2EPA 0.4600550662672017 0.9219690717950558
B2EPB 0.45024964659863936 0.9184939809262382
B2EPC 0.47715209912173945 0.8623155096158625
B3EP_C 0.5036136426484419 0.9810276285877486
B3EP_D 0.5031904752363127 0.9308989985119794
B4_A 0.024731967296761817 0.7958824630357477
B4_B 0.02457035389994004 0.7910389608349266
B4_C 0.024221499982038406 0.7946383381072889
B5_A 0.024455814115835105 0.8154816669293619
B5_B 0.025606114708819683 0.7966499291545202
B5_C 0.025698134713331038 0.8127084210085915


In [51]:
all_results

[['A1EP', 0.025689474719644105, 0.802058291315003],
 ['A2EP', 0.3942850149789629, 0.9731551317623458],
 ['B1EP', 0.02426301881073859, 0.8042530083094852],
 ['B2EP', 0.5021110077655506, 0.9388061747689945],
 ['B3EP_A', 0.5143044140935744, 0.9404035931598473],
 ['B3EP_B', 0.4364876394172102, 0.9276454823591186],
 ['A3EP', 0.7391970741249122, 0.974443889027743],
 ['B1EPA', 0.0236463881763495, 0.7925450916393154],
 ['B1EPB', 0.025405000221041408, 0.8027933935513437],
 ['B1EPC', 0.026237588923362674, 0.8044042487222828],
 ['B2EPA', 0.4600550662672017, 0.9219690717950558],
 ['B2EPB', 0.45024964659863936, 0.9184939809262382],
 ['B2EPC', 0.47715209912173945, 0.8623155096158625],
 ['B3EP_C', 0.5036136426484419, 0.9810276285877486],
 ['B3EP_D', 0.5031904752363127, 0.9308989985119794],
 ['B4_A', 0.024731967296761817, 0.7958824630357477],
 ['B4_B', 0.02457035389994004, 0.7910389608349266],
 ['B4_C', 0.024221499982038406, 0.7946383381072889],
 ['B5_A', 0.024455814115835105, 0.8154816669293619],
 ['

In [45]:
all_protein_combos_per_cluster.head()

Unnamed: 0,query_protein,target_protein,cluster,protein_A,protein_B
0,AF-Q5RL73-F1-model_v1,AF-Q9H1X3-F1-model_v1,0,AF-Q5RL73-F1-model_v1,AF-Q9H1X3-F1-model_v1
1,AF-Q5RL73-F1-model_v1,AF-O95807-F1-model_v1,0,AF-O95807-F1-model_v1,AF-Q5RL73-F1-model_v1
2,AF-Q5RL73-F1-model_v1,AF-P59773-F1-model_v1,0,AF-P59773-F1-model_v1,AF-Q5RL73-F1-model_v1
3,AF-Q5RL73-F1-model_v1,AF-Q6UX52-F1-model_v1,0,AF-Q5RL73-F1-model_v1,AF-Q6UX52-F1-model_v1
4,AF-Q9H1X3-F1-model_v1,AF-Q5RL73-F1-model_v1,0,AF-Q5RL73-F1-model_v1,AF-Q9H1X3-F1-model_v1


In [26]:
all_protein_combos_per_cluster = gcs.download_parquet(
    'model_outputs/no_cluster_size_limit/otherparams/A3EP-DBSCAN-CurveNet-all_protein_combos_per_cluster.parquet')

In [27]:
funsim_result = ev.funsim_evaluator(all_protein_combos_per_cluster, goa)

2021-Dec-02 21:46:40 Total number of proteins in GO annotations: 18240
2021-Dec-02 21:46:40 IC_t created
2021-Dec-02 21:46:40 Dictionary of proteins and their GO terms lookup created


In [28]:
funsim_result.funsim()
cluster_funsim, protein_pair_funsim = funsim_result.cluster_funsim, funsim_result.protein_pair_funsim

2021-Dec-02 21:46:42 Funsim calculated.
2021-Dec-02 21:46:42 Funsim summary by cluster done.
2021-Dec-02 21:46:42 Get NP Arr of GO terms for each protein
2021-Dec-02 21:46:42 Turn GO terms into dict
2021-Dec-02 21:46:42 Map GO desc...
2021-Dec-02 21:46:42 Mapping GO desc done.
2021-Dec-02 21:46:42 Common GO term sumary per cluster processed.
2021-Dec-02 21:46:42 Merged cluster-level funsim score with GO summary.


In [29]:
cluster_funsim.head()

Unnamed: 0,num_pairs,num_pairs_with_funsim,funsim,perc_pairs_w_funsim,cluster,go,go_summary
0,3,3,0.962196,1.0,0,"{'GO:0004930': 3, 'GO:0005509': 3, 'GO:0005515...","{'GO:0004930': {'Num. Protein': 3, 'Name': 'G ..."
1,1,1,0.325129,1.0,1,"{'GO:0003723': 1, 'GO:0005198': 1, 'GO:0005200...","{'GO:0005515': {'Num. Protein': 2, 'Name': 'pr..."
2,1,1,0.336002,1.0,2,"{'GO:0003674': 1, 'GO:0003712': 2, 'GO:0003713...","{'GO:0003712': {'Num. Protein': 2, 'Name': 'tr..."
3,1,1,1.0,1.0,3,"{'GO:0000978': 2, 'GO:0000981': 2, 'GO:0005515...","{'GO:0000978': {'Num. Protein': 2, 'Name': 'RN..."
4,666,666,0.798678,1.0,4,"{'GO:0000978': 36, 'GO:0000981': 35, 'GO:00012...","{'GO:0000978': {'Num. Protein': 36, 'Name': 'R..."


In [34]:
print(sum(cluster_funsim.num_pairs_with_funsim))
print(sum(cluster_funsim.num_pairs))

31190
32008


In [32]:
print(protein_pair_funsim.shape)
print(protein_pair_funsim.dropna().shape)

(32009, 4)
(31191, 4)


In [35]:
protein_pair_funsim.head()

Unnamed: 0,protein_A,protein_B,cluster,funsim
1,Q9HCU4,Q9NYQ6,0,1.0
2,Q9NYQ6,Q9NYQ7,0,0.943294
5,Q9HCU4,Q9NYQ7,0,0.943294
1,P26232,P35221,1,0.325129
1,P98168,P98169,2,0.336002


In [36]:
protein_pair_funsim.funsim.mean()

0.7391970741249122

Here's a function that will create a dataframe from GO summary of each cluster. In the below example, I pull the stats for cluster number 2. 

In [35]:
clusterno = 2
cluster_go_summary = funsim_result.get_go_summary_df(clusterno=clusterno)
cluster_go_summary

Unnamed: 0,Num. Protein,Name,Description
GO:0005509,5,calcium ion binding,Binding to a calcium ion (Ca2+).
GO:0046914,5,transition metal ion binding,Binding to a transition metal ions; a transiti...
GO:0005515,3,protein binding,Binding to a protein.
GO:0005198,2,structural molecule activity,The action of a molecule that contributes to t...
GO:0003723,1,RNA binding,Binding to an RNA molecule or a portion thereof.
GO:0030280,1,structural constituent of skin epidermis,The action of a molecule that contributes to t...
GO:0048306,1,calcium-dependent protein binding,Binding to a protein or protein complex in the...


# Background

## Gene Ontology

More info on Gene Ontology: http://geneontology.org/docs/ontology-documentation/
1. **Molecular function**: describe activities that occur at the molecular level, such as “catalysis” or “transport”. GO molecular function terms represent activities rather than the entities (molecules or complexes) that perform the actions
1. **Cellular component**: locations relative to cellular structures in which a gene product performs a function, either cellular compartments (e.g., mitochondrion), or stable macromolecular complexes of which they are parts (e.g., the ribosome)
1. **Biological process**: The larger processes, or ‘biological programs’ accomplished by multiple molecular activities. Examples of broad biological process terms are DNA repair or signal transduction.

## Functional Similarity Formula


Funcsim methodology from https://www.nature.com/articles/s41598-018-30455-0
> Functional similarity of a gene pair or a set is determined by the semantic similarities of the GO terms annotating the gene pair or set. Semantic similarity defines a distance between terms in the semantic space of GO and is quantified by the information contents (IC) of the terms. The information content (IC) of a GO term t is defined by negative log-likelihood:
$$IC(t)=-log(p(t))$$
> where term probability P(t) of term t is determined from the annotations of the corpus (corpus-based) or from the structure of the DAG (structure-based). The intuition is that terms in lower levels of DAG, that is, the terms with lower probability carry more specific information than the terms at higher levels in the hierarchy. Corpus-based methods evaluate the term probability as
$$p(t)= \frac{M}{N}$$
where M is the number of genes annotated by term t and N is the total number of genes in the annotating corpus.


## Data: Human Protein to GO Mapping

GAF: GO annotation files. 

Dataset `goa_human.gaf` downloaded from http://current.geneontology.org/products/pages/downloads.html
> **Filtered Files**
>
> These files are taxon-specific and reflect the work of specific projects, primarily the model organisms database groups, to provide comprehensive, non-redundant annotation files for their organism. All the files in this table have been filtered using the annotation file QC pipeline. A major component to the filtering is the requirement that particular taxon IDs can only be included within the association files provided by specific projects; the current list of authoritative groups and major model organisms can be found below. 

```
Homo sapiens
EBI Gene Ontology Annotation Database (goa) 	protein 	543477 	goa_human.gaf (gzip)
```
Data dictionary: http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/




In [4]:
a_file = gzip. open("functional_sim/data/goa_human.gaf.gz", "rb")
contents = a_file. read()

In [178]:
print(contents.decode('utf-8')[0:1423])

!gaf-version: 2.2
!
!generated-by: GOC
!
!date-generated: 2021-10-27T15:09
!
!Header from source association file:
!
!generated-by: GOC
!
!date-generated: 2021-10-27T04:08
!
!Header from goa_human source association file:
!
!The set of protein accessions included in this file is based on UniProt reference proteomes, which provide one protein per gene.
!They include the protein sequences annotated in Swiss-Prot or the longest TrEMBL transcript if there is no Swiss-Prot record.
!If a particular protein accession is not annotated with GO, then it will not appear in this file.
!
!Note that the annotation set in this file is filtered in order to reduce redundancy; the full, unfiltered set can be found in
!ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gz
!
!date-generated: 2021-06-16 11:28
!generated-by: UniProt
!go-version: http://purl.obolibrary.org/obo/go/releases/2021-06-06/extensions/go-plus.owl
!
!
!Header copied from paint_goa_human_valid.gaf
!Created on Wed Sep  8 

In [5]:
goa = pd.read_csv("functional_sim/data/goa_human.gaf.gz", 
            compression='gzip', 
            header=None,
            skiprows=41, 
            sep='\t')
goa.columns=["DB",
                    "DB Object ID",
                    "DB Object Symbol",
                    "Qualifier",
                    "GO ID",
                    "Reference",
                    "Evidence Code",
                    "With or From",
                    "Aspect",
                    "Name",
                    "Synonym",
                    "Type",
                    "Taxon",
                    "Date",
                    "Assigned By",
                    "Annotation Extension",
                    "Gene Product Form ID"]
goa.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
4,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,,


In [164]:
goa.shape

(609748, 17)

In [149]:
goa["GO ID"].unique().size

18527

In [137]:
goa["DB"].unique()

array(['UniProtKB'], dtype=object)

The GOA human GAF file has 610K rows with protein IDs from UniProt.

In [140]:
goa["Qualifier"].unique()

array(['enables', 'located_in', 'involved_in', 'part_of', 'NOT|enables',
       'NOT|involved_in', 'is_active_in', 'NOT|colocalizes_with',
       'colocalizes_with', 'acts_upstream_of_or_within', 'contributes_to',
       'NOT|located_in', 'NOT|part_of', 'NOT|acts_upstream_of_or_within',
       'acts_upstream_of', 'acts_upstream_of_positive_effect',
       'acts_upstream_of_or_within_positive_effect',
       'acts_upstream_of_or_within_negative_effect', 'NOT|contributes_to',
       'acts_upstream_of_negative_effect',
       'NOT|acts_upstream_of_or_within_negative_effect',
       'NOT|is_active_in'], dtype=object)

In [184]:
len(goa["DB Object ID"].unique())

19788

In [185]:
# DB Object Symbol
len(goa["DB Object Symbol"].unique())

19718

There are about 20,000 unique proteins here. That's a great coverage. 

In [139]:
len(goa["GO ID"].unique())

18527

In [141]:
goa["Evidence Code"].unique()

array(['IEA', 'IDA', 'TAS', 'IPI', 'IEP', 'ISS', 'NAS', 'IMP', 'ISA',
       'HDA', 'EXP', 'ND', 'HEP', 'IC', 'RCA', 'HMP', 'IGI', 'IKR', 'IGC',
       'ISO', 'ISM', 'IBA'], dtype=object)

Taxonomy should be human. Some proteins may be found in multiple organisms other than human, but in the end, every data point in this dataset is in some way related to human.

In [183]:
goa["Taxon"].unique()[0:10]

array(['taxon:9606', 'taxon:9606|taxon:1280', 'taxon:9606|taxon:33892',
       'taxon:9606|taxon:11103', 'taxon:9606|taxon:11052',
       'taxon:9606|taxon:562', 'taxon:9606|taxon:197911',
       'taxon:9606|taxon:90370', 'taxon:9606|taxon:31649',
       'taxon:9606|taxon:1313'], dtype=object)

In [182]:
[taxon for taxon in goa["Taxon"].unique() if '9606' not in taxon]

[]

**QC** 

Can I find all GO in the human GOA dataset within GO BASIC?


I downloaded the GO Term hierarchy. The file I downloaded is `go-basic.obo` from http://geneontology.org/docs/download-ontology/ 

Description of the dataset from the source:
> This is the basic version of the GO, filtered such that the graph is guaranteed to be acyclic and annotations can be propagated up the graph. The relations included are is a, part of, regulates, negatively regulates and positively regulates. This version excludes relationships that cross the 3 GO hierarchies. This version should be used with most GO-based annotation tools.
go.obo and go.

In [2]:
import obonet
import networkx as nx
gobasic = obonet.read_obo("functional_sim/data/go-basic.obo")

In [290]:
goa_goid_set = set(goa["GO ID"])
gobasic_set = set(gobasic.nodes)

In [291]:
goa_goid_set.difference(gobasic_set)

set()

GOA is a subset of gobasic, which is the full graph. 

In [292]:
len(gobasic_set.difference(goa_goid_set))

25323

I want to work only with GOMF. 

In [6]:
r_gobasic = nx.reverse_view(gobasic_sub)

# 'GO:0003674' - This is the code for molecular function, which is the topmost parent
# I'm interested in all the nodes that are 1 distance away from the topmost parent. 


NameError: name 'gobasic_sub' is not defined

In [3]:
# # Do not run again if already calculated. Import pickle file. 
# shortest_from_root = dict(nx.all_pairs_shortest_path_length(r_gobasic))
# with open('functional_sim/intermediary_data/shortest_from_root.pkl', 'wb') as file:
#     pickle.dump(shortest_from_root, file)

with open('functional_sim/intermediary_data/shortest_from_root.pkl', 'rb') as file:
    shortest_from_root = pickle.load(file)    
    

In [11]:
# How many GO's are in the shortest paths? 
len(shortest_from_root)

43850

In [12]:
# Here are how many eventually connect to our root, molecular_function.
len(shortest_from_root['GO:0003674'])

11168

There's a total of 11,168 GOMF terms in the GO-basic graph. In the human GAF dataset, there are only about 4,000. In addition to the species filter, the GAF dataset was filtered further through its QC checks. See [Gene Ontology wiki](http://wiki.geneontology.org/index.php/Release_Pipeline#Annotation_QC_checks) for more info.

In [295]:
# MF + BP + CC

len(shortest_from_root['GO:0003674']
   )+len(shortest_from_root['GO:0008150']
        )+len(shortest_from_root['GO:0005575'])

43850

43850  total GOs covered. These three sub-ontologies cover all the gene ontologies! And these are mutually exclusive categories, since, for this dataset, the cross-sub-ontology relationships have been removed.


In [193]:
# GOMF only 

goa_goid_mf = [goid for goid in set(goa["GO ID"]) if goid in shortest_from_root['GO:0003674'] ]
len(goa_goid_mf)

4431

In [194]:
goa_goid_mf[0:10]

['GO:0005412',
 'GO:0035254',
 'GO:0005035',
 'GO:0031690',
 'GO:0046980',
 'GO:1990247',
 'GO:0050567',
 'GO:0052630',
 'GO:0052814',
 'GO:0003943']

# Data: GO ID, Name, and Desc



But I also want the go term names.

From http://geneontology.org/docs/faq/

>    How do I get the term names for my list of GO ids?
>
>    You can use the YeastMine Analyze tool available at SGD to retrieve the GO term names for each ID.
>
>    Go to the Analyze tool on YeastMine
>    In the Select Type pull down, select GO Term
>    Enter your GO ids or upload a list in the full format (GO:0016020, GO:0016301…)
>    Click on Create List. The tool offers several options to download the list.



In [None]:
pd.DataFrame(goa_goid_mf).to_csv('functional_sim/goa_goid_mf.csv', index=False, header=False)

I uploaded `goa_goid_mf.csv` onto YeastMine and downloaded `yeastmine_results_goa_goid_mf.tsv`.

In [13]:
go_term_names = pd.read_csv('functional_sim/data/yeastmine_results_goa_goid_mf.tsv', sep='\t')
go_term_names.columns = [col[10:] for col in go_term_names.columns]
go_term_names.shape

(4431, 4)

In [14]:
go_term_names.head()

Unnamed: 0,Identifier,Name,Namespace,Description
0,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...
1,GO:0000010,trans-hexaprenyltranstransferase activity,molecular_function,Catalysis of the reaction: all-trans-hexapreny...
2,GO:0000014,single-stranded DNA endodeoxyribonuclease acti...,molecular_function,Catalysis of the hydrolysis of ester linkages ...
3,GO:0000016,lactase activity,molecular_function,Catalysis of the reaction: lactose + H2O = D-g...
4,GO:0000026,"alpha-1,2-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...


In [4]:
alphafold_protein_to_parent_gomf_only = pd.read_parquet('functional_sim/alphafold_protein_to_parent_gomf_only.parquet')

In [5]:
alphafold_protein_to_parent_gomf_only.head()

Unnamed: 0,pdbx_db_accession,db_code,db_name,protein_id,pdbx_seq_one_letter_code,protein_filename,parent_gomf,parent_gomf_name,parent_gomf_desc
0,A0A024R1R8,A0A024R1R8_HUMAN,UNP,A0A024R1R8,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1,,,
1,A0A024RBG1,NUD4B_HUMAN,UNP,A0A024RBG1,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1,GO:0003824,catalytic activity,Catalysis of a biochemical reaction at physiol...
2,A0A024RBG1,NUD4B_HUMAN,UNP,A0A024RBG1,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1,GO:0005488,binding,"The selective, non-covalent, often stoichiomet..."
3,A0A024RCN7,A0A024RCN7_HUMAN,UNP,A0A024RCN7,MERSFVWLSCLDSDSCNLTFRLGEVESHACSPSLLWNLLTQYLPPG...,AF-A0A024RCN7-F1-model_v1,,,
4,A0A075B6H5,A0A075B6H5_HUMAN,UNP,A0A075B6H5,METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVIC...,AF-A0A075B6H5-F1-model_v1,,,


In [40]:
alphafold_protein_to_parent_gomf_only[
    ~alphafold_protein_to_parent_gomf_only.parent_gomf.isna()
].protein_id.unique().size

15412

In [8]:
protein_goa_mf = pd.read_parquet('functional_sim/protein_goa_mf.parquet')

In [9]:
protein_goa_mf.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
240,UniProtKB,A0A075B734,LOC100509620,enables,GO:0015267,GO_REF:0000002,IEA,InterPro:IPR000425,F,Putative aquaporin-7-like protein 3,LOC100509620,protein,taxon:9606,20210612,InterPro,,


In [38]:
protein_goa_mf['DB Object ID'].unique().size

18240

In [13]:
# Here are how many eventually connect to our root, molecular_function.
shortest_from_root['GO:0003674']

{'GO:0003674': 0,
 'GO:0045182': 1,
 'GO:0090729': 1,
 'GO:0005488': 1,
 'GO:0060089': 1,
 'GO:0003824': 1,
 'GO:0140313': 1,
 'GO:0140299': 1,
 'GO:0140657': 1,
 'GO:0140104': 1,
 'GO:0005215': 1,
 'GO:0003774': 1,
 'GO:0045735': 1,
 'GO:0140691': 1,
 'GO:0140489': 1,
 'GO:0044183': 1,
 'GO:0140110': 1,
 'GO:0005198': 1,
 'GO:0140522': 1,
 'GO:0016209': 1,
 'GO:0060090': 1,
 'GO:0098772': 1,
 'GO:0031386': 1,
 'GO:0038024': 1,
 'GO:0140223': 1,
 'GO:0140488': 2,
 'GO:0017056': 2,
 'GO:0008430': 2,
 'GO:0033226': 2,
 'GO:0004694': 2,
 'GO:0016829': 2,
 'GO:0070287': 2,
 'GO:0036310': 2,
 'GO:0036402': 2,
 'GO:0061783': 2,
 'GO:0031992': 2,
 'GO:0015643': 2,
 'GO:0140545': 2,
 'GO:0016491': 2,
 'GO:0140487': 2,
 'GO:0016887': 2,
 'GO:1904517': 2,
 'GO:0030246': 2,
 'GO:0097159': 2,
 'GO:0140597': 2,
 'GO:0044590': 2,
 'GO:0042626': 2,
 'GO:0140414': 2,
 'GO:0140605': 2,
 'GO:0140490': 2,
 'GO:1901363': 2,
 'GO:0140442': 2,
 'GO:0140311': 2,
 'GO:1902670': 2,
 'GO:0070463': 2,
 'GO:00305

In [7]:
gobasic = obonet.read_obo("functional_sim/data/go-basic.obo")
def filter_edge(u,v,e):
    if 'is_a' in e or 'part_of' in e:
        return True
    else:
        return False
    
gobasic_sub = nx.subgraph_view(gobasic, filter_edge=filter_edge)
r_gobasic = nx.reverse_view(gobasic_sub)


In [20]:
second_layer = r_gobasic['GO:0003674'] 

In [21]:
second_layer = r_gobasic['GO:0003674'] 
topmf = list(dict(second_layer).keys())
topmf

['GO:0003774',
 'GO:0003824',
 'GO:0005198',
 'GO:0005215',
 'GO:0005488',
 'GO:0016209',
 'GO:0031386',
 'GO:0038024',
 'GO:0044183',
 'GO:0045182',
 'GO:0045735',
 'GO:0060089',
 'GO:0060090',
 'GO:0090729',
 'GO:0098772',
 'GO:0140104',
 'GO:0140110',
 'GO:0140223',
 'GO:0140299',
 'GO:0140313',
 'GO:0140489',
 'GO:0140522',
 'GO:0140657',
 'GO:0140691']

In [22]:
second_layer = r_gobasic['GO:0003674'] 
topmf = list(dict(second_layer).keys())
topmf
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)


In [23]:
go_parent_mf_map.shape

(4630, 2)

In [24]:
go_parent_mf_map.head()

Unnamed: 0,gomf,parent_mf
0,GO:0003723,GO:0005488
1,GO:0046872,GO:0005488
2,GO:0052840,GO:0003824
3,GO:0052842,GO:0003824
4,GO:0015267,GO:0005215


In [25]:
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

In [26]:
clu_df_gomf.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID,gomf,parent_mf
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,,GO:0003723,GO:0005488
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,,GO:0046872,GO:0005488
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,,GO:0052840,GO:0003824
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,,GO:0052842,GO:0003824
4,UniProtKB,A0A075B734,LOC100509620,enables,GO:0015267,GO_REF:0000002,IEA,InterPro:IPR000425,F,Putative aquaporin-7-like protein 3,LOC100509620,protein,taxon:9606,20210612,InterPro,,,GO:0015267,GO:0005215


In [29]:
clu_df_gomf.parent_mf.unique()#.size

array(['GO:0005488', 'GO:0003824', 'GO:0005215', 'GO:0098772',
       'GO:0060089', 'GO:0005198', 'GO:0140110', nan, 'GO:0016209',
       'GO:0038024', 'GO:0140657', 'GO:0060090', 'GO:0003774',
       'GO:0140299', 'GO:0140104', 'GO:0140223', 'GO:0044183',
       'GO:0045182', 'GO:0140313', 'GO:0031386'], dtype=object)

In [42]:
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
protein_goa_mf_parent_only.head()

Unnamed: 0,protein_id,parent_mf
0,A0A024RBG1,GO:0005488
2,A0A024RBG1,GO:0003824
4,A0A075B734,GO:0005215
5,A0A087WT01,GO:0005488
6,A0A087WUV0,GO:0005488


In [43]:
protein_goa_mf_parent_only.parent_mf.value_counts()

GO:0005488    16299
GO:0003824     5660
GO:0098772     1958
GO:0140110     1941
GO:0060089     1613
GO:0005215     1250
GO:0005198      694
GO:0140657      509
GO:0060090      364
GO:0045182      143
GO:0003774      115
GO:0016209       87
GO:0038024       77
GO:0140104       72
GO:0140223       45
GO:0044183       40
GO:0031386       14
GO:0140299       13
GO:0140313        5
Name: parent_mf, dtype: int64

In [44]:
protein_goa_mf_parent_only.protein_id.unique().size

18240

In [46]:
protein_goa_mf_parent_only.to_parquet('functional_sim/protein_goa_mf_parent_only.parquet')

# Labels for training CurveNet

In [None]:
protein_goa_mf = pd.read_parquet('functional_sim/protein_goa_mf.parquet')

In [10]:
protein_goa_mf.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
240,UniProtKB,A0A075B734,LOC100509620,enables,GO:0015267,GO_REF:0000002,IEA,InterPro:IPR000425,F,Putative aquaporin-7-like protein 3,LOC100509620,protein,taxon:9606,20210612,InterPro,,


In [14]:
dir(funsim_result)

['IC_t',
 'M',
 'N',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'all_protein_combos_per_cluster',
 'cluster_funsim',
 'common_go_in_cluster',
 'funsim',
 'get_go_summary_df',
 'get_nparr_of_go_terms',
 'go_term_names',
 'go_term_names_dict',
 'goa',
 'goa_by_protein',
 'goa_goid_mf',
 'jaccard_sim_protein_go',
 'make_go_ct_dict',
 'map_go_desc',
 'protein_pair_funsim',
 'shortest_from_root',
 'unique_proteins']

In [21]:
# Here's the list of GO terms and their IC 
IC_t_df = pd.DataFrame.from_dict(funsim_result.IC_t, orient='index', columns=['IC_t'])

In [23]:
IC_t_df.head()

Unnamed: 0,IC_t
GO:0000009,9.118225
GO:0000010,9.118225
GO:0000014,7.614148
GO:0000016,9.811372
GO:0000026,8.425078


In [82]:
# For each protein, grab the top N IC terms. Let's see how many there are total

go_with_most_ic = set()

for protein, goterms in funsim_result.goa_by_protein.items():
    go_with_most_ic = go_with_most_ic.union (set(IC_t_df.loc[goterms].nlargest(1, "IC_t", keep='all').index))

len(go_with_most_ic)

4179

Grabbing top value per protein doens't work because some proteins only have some of the higher level labels.  Let's look at the top N GOMF terms by IC_t and see how many proteins are covered.

In [56]:
IC_t_df.IC_t.value_counts()

9.811372    1536
9.118225     758
8.712760     433
8.425078     265
8.201934     233
            ... 
3.717802       1
4.975090       1
4.967185       1
4.322435       1
3.497824       1
Name: IC_t, Length: 162, dtype: int64

Given that there are 1,500 GO terms with the highest information content, that may be too granular. Let's give it a try wit the third level.

In [105]:
IC_target = 8.712760

selected_go = set(IC_t_df[round(IC_t_df.IC_t, 6)==IC_target].index)

print("selected_go with IC = {}: {}".format(IC_target, 
                                            len(selected_go)))

####

selected_proteins = set()

for protein, goterms in funsim_result.goa_by_protein.items():
    if len(goterms.intersection(selected_go))>0:
        selected_proteins.add(protein)

print("Number of proteins directly containing one of selected_go: {}".format(len(selected_proteins)))

### 

go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in selected_go:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

rolled_into_selected_go = set(go_parent_mf_map.gomf)
print("Number of GO terms that roll up into selected_go: {}".format(len(rolled_into_selected_go)))

###
selected_proteins = set()

for protein, goterms in funsim_result.goa_by_protein.items():
    if len(goterms.intersection(rolled_into_selected_go))>0:
        selected_proteins.add(protein)

print("Number of proteins with GO rolling up into selected_go: {}".format(len(selected_proteins)))


selected_go with IC = 8.71276: 433
Number of proteins directly containing one of selected_go: 1086
Number of GO terms that roll up into selected_go: 929
Number of proteins with GO rolling up into selected_go: 4568


In [104]:
IC_target = 8.425078

selected_go = set(IC_t_df[round(IC_t_df.IC_t, 6)==IC_target].index)

print("selected_go with IC = {}: {}".format(IC_target, 
                                            len(selected_go)))

####

selected_proteins = set()

for protein, goterms in funsim_result.goa_by_protein.items():
    if len(goterms.intersection(selected_go))>0:
        selected_proteins.add(protein)

print("Number of proteins directly containing one of selected_go: {}".format(len(selected_proteins)))

### 

go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in selected_go:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

rolled_into_selected_go = set(go_parent_mf_map.gomf)
print("Number of GO terms that roll up into selected_go: {}".format(len(rolled_into_selected_go)))

###
selected_proteins = set()

for protein, goterms in funsim_result.goa_by_protein.items():
    if len(goterms.intersection(rolled_into_selected_go))>0:
        selected_proteins.add(protein)

print("Number of proteins with GO rolling up into selected_go: {}".format(len(selected_proteins)))


selected_go with IC = 8.425078: 265
Number of proteins directly containing one of selected_go: 905
Number of GO terms that roll up into selected_go: 849
Number of proteins with GO rolling up into selected_go: 3317


### How about some of the top most parents...

In [133]:
second_layer = r_gobasic['GO:0003674'] 
topmf = list(dict(second_layer).keys())
topmf


['GO:0003774',
 'GO:0003824',
 'GO:0005198',
 'GO:0005215',
 'GO:0005488',
 'GO:0016209',
 'GO:0031386',
 'GO:0038024',
 'GO:0044183',
 'GO:0045182',
 'GO:0045735',
 'GO:0060089',
 'GO:0060090',
 'GO:0090729',
 'GO:0098772',
 'GO:0140104',
 'GO:0140110',
 'GO:0140223',
 'GO:0140299',
 'GO:0140313',
 'GO:0140489',
 'GO:0140522',
 'GO:0140657',
 'GO:0140691']

In [134]:
len(topmf)

24

In [107]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]

protein_goa_mf_parent_only.parent_mf.value_counts()

GO:0005488    16299
GO:0003824     5660
GO:0098772     1958
GO:0140110     1941
GO:0060089     1613
GO:0005215     1250
GO:0005198      694
GO:0140657      509
GO:0060090      364
GO:0045182      143
GO:0003774      115
GO:0016209       87
GO:0038024       77
GO:0140104       72
GO:0140223       45
GO:0044183       40
GO:0031386       14
GO:0140299       13
GO:0140313        5
Name: parent_mf, dtype: int64

In [111]:
# Let's break GO:0005488
topmf_2 = []
for gomf in topmf:
    if gomf != 'GO:0005488':
        topmf_2.append(gomf)
    
topmf_2 += list(r_gobasic['GO:0005488'] )
len(topmf_2)

74

In [115]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_2:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
protein_goa_mf_parent_only.parent_mf.value_counts()

18240


GO:0005515    13889
GO:0043167     6065
GO:0097159     6036
GO:1901363     5955
GO:0003824     5660
GO:0036094     2504
GO:0097367     2260
GO:0098772     1958
GO:0140110     1941
GO:0060089     1613
GO:0044877     1310
GO:0005215     1250
GO:0008289      803
GO:0005198      694
GO:0003682      575
GO:0140657      509
GO:0033218      410
GO:0060090      364
GO:0030246      272
GO:1901681      264
GO:0003823      193
GO:0045182      143
GO:0005549      123
GO:0003774      115
GO:0072341       93
GO:0016209       87
GO:0042562       86
GO:0140272       79
GO:0038024       77
GO:0140104       72
GO:0051540       67
GO:0050840       58
GO:0140223       45
GO:0044183       40
GO:0050997       40
GO:1901567       23
GO:0048038       18
GO:0042165       18
GO:0031386       14
GO:0046790       13
GO:0140299       13
GO:0043176       12
GO:0015643       10
GO:0008430        8
GO:0043515        6
GO:0140313        5
GO:0046848        4
GO:0050436        4
GO:0019808        3
GO:0070026        3


In [116]:
# Let's break GO:0005488 
topmf_2 = []
for gomf in topmf:
    if gomf != 'GO:0005488':
        topmf_2.append(gomf)
    
topmf_2 += list(r_gobasic['GO:0005488'] )

### break GO:0005515
topmf_3 = []
for gomf in topmf_2:
    if gomf != 'GO:0005515':
        topmf_3.append(gomf)
    
topmf_3 += list(r_gobasic['GO:0005515'] )

len(topmf_3)

182

In [119]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

valct[valct.parent_mf >= 50].shape

18240


Unnamed: 0,parent_mf
GO:0043167,6065
GO:0097159,6036
GO:1901363,5955
GO:0003824,5660
GO:0036094,2504


In [122]:
valct[valct.parent_mf >= 50].shape

(62, 1)

In [121]:
valct.head(100)

Unnamed: 0,parent_mf
GO:0043167,6065
GO:0097159,6036
GO:1901363,5955
GO:0003824,5660
GO:0036094,2504
...,...
GO:0046790,13
GO:0043176,12
GO:0002162,11
GO:0097371,11


Let's break it further

In [137]:
# Let's break GO:0005488 
import copy 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824'  # break third level if protein > 5,000
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = [gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

285

In [141]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(73, 1)


Unnamed: 0,parent_mf
GO:0043169,4352
GO:0003676,3990
GO:0036094,2504
GO:0043168,2417
GO:0016787,2395
GO:0140096,2327
GO:0016740,2306
GO:0097367,2260
GO:1901265,2162
GO:0019899,2047


In [144]:
list(valct[valct.parent_mf >= 2000].index)

['GO:0043169',
 'GO:0003676',
 'GO:0036094',
 'GO:0043168',
 'GO:0016787',
 'GO:0140096',
 'GO:0016740',
 'GO:0097367',
 'GO:1901265',
 'GO:0019899']

In [145]:
# Let's break some more 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824',  # break third level if protein > 5,000
            # break again if protein > 2000
            'GO:0043169',
             'GO:0003676',
             'GO:0036094',
             'GO:0043168',
             'GO:0016787',
             'GO:0140096',
             'GO:0016740',
             'GO:0097367',
             'GO:1901265',
             'GO:0019899'
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = [gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

540

In [146]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(105, 1)


Unnamed: 0,parent_mf
GO:0046872,4255
GO:0003677,2498
GO:0000166,2161
GO:0042802,1963
GO:0098772,1958
GO:0140110,1941
GO:0032553,1915
GO:0035639,1825
GO:0003723,1680
GO:0060089,1613


In [147]:
list(valct[valct.parent_mf >= 2000].index)

['GO:0046872', 'GO:0003677', 'GO:0000166']

In [148]:
# Let's break some more 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824',  # break third level if protein > 5,000
            # break again if protein > 2000
            'GO:0043169',
             'GO:0003676',
             'GO:0036094',
             'GO:0043168',
             'GO:0016787',
             'GO:0140096',
             'GO:0016740',
             'GO:0097367',
             'GO:1901265',
             'GO:0019899',
            # Break again if protein > 2000
            'GO:0046872', 'GO:0003677', 'GO:0000166'
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = [gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

568

In [149]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(113, 1)


Unnamed: 0,parent_mf
GO:0042802,1963
GO:0098772,1958
GO:0140110,1941
GO:0032553,1915
GO:0017076,1912
GO:0035639,1825
GO:0043565,1691
GO:0003690,1682
GO:0003723,1680
GO:0060089,1613


In [150]:
list(valct[valct.parent_mf >= 1500].index)

['GO:0042802',
 'GO:0098772',
 'GO:0140110',
 'GO:0032553',
 'GO:0017076',
 'GO:0035639',
 'GO:0043565',
 'GO:0003690',
 'GO:0003723',
 'GO:0060089',
 'GO:0005102',
 'GO:0001067']

In [151]:
# Let's break some more 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824',  # break third level if protein > 5,000
            # break again if protein > 2000
            'GO:0043169',
             'GO:0003676',
             'GO:0036094',
             'GO:0043168',
             'GO:0016787',
             'GO:0140096',
             'GO:0016740',
             'GO:0097367',
             'GO:1901265',
             'GO:0019899',
            # Break again if protein > 2000
            'GO:0046872', 'GO:0003677', 'GO:0000166',
            # Break again if protein > 1,500
            'GO:0042802',
             'GO:0098772',
             'GO:0140110',
             'GO:0032553',
             'GO:0017076',
             'GO:0035639',
             'GO:0043565',
             'GO:0003690',
             'GO:0003723',
             'GO:0060089',
             'GO:0005102',
             'GO:0001067'
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = [gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

691

In [152]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(130, 1)


Unnamed: 0,parent_mf
GO:0032555,1897
GO:0038023,1613
GO:1990837,1587
GO:0030554,1557
GO:0000976,1530
GO:0005524,1479
GO:0003700,1445
GO:0044877,1310
GO:0005215,1250
GO:0030234,1245


In [153]:
list(valct[valct.parent_mf >= 1500].index)

['GO:0032555', 'GO:0038023', 'GO:1990837', 'GO:0030554', 'GO:0000976']

In [192]:
# Let's break some more 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824',  # break third level if protein > 5,000
            # break again if protein > 2000
            'GO:0043169',
             'GO:0003676',
             'GO:0036094',
             'GO:0043168',
             'GO:0016787',
             'GO:0140096',
             'GO:0016740',
             'GO:0097367',
             'GO:1901265',
             'GO:0019899',
            # Break again if protein > 2000
            'GO:0046872', 'GO:0003677', 'GO:0000166',
            # Break again if protein > 1,500
            'GO:0042802',
             'GO:0098772',
             'GO:0140110',
             'GO:0032553',
             'GO:0017076',
             'GO:0035639',
             'GO:0043565',
             'GO:0003690',
             'GO:0003723',
             'GO:0060089',
             'GO:0005102',
             'GO:0001067',
            # Break again if protein > 1,500
            'GO:0032555', 'GO:0038023', 'GO:1990837', 'GO:0030554', 'GO:0000976'
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = [gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

736

In [193]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(134, 1)


Unnamed: 0,parent_mf
GO:0032559,1544
GO:0005524,1479
GO:0003700,1445
GO:0000977,1436
GO:0004888,1389
GO:0044877,1310
GO:0005215,1250
GO:0030234,1245
GO:0000987,1228
GO:0046914,1102


In [194]:
list(valct[valct.parent_mf >= 500].index)

['GO:0032559',
 'GO:0005524',
 'GO:0003700',
 'GO:0000977',
 'GO:0004888',
 'GO:0044877',
 'GO:0005215',
 'GO:0030234',
 'GO:0000987',
 'GO:0046914',
 'GO:0046983',
 'GO:0008092',
 'GO:0016772',
 'GO:0008289',
 'GO:0016788',
 'GO:0019900',
 'GO:0016491',
 'GO:0005509',
 'GO:0005198',
 'GO:0019904',
 'GO:0016817',
 'GO:0042803',
 'GO:0008233',
 'GO:0140640',
 'GO:0008134',
 'GO:0004672',
 'GO:0003682',
 'GO:0050839',
 'GO:0030545',
 'GO:0140657']

### Break if >500 and add back in parent terms

In [200]:
# Let's break some more 

# Take the top parent terms 
topmf_2 = copy.deepcopy(topmf)

# Here is the list of terms we want to replace with their immediate children 
to_break = ['GO:0005488',  # one of top 18
            'GO:0005515',  # secone break
            'GO:0043167', 'GO:0097159', 'GO:1901363', 'GO:0003824',  # break third level if protein > 5,000
            # break again if protein > 2000
            'GO:0043169',
             'GO:0003676',
             'GO:0036094',
             'GO:0043168',
             'GO:0016787',
             'GO:0140096',
             'GO:0016740',
             'GO:0097367',
             'GO:1901265',
             'GO:0019899',
            # Break again if protein > 2000
            'GO:0046872', 'GO:0003677', 'GO:0000166',
            # Break again if protein > 1,500
            'GO:0042802',
             'GO:0098772',
             'GO:0140110',
             'GO:0032553',
             'GO:0017076',
             'GO:0035639',
             'GO:0043565',
             'GO:0003690',
             'GO:0003723',
             'GO:0060089',
             'GO:0005102',
             'GO:0001067',
            # Break again if protein > 1,500
            'GO:0032555', 'GO:0038023', 'GO:1990837', 'GO:0030554', 'GO:0000976',
            # Break again if protein > 500
            'GO:0032559',
             'GO:0005524',
             'GO:0003700',
             'GO:0000977',
             'GO:0004888',
             'GO:0044877',
             'GO:0005215',
             'GO:0030234',
             'GO:0000987',
             'GO:0046914',
             'GO:0046983',
             'GO:0008092',
             'GO:0016772',
             'GO:0008289',
             'GO:0016788',
             'GO:0019900',
             'GO:0016491',
             'GO:0005509',
             'GO:0005198',
             'GO:0019904',
             'GO:0016817',
             'GO:0042803',
             'GO:0008233',
             'GO:0140640',
             'GO:0008134',
             'GO:0004672',
             'GO:0003682',
             'GO:0050839',
             'GO:0030545',
             'GO:0140657'
           ]

# Look up the children of the above terms ad add to the list 
for gomf in to_break:
    topmf_2 += list(r_gobasic[gomf] )

# Filter out the original GO terms we broke into the chilren
topmf_3 = topmf_2 #[gomf for gomf in topmf_2 if gomf not in to_break]
len(topmf_3)

1258

In [201]:
# Create a mapping 
go_parent_mf_map = pd.DataFrame(columns=["gomf", "parent_mf"])

for gomf in protein_goa_mf["GO ID"].unique():
    for parent_mf in topmf_3:
        if gomf in shortest_from_root[parent_mf]:
            go_parent_mf_map = go_parent_mf_map.append({"gomf": gomf, "parent_mf": parent_mf},
                                                      ignore_index=True)

# Map the mapping into proteins             
clu_df_gomf = protein_goa_mf.merge(go_parent_mf_map,
                  how='left',
                  left_on='GO ID',
                  right_on='gomf'
                 )

# Deduplicate and give counts
protein_goa_mf_parent_only = clu_df_gomf[["DB Object ID", "parent_mf"]].drop_duplicates()
protein_goa_mf_parent_only.columns=["protein_id", "parent_mf"]
print(len(protein_goa_mf_parent_only.protein_id.unique()))
valct = pd.DataFrame(protein_goa_mf_parent_only.parent_mf.value_counts())

print(valct[valct.parent_mf >= 50].shape)
valct.head(10)

18240
(247, 1)


Unnamed: 0,parent_mf
GO:0005488,16299
GO:0005515,13889
GO:0043167,6065
GO:0097159,6036
GO:1901363,5955
GO:0003824,5660
GO:0043169,4352
GO:0046872,4255
GO:0003676,3990
GO:0036094,2504


In [229]:
test = valct.loc[[i for i in valct.index if i not in to_break]]
test.head(15)

Unnamed: 0,parent_mf
GO:0000981,1394
GO:0000978,1208
GO:0022857,1137
GO:0004930,883
GO:0008270,828
GO:0016301,754
GO:0016773,694
GO:0016818,682
GO:0019901,679
GO:0003712,499


In [202]:
print(valct.shape)

(874, 1)


In [203]:
print(valct[valct.parent_mf >= 20].shape)

(370, 1)


In [204]:
protein_goa_mf_parent_only[protein_goa_mf_parent_only.parent_mf.isin(
    list(valct[valct.parent_mf >= 20].index)
)].protein_id.unique().shape

(18064,)

In [205]:
# Add in IC 
protein_goa_mf_parent_only["IC_t"] = protein_goa_mf_parent_only.parent_mf.map(funsim_result.IC_t)

# add in number of proteins per GO 
protein_goa_mf_parent_only["num_proteins_per_parent_mf"] = protein_goa_mf_parent_only.parent_mf.map(
    dict(protein_goa_mf_parent_only.parent_mf.value_counts())
)
protein_goa_mf_parent_only.head()

Unnamed: 0,protein_id,parent_mf,IC_t,num_proteins_per_parent_mf
0,A0A024RBG1,GO:0005488,,16299.0
1,A0A024RBG1,GO:0097159,9.118225,6036.0
2,A0A024RBG1,GO:1901363,9.811372,5955.0
3,A0A024RBG1,GO:0003676,5.101842,3990.0
5,A0A024RBG1,GO:0003723,2.575753,1680.0


In [207]:
protein_goa_mf_parent_only.shape

(191368, 4)

In [208]:
protein_goa_mf_parent_only.protein_id.unique().shape

(18240,)

In [209]:
protein_goa_mf_parent_only.parent_mf.unique().shape

(875,)

In [210]:
protein_goa_mf_parent_only.describe()

Unnamed: 0,IC_t,num_proteins_per_parent_mf
count,148932.0,190643.0
mean,5.600266,4203.42813
std,2.768017,5099.81868
min,0.388019,1.0
25%,3.911475,762.0
50%,5.591865,1941.0
75%,7.731931,5955.0
max,9.811372,16299.0


In [206]:
protein_goa_mf_parent_only.to_parquet("functional_sim/protein_goa_mf_parent_only_lessthan500.parquet")

In [232]:
funsim_result.IC_t['GO:0005488']

KeyError: 'GO:0005488'