# Functional Evaluations
Haerang Lee

Find out a way to look into functional agreements.

In [1]:
from google.cloud import storage
import argparse
import gzip
import os
import sys
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from utils import gcs_utils as gcs
from utils import model_and_evaluate_cluster as ev
import urllib.parse
import urllib.request

import io 

import importlib
import hdbscan
import networkx as nx 

In [2]:
importlib.reload(ev)

<module 'utils.model_and_evaluate_cluster' from '/Users/haeranglee/Documents/pss/utils/model_and_evaluate_cluster.py'>

# Tutorial

In [3]:
# Import possible pairs
prefix='model_outputs/no_cluster_size_limit/'
all_protein_combos_per_cluster = gcs.download_parquet(prefix+'B2-HDBSCAN-SeqVec-all_protein_combos_per_cluster.parquet')

In [5]:
funsim_result = ev.funsim_evaluator(all_protein_combos_per_cluster)

2021-Nov-07 15:38:03 No GO annotations provided. Downloading from google cloud.


  funsim_result = ev.funsim_evaluator(all_protein_combos_per_cluster)


2021-Nov-07 15:38:06 Total number of proteins in GO annotations: 18240
2021-Nov-07 15:38:06 IC_t created
2021-Nov-07 15:38:06 Dictionary of proteins and their GO terms lookup created


In [6]:
funsim_result.funsim()
cluster_funsim, protein_pair_funsim = funsim_result.cluster_funsim, funsim_result.protein_pair_funsim

2021-Nov-07 15:38:13 Funsim calculated.
2021-Nov-07 15:38:13 Funsim summary by cluster done.
2021-Nov-07 15:38:13 Get NP Arr of GO terms for each protein
2021-Nov-07 15:38:13 Turn GO terms into dict
2021-Nov-07 15:38:13 Map GO desc...
2021-Nov-07 15:38:13 Mapping GO desc done.
2021-Nov-07 15:38:13 Common GO term sumary per cluster processed.
2021-Nov-07 15:38:13 Merged cluster-level funsim score with GO summary.


In [7]:
cluster_funsim.head()

Unnamed: 0_level_0,num_pairs,num_pairs_with_funsim,funsim,perc_pairs_w_funsim,cluster,go,go_summary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,45,45,0.477738,1.0,0,"{'GO:0001540': 1, 'GO:0004175': 2, 'GO:0004190...","{'GO:0004190': {'Num. Protein': 10, 'Name': 'a..."
1,276,0,,0.0,1,{},{}
2,15,15,0.40824,1.0,2,"{'GO:0003723': 1, 'GO:0005198': 2, 'GO:0005509...","{'GO:0005509': {'Num. Protein': 5, 'Name': 'ca..."
3,10,10,0.724022,1.0,3,"{'GO:0003674': 1, 'GO:0004252': 5, 'GO:0005515...","{'GO:0004252': {'Num. Protein': 5, 'Name': 'se..."
4,15,15,0.407159,1.0,4,"{'GO:0003674': 3, 'GO:0005515': 4}","{'GO:0005515': {'Num. Protein': 4, 'Name': 'pr..."


In [8]:
protein_pair_funsim.head()

Unnamed: 0,protein_A,protein_B,cluster,funsim
1,O96009,P00797,0,0.428994
2,O96009,P07339,0,0.383279
3,O96009,P0DJD7,0,0.396631
4,O96009,P0DJD8,0,0.37498
5,O96009,P0DJD9,0,0.37498


# Background

## Gene Ontology

More info on Gene Ontology: http://geneontology.org/docs/ontology-documentation/
1. **Molecular function**: describe activities that occur at the molecular level, such as “catalysis” or “transport”. GO molecular function terms represent activities rather than the entities (molecules or complexes) that perform the actions
1. **Cellular component**: locations relative to cellular structures in which a gene product performs a function, either cellular compartments (e.g., mitochondrion), or stable macromolecular complexes of which they are parts (e.g., the ribosome)
1. **Biological process**: The larger processes, or ‘biological programs’ accomplished by multiple molecular activities. Examples of broad biological process terms are DNA repair or signal transduction.

## Functional Similarity Formula


Funcsim methodology from https://www.nature.com/articles/s41598-018-30455-0
> Functional similarity of a gene pair or a set is determined by the semantic similarities of the GO terms annotating the gene pair or set. Semantic similarity defines a distance between terms in the semantic space of GO and is quantified by the information contents (IC) of the terms. The information content (IC) of a GO term t is defined by negative log-likelihood:
$$IC(t)=-log(p(t))$$
> where term probability P(t) of term t is determined from the annotations of the corpus (corpus-based) or from the structure of the DAG (structure-based). The intuition is that terms in lower levels of DAG, that is, the terms with lower probability carry more specific information than the terms at higher levels in the hierarchy. Corpus-based methods evaluate the term probability as
$$p(t)= \frac{M}{N}$$
where M is the number of genes annotated by term t and N is the total number of genes in the annotating corpus.


## Data: Human Protein to GO Mapping

GAF: GO annotation files. 

Dataset `goa_human.gaf` downloaded from http://current.geneontology.org/products/pages/downloads.html
> **Filtered Files**
>
> These files are taxon-specific and reflect the work of specific projects, primarily the model organisms database groups, to provide comprehensive, non-redundant annotation files for their organism. All the files in this table have been filtered using the annotation file QC pipeline. A major component to the filtering is the requirement that particular taxon IDs can only be included within the association files provided by specific projects; the current list of authoritative groups and major model organisms can be found below. 

```
Homo sapiens
EBI Gene Ontology Annotation Database (goa) 	protein 	543477 	goa_human.gaf (gzip)
```
Data dictionary: http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/




In [99]:
# Read the whole gzipped GAF file into memory as raw bytes so the header can be inspected below.
# The context manager closes the handle as soon as the read finishes; the original left it open.
with gzip.open("functional_sim/data/goa_human.gaf.gz", "rb") as a_file:
    contents = a_file.read()

In [178]:
print(contents.decode('utf-8')[0:1423])

!gaf-version: 2.2
!
!generated-by: GOC
!
!date-generated: 2021-10-27T15:09
!
!Header from source association file:
!
!generated-by: GOC
!
!date-generated: 2021-10-27T04:08
!
!Header from goa_human source association file:
!
!The set of protein accessions included in this file is based on UniProt reference proteomes, which provide one protein per gene.
!They include the protein sequences annotated in Swiss-Prot or the longest TrEMBL transcript if there is no Swiss-Prot record.
!If a particular protein accession is not annotated with GO, then it will not appear in this file.
!
!Note that the annotation set in this file is filtered in order to reduce redundancy; the full, unfiltered set can be found in
!ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gz
!
!date-generated: 2021-06-16 11:28
!generated-by: UniProt
!go-version: http://purl.obolibrary.org/obo/go/releases/2021-06-06/extensions/go-plus.owl
!
!
!Header copied from paint_goa_human_valid.gaf
!Created on Wed Sep  8 

In [135]:
# Column names for the 17 tab-separated GAF fields, in file order
# (see the data dictionary linked above).
gaf_column_names = [
    "DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID", "Reference",
    "Evidence Code", "With or From", "Aspect", "Name", "Synonym", "Type",
    "Taxon", "Date", "Assigned By", "Annotation Extension", "Gene Product Form ID",
]

# The file has no header row of its own; the first 41 lines are skipped before parsing.
goa = pd.read_csv(
    "functional_sim/data/goa_human.gaf.gz",
    sep='\t',
    compression='gzip',
    skiprows=41,
    header=None,
    names=gaf_column_names,
)
goa.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
4,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,,


In [164]:
goa.shape

(609748, 17)

In [149]:
goa["GO ID"].unique().size

18527

In [137]:
goa["DB"].unique()

array(['UniProtKB'], dtype=object)

The GOA human GAF file has 610K rows with protein IDs from UniProt.

In [140]:
goa["Qualifier"].unique()

array(['enables', 'located_in', 'involved_in', 'part_of', 'NOT|enables',
       'NOT|involved_in', 'is_active_in', 'NOT|colocalizes_with',
       'colocalizes_with', 'acts_upstream_of_or_within', 'contributes_to',
       'NOT|located_in', 'NOT|part_of', 'NOT|acts_upstream_of_or_within',
       'acts_upstream_of', 'acts_upstream_of_positive_effect',
       'acts_upstream_of_or_within_positive_effect',
       'acts_upstream_of_or_within_negative_effect', 'NOT|contributes_to',
       'acts_upstream_of_negative_effect',
       'NOT|acts_upstream_of_or_within_negative_effect',
       'NOT|is_active_in'], dtype=object)

In [184]:
len(goa["DB Object ID"].unique())

19788

In [185]:
# DB Object Symbol
len(goa["DB Object Symbol"].unique())

19718

There are about 20,000 unique proteins here. That's great coverage. 

In [139]:
len(goa["GO ID"].unique())

18527

In [141]:
goa["Evidence Code"].unique()

array(['IEA', 'IDA', 'TAS', 'IPI', 'IEP', 'ISS', 'NAS', 'IMP', 'ISA',
       'HDA', 'EXP', 'ND', 'HEP', 'IC', 'RCA', 'HMP', 'IGI', 'IKR', 'IGC',
       'ISO', 'ISM', 'IBA'], dtype=object)

Taxonomy should be human. Some proteins may be found in multiple organisms other than human, but in the end, every data point in this dataset is in some way related to human.

In [183]:
goa["Taxon"].unique()[0:10]

array(['taxon:9606', 'taxon:9606|taxon:1280', 'taxon:9606|taxon:33892',
       'taxon:9606|taxon:11103', 'taxon:9606|taxon:11052',
       'taxon:9606|taxon:562', 'taxon:9606|taxon:197911',
       'taxon:9606|taxon:90370', 'taxon:9606|taxon:31649',
       'taxon:9606|taxon:1313'], dtype=object)

In [182]:
[taxon for taxon in goa["Taxon"].unique() if '9606' not in taxon]

[]

**QC** 

Can I find all GO in the human GOA dataset within GO BASIC?


I downloaded the GO Term hierarchy. The file I downloaded is `go-basic.obo` from http://geneontology.org/docs/download-ontology/ 

Description of the dataset from the source:
> This is the basic version of the GO, filtered such that the graph is guaranteed to be acyclic and annotations can be propagated up the graph. The relations included are is a, part of, regulates, negatively regulates and positively regulates. This version excludes relationships that cross the 3 GO hierarchies. This version should be used with most GO-based annotation tools.

In [None]:
import obonet
import networkx as nx
gobasic = obonet.read_obo("functional_sim/data/go-basic.obo")

In [290]:
goa_goid_set = set(goa["GO ID"])
gobasic_set = set(gobasic.nodes)

In [291]:
goa_goid_set.difference(gobasic_set)

set()

GOA is a subset of gobasic, which is the full graph. 

In [292]:
len(gobasic_set.difference(goa_goid_set))

25323

I want to work only with GOMF. 

In [None]:
# Build a reversed view of the GO graph. In `gobasic` the edges run from a term to its parent
# (e.g. gobasic['GO:0071714'] lists 'GO:0022857' via `is_a`), so reversing them lets
# shortest-path searches start at a root and walk down to its descendants.
# NOTE(review): `gobasic_sub` is not defined in any cell visible in this notebook; only
# `gobasic` (from obonet.read_obo) is. Unless it is created in a cell that was deleted,
# this line raises NameError on a fresh kernel. Confirm which graph was intended.
r_gobasic = nx.reverse_view(gobasic_sub)

# 'GO:0003674' - This is the code for molecular function, which is the topmost parent
# I'm interested in all the nodes that are 1 distance away from the topmost parent. 


In [None]:
# `pickle` is not part of the import cell at the top of the notebook, so import it here;
# otherwise this cell fails with NameError on Restart & Run All.
import pickle

# # Do not run again if already calculated. Import pickle file. 
# shortest_from_root = dict(nx.all_pairs_shortest_path_length(r_gobasic))
# with open('functional_sim/intermediary_data/shortest_from_root.pkl', 'wb') as file:
#     pickle.dump(shortest_from_root, file)

# Load the cached {source GO ID: {reachable GO ID: path length}} mapping computed above.
# Only unpickle files produced by this project: pickle.load can execute arbitrary code.
with open('functional_sim/intermediary_data/shortest_from_root.pkl', 'rb') as file:
    shortest_from_root = pickle.load(file)
    

In [293]:
# How many GO's are in the shortest paths? 
len(shortest_from_root)

43850

In [294]:
# Here are how many eventually connect to our root, molecular_function.
len(shortest_from_root['GO:0003674'])

11168

There's a total of 11,168 GOMF terms in the GO-basic graph. In the human GAF dataset, there are only about 4,000. In addition to the species filter, the GAF dataset was filtered further through its QC checks. See [Gene Ontology wiki](http://wiki.geneontology.org/index.php/Release_Pipeline#Annotation_QC_checks) for more info.

In [295]:
# MF + BP + CC

len(shortest_from_root['GO:0003674']
   )+len(shortest_from_root['GO:0008150']
        )+len(shortest_from_root['GO:0005575'])

43850

43850  total GOs covered. These three sub-ontologies cover all the gene ontologies! And these are mutually exclusive categories, since, for this dataset, the cross-sub-ontology relationships have been removed.


In [193]:
# GOMF only 

goa_goid_mf = [goid for goid in set(goa["GO ID"]) if goid in shortest_from_root['GO:0003674'] ]
len(goa_goid_mf)

4431

In [194]:
goa_goid_mf[0:10]

['GO:0005412',
 'GO:0035254',
 'GO:0005035',
 'GO:0031690',
 'GO:0046980',
 'GO:1990247',
 'GO:0050567',
 'GO:0052630',
 'GO:0052814',
 'GO:0003943']

# Implement Functional Similarity Formula

> M is the number of genes annotated by term t

In [356]:
# M[t] = number of distinct proteins ("DB Object ID") annotated with molecular-function term t.
mf_annotations = goa[goa['GO ID'].isin(goa_goid_mf)]
M = mf_annotations.groupby('GO ID')['DB Object ID'].nunique().to_dict()

In [357]:
M['GO:0000009']

2

In [213]:
len(M)

4431

> N is the total number of genes in the annotating corpus.

In [358]:
N = len(goa[goa['GO ID'].isin(goa_goid_mf)]['DB Object ID'].unique())
N

18240

> The information content (IC) of a GO term t is defined by negative log-likelihood:
$$IC(t)=-log(p(t))$$
> where term probability P(t) of term t is determined from the annotations of the corpus (corpus-based) or from the structure of the DAG (structure-based). [...] Corpus-based methods evaluate the term probability as
$$p(t)= \frac{M}{N}$$

In [221]:
IC_t = {
    t: -np.log(m/N) for t, m in M.items()
}

In [297]:
IC_t['GO:0001010']

9.808847148382007

In [298]:
len(IC_t)

4431

> **Functional similarity measures between two genes**
> 
> Functional similarity (FS) between two genes is computed using the ICs of individual terms (term-based) or the semantic similarities between the pairs of terms (term pair-based) or among the set of terms (term set-based). Let $T_{g_1}$ and $T_{g_2}$ be the set of GO terms annotating genes $g_1$ and $g_2$, respectively. Term-based measures such as GIC<sup>1</sup> (Jaccard index), DIC<sup>39</sup> (Dice index), and UIC<sup>39</sup> (universal index) are defined using ICs of individual terms:
>
> GIC
$$FS({g}_{1},\,{g}_{2})=\frac{{\sum }_{t\in {T}_{{g}_{1}}\cap {T}_{{g}_{2}}}IC(t)}{{\sum }_{t\in {T}_{{g}_{1}}\cup {T}_{{g}_{2}}}IC(t)}$$


For each protein (instead of gene) find the set of relevant terms.

In [359]:
# Build the lookup {protein ID ("DB Object ID"): set of molecular-function GO IDs}.
# Rows are first restricted to GO IDs in `goa_goid_mf`; the pivot then collapses every
# protein's GO IDs into a set, which removes duplicate annotations of the same term.
# NOTE(review): the Qualifier column contains negated annotations ('NOT|enables', ...),
# and this filter does not exclude them, so a term the protein is annotated as NOT having
# still lands in its set. Confirm whether that is intended for the similarity score.
goa_by_protein = goa[goa['GO ID'].isin(goa_goid_mf)].pivot_table(
    index=["DB Object ID"],
    values=["GO ID"],
    aggfunc=lambda x:set(x)
).to_dict()['GO ID']

# Spot-check one protein.
goa_by_protein['A0A087WT57']

{'GO:0019901'}

In [300]:
len(goa_by_protein)

18193

## Sample Functional Similarity

In [365]:
# Sample proteins

protein_A = 'Q5TAX3'
protein_B = 'Q5TB30'

In [366]:
goa_by_protein[protein_A]

{'GO:0003723',
 'GO:0005515',
 'GO:0008270',
 'GO:0016779',
 'GO:0035198',
 'GO:0050265'}

In [367]:
gobasic['GO:0071714']

AdjacencyView({'GO:0022857': {'is_a': {}}})

In [368]:
gobasic['GO:0022857']

AdjacencyView({'GO:0005215': {'is_a': {}}})

In [369]:
goa_by_protein[protein_B]

{'GO:0005096', 'GO:0005515'}

In [370]:
# Intersection of the terms
goa_by_protein[protein_A].intersection(goa_by_protein[protein_B])

{'GO:0005515'}

In [427]:
ev.jaccard_sim_protein_go(protein_A, protein_B, goa_by_protein, IC_t)

0.011927170492379131

# Run on model cluster

In [312]:
prefix='model_outputs/no_cluster_size_limit/'
clusters = gcs.download_pkl(prefix+'B2_clusters.pkl')
all_protein_combos_per_cluster = gcs.download_parquet(prefix+'B2-HDBSCAN-SeqVec-all_protein_combos_per_cluster.parquet')

In [428]:
# importlib.reload(ev)
cluster_funsim = ev.funsim(all_protein_combos_per_cluster, goa=goa)

Total number of proteins in GO annotations: 18240


In [429]:
cluster_funsim.shape

(757, 4)

In [12]:
clusters.cluster_label.unique().shape

NameError: name 'clusters' is not defined

# Dictionary of GO terms per cluster

In [19]:
# Fetch the gzipped GAF file from cloud storage and parse it straight from memory.
gaf_bytes = gcs.download_blob("functional_sim/data/goa_human.gaf.gz")
goa = pd.read_csv(
    io.BytesIO(gaf_bytes),
    sep='\t',
    header=None,
    compression='gzip',
    skiprows=41,    # hard-coded. May be different for other gaf files.
)

# The file carries no header row, so label the 17 GAF fields by hand.
goa.columns = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "Reference",
    "Evidence Code",
    "With or From",
    "Aspect",
    "Name",
    "Synonym",
    "Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [20]:
goa

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,Reference,Evidence Code,With or From,Aspect,Name,Synonym,Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210612,UniProt,,
4,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609743,UniProtKB,P02647,APOA1,involved_in,GO:0033700,PMID:21873635,IBA,PANTHER:PTN002709027|UniProtKB:P02647|UniProtK...,P,Apolipoprotein A-I,APOA1,protein,taxon:9606,20170228,GO_Central,,
609744,UniProtKB,Q9UKT7,FBXL3,involved_in,GO:0031146,PMID:21873635,IBA,PANTHER:PTN001923412|UniProtKB:Q9UKT7|MGI:MGI:...,P,F-box/LRR-repeat protein 3,FBXL3|FBL3A|FBXL3A,protein,taxon:9606,20200808,GO_Central,,
609745,UniProtKB,Q9Y2I2,NTNG1,involved_in,GO:0070831,PMID:21873635,IBA,PANTHER:PTN000832288|ZFIN:ZDB-GENE-081030-4|FB...,P,Netrin-G1,NTNG1|KIAA0976|LMNT1|UNQ571/PRO1133,protein,taxon:9606,20190301,GO_Central,,
609746,UniProtKB,Q9BXG8,SPZ1,is_active_in,GO:0005634,PMID:21873635,IBA,PANTHER:PTN001141066|MGI:MGI:1930801,C,Spermatogenic leucine zipper protein 1,SPZ1|TSP1,protein,taxon:9606,20170428,GO_Central,,


In [24]:
import pickle

# Load the cached shortest-path table (keyed by source GO ID).
pkl_path = 'functional_sim/intermediary_data/shortest_from_root.pkl'
with open(pkl_path, 'rb') as pkl_handle:
    shortest_from_root = pickle.load(pkl_handle)

# Keep only the annotated GO IDs reachable from the molecular_function root ('GO:0003674').
goa_goid_mf = [
    go_id
    for go_id in set(goa["GO ID"])
    if go_id in shortest_from_root['GO:0003674']
]


In [150]:
len(goa_goid_mf)

4431

In [30]:
# Find all unique proteins
unique_proteins = all_protein_combos_per_cluster[["query_protein", "cluster"]].drop_duplicates()
unique_proteins.shape

(8925, 2)

In [151]:
goa['GO ID'][goa['GO ID'].isin(goa_goid_mf)].unique().shape

(4431,)

In [33]:
goa_by_protein = goa[goa['GO ID'].isin(goa_goid_mf)].pivot_table(
                            index=["DB Object ID"],
                            values=["GO ID"],
                            aggfunc=lambda x:set(x)
                        ).to_dict()['GO ID']
    

In [34]:
goa_by_protein

{'A0A024RBG1': {'GO:0000298',
  'GO:0003723',
  'GO:0008486',
  'GO:0034431',
  'GO:0034432',
  'GO:0046872',
  'GO:0050072',
  'GO:0052840',
  'GO:0052842'},
 'A0A075B6Q5': {'GO:0003823', 'GO:0034987'},
 'A0A075B6R2': {'GO:0003823', 'GO:0034987'},
 'A0A075B6T6': {'GO:0042605'},
 'A0A075B6W5': {'GO:0042605'},
 'A0A075B734': {'GO:0015204', 'GO:0015250', 'GO:0015254', 'GO:0015267'},
 'A0A075B759': {'GO:0003755', 'GO:0016018'},
 'A0A075B767': {'GO:0003755', 'GO:0016018'},
 'A0A075B7B6': {'GO:0003823', 'GO:0034987'},
 'A0A075B7B8': {'GO:0003823', 'GO:0034987'},
 'A0A075B7D0': {'GO:0003823', 'GO:0034987'},
 'A0A075B7D8': {'GO:0003823', 'GO:0034987'},
 'A0A075B7E8': {'GO:0003823', 'GO:0034987'},
 'A0A075B7F0': {'GO:0003823', 'GO:0034987'},
 'A0A075B7F1': {'GO:0003823', 'GO:0034987'},
 'A0A087WSY4': {'GO:0003823', 'GO:0034987'},
 'A0A087WT01': {'GO:0005515'},
 'A0A087WT57': {'GO:0019901'},
 'A0A087WTH5': {'GO:0005251', 'GO:0015459', 'GO:0044325', 'GO:1902282'},
 'A0A087WUU8': {'GO:0000977', '

In [100]:
def get_list_val(x):
    """Return the GO terms annotated to protein ``x`` as a NumPy array.

    Looks ``x`` up in the notebook-level ``goa_by_protein`` mapping
    (protein ID -> set of GO IDs).

    Parameters
    ----------
    x : hashable
        Protein identifier to look up.

    Returns
    -------
    numpy.ndarray
        The protein's GO IDs, or an empty array when the protein has no entry
        in ``goa_by_protein`` (or ``x`` cannot be used as a key at all).
    """
    try:
        go_terms = goa_by_protein[x]
    except (KeyError, TypeError):
        # Only a failed lookup means "no annotations". A bare `except:` would also
        # hide real problems, such as `goa_by_protein` not being defined yet, and
        # silently label every protein as unannotated.
        return np.array([])
    return np.array(list(go_terms))

In [101]:
unique_proteins["go"] = unique_proteins["query_protein"].apply(get_list_val)

In [102]:
unique_proteins.head()

Unnamed: 0,query_protein,cluster,go
1,O96009,0,"[GO:0004175, GO:0008233, GO:0005515, GO:0004190]"
10,P00797,0,"[GO:0005102, GO:0008233, GO:0005515, GO:000515..."
20,P07339,0,"[GO:0070001, GO:0008233, GO:0005515, GO:000419..."
30,P0DJD7,0,"[GO:0005515, GO:0004190]"
40,P0DJD8,0,[GO:0004190]


Check that missing proteins were handled properly. 

In [103]:
unique_proteins[unique_proteins["go"].map(len)==0]

Unnamed: 0,query_protein,cluster,go
1,A0A075B6S0,1,[]
24,A0A075B6V2,1,[]
48,A0A075B6Y3,1,[]
72,A0A075B6Y9,1,[]
96,A0A075B700,1,[]
...,...,...,...
9159,Q86VV8,755,[]
9417,Q86XA9,755,[]
10191,Q8N201,755,[]
10578,Q8NDA8,755,[]


Now I have unique proteins and the full list of GO terms that map to each protein. Let's look at cluster 2.

In [128]:
for stuff in unique_proteins[unique_proteins.cluster==2].go:
    print(stuff)

['GO:0030280' 'GO:0046914' 'GO:0005515' 'GO:0005509' 'GO:0005198']
['GO:0005198' 'GO:0046914' 'GO:0005509']
['GO:0046914' 'GO:0005509']
['GO:0046914' 'GO:0005509']
['GO:0005515' 'GO:0003723']
['GO:0048306' 'GO:0046914' 'GO:0005515' 'GO:0005509']


Using `np.hstack`, I can get the full list of GO terms across all the proteins in that cluster.

In [129]:
np.hstack(unique_proteins[unique_proteins.cluster==2].go)

array(['GO:0030280', 'GO:0046914', 'GO:0005515', 'GO:0005509',
       'GO:0005198', 'GO:0005198', 'GO:0046914', 'GO:0005509',
       'GO:0046914', 'GO:0005509', 'GO:0046914', 'GO:0005509',
       'GO:0005515', 'GO:0003723', 'GO:0048306', 'GO:0046914',
       'GO:0005515', 'GO:0005509'], dtype='<U10')

Then, I can use `np.unique()` and `zip` to turn the GO count into a dictionary.

In [143]:
go_ct = np.vstack(
        np.unique(np.hstack(unique_proteins[unique_proteins.cluster==2].go),
                  return_counts = True
                 ))

pairs = list(zip(go_ct[0],
           go_ct[1]))
go_ct_dict = {go: int(ct) for go, ct in pairs}
go_ct_dict

{'GO:0003723': 1,
 'GO:0005198': 2,
 'GO:0005509': 5,
 'GO:0005515': 3,
 'GO:0030280': 1,
 'GO:0046914': 5,
 'GO:0048306': 1}

In [144]:
def make_go_ct_dict(go_terms):
    """Count how often each GO term occurs across a collection of term arrays.

    Parameters
    ----------
    go_terms : iterable of array-like
        One array of GO IDs per protein (for example the ``go`` column of one
        cluster). Empty arrays are allowed and contribute nothing.

    Returns
    -------
    dict
        ``{GO ID: number of occurrences}`` with plain ``str`` keys and ``int``
        values, ordered by GO ID. Empty when there are no terms at all.
    """
    # Drop empty arrays up front: they add no terms, and an entirely empty input
    # would otherwise make np.hstack raise.
    non_empty = [arr for arr in map(np.atleast_1d, go_terms) if arr.size]
    if not non_empty:
        return {}

    # Keep the terms and their counts as two separate arrays. Stacking them into one
    # array would coerce the integer counts to strings that then need parsing back.
    unique_terms, counts = np.unique(np.hstack(non_empty), return_counts=True)
    return dict(zip(unique_terms.tolist(), counts.tolist()))

In [145]:
test = unique_proteins.pivot_table(
    index='cluster',
    values='go',
    aggfunc=make_go_ct_dict
)

In [146]:
test.iloc[0]["go"]

{'GO:0001540': 1,
 'GO:0004175': 2,
 'GO:0004190': 10,
 'GO:0004197': 1,
 'GO:0005102': 1,
 'GO:0005159': 1,
 'GO:0005515': 6,
 'GO:0008233': 5,
 'GO:0008798': 1,
 'GO:0019899': 1,
 'GO:0042802': 1,
 'GO:0070001': 1}

But I also want the go term names. 

From http://geneontology.org/docs/faq/

> How do I get the term names for my list of GO ids?
> 
> You can use the YeastMine Analyze tool available at SGD to retrieve the GO term names for each ID.
> 
>     Go to the Analyze tool on YeastMine
    In the Select Type pull down, select GO Term
    Enter your GO ids or upload a list in the full format (GO:0016020, GO:0016301…)
    Click on Create List. The tool offers several options to download the list.



In [156]:
len(goa_goid_mf)

4431

In [155]:
pd.DataFrame(goa_goid_mf).to_csv('functional_sim/goa_goid_mf.csv', index=False, header=False)

I uploaded `goa_goid_mf.csv` onto YeastMine and downloaded `yeastmine_results_goa_goid_mf.tsv`.

In [172]:
go_term_names = pd.read_csv('functional_sim/data/yeastmine_results_goa_goid_mf.tsv', sep='\t')

# The YeastMine export labels every column "GO Term > <field>". Strip that prefix by name
# rather than by a fixed character count, so a column without it is left intact.
yeastmine_prefix = "GO Term > "
go_term_names.columns = [
    col[len(yeastmine_prefix):] if col.startswith(yeastmine_prefix) else col
    for col in go_term_names.columns
]
go_term_names.shape

(4431, 4)

In [173]:
go_term_names.head()

Unnamed: 0,Identifier,Name,Namespace,Description
0,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...
1,GO:0000010,trans-hexaprenyltranstransferase activity,molecular_function,Catalysis of the reaction: all-trans-hexapreny...
2,GO:0000014,single-stranded DNA endodeoxyribonuclease acti...,molecular_function,Catalysis of the hydrolysis of ester linkages ...
3,GO:0000016,lactase activity,molecular_function,Catalysis of the reaction: lactose + H2O = D-g...
4,GO:0000026,"alpha-1,2-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...


In [181]:
go_term_names_dict = go_term_names.set_index("Identifier").to_dict()

In [183]:
go_term_names_dict["Name"]["GO:0000009"]

'alpha-1,6-mannosyltransferase activity'

In [206]:
def map_go_desc(go_ct_dict):
    """Attach the name and description of each GO term to its protein count.

    Names and descriptions come from the notebook-level ``go_term_names_dict``
    (``{"Name": {GO ID: name}, "Description": {GO ID: description}}``).

    Parameters
    ----------
    go_ct_dict : dict
        ``{GO ID: number of proteins}``, as produced by ``make_go_ct_dict``.

    Returns
    -------
    dict
        ``{GO ID: {"Num. Protein", "Name", "Description"}}`` ordered from the
        most to the least frequent term; ties keep their input order.

    Raises
    ------
    KeyError
        If a GO ID has no entry in ``go_term_names_dict``.
    """
    # sorted() is stable, so terms with equal counts keep the order of `go_ct_dict`.
    by_count_desc = sorted(go_ct_dict.items(), key=lambda item: item[1], reverse=True)
    return {
        go_identifier: {
            "Num. Protein": ct_protein,
            "Name": go_term_names_dict["Name"][go_identifier],
            "Description": go_term_names_dict["Description"][go_identifier],
        }
        for go_identifier, ct_protein in by_count_desc
    }

In [207]:
new_go_ct_dict = map_go_desc(go_ct_dict)

In [208]:
new_go_ct_dict

{'GO:0005509': {'Num. Protein': 5,
  'Name': 'calcium ion binding',
  'Description': 'Binding to a calcium ion (Ca2+).'},
 'GO:0046914': {'Num. Protein': 5,
  'Name': 'transition metal ion binding',
  'Description': 'Binding to a transition metal ions; a transition metal is an element whose atom has an incomplete d-subshell of extranuclear electrons, or which gives rise to a cation or cations with an incomplete d-subshell. Transition metals often have more than one valency state. Biologically relevant transition metals include vanadium, manganese, iron, copper, cobalt, nickel, molybdenum and silver.'},
 'GO:0005515': {'Num. Protein': 3,
  'Name': 'protein binding',
  'Description': 'Binding to a protein.'},
 'GO:0005198': {'Num. Protein': 2,
  'Name': 'structural molecule activity',
  'Description': 'The action of a molecule that contributes to the structural integrity of a complex or its assembly within or outside a cell.'},
 'GO:0003723': {'Num. Protein': 1,
  'Name': 'RNA binding',


In [209]:
len(new_go_ct_dict)

7

In [215]:
pd.DataFrame.from_dict(new_go_ct_dict, orient='index')

Unnamed: 0,Num. Protein,Name,Description
GO:0005509,5,calcium ion binding,Binding to a calcium ion (Ca2+).
GO:0046914,5,transition metal ion binding,Binding to a transition metal ions; a transiti...
GO:0005515,3,protein binding,Binding to a protein.
GO:0005198,2,structural molecule activity,The action of a molecule that contributes to t...
GO:0003723,1,RNA binding,Binding to an RNA molecule or a portion thereof.
GO:0030280,1,structural constituent of skin epidermis,The action of a molecule that contributes to t...
GO:0048306,1,calcium-dependent protein binding,Binding to a protein or protein complex in the...


In [192]:
for k, v in new_go_ct_dict.items():
    print(k, v["Num. Protein"])

GO:0003723 1
GO:0005198 2
GO:0005509 5
GO:0005515 3
GO:0030280 1
GO:0046914 5
GO:0048306 1


In [211]:
test["go_summary"] = test["go"].map(map_go_desc)
test.head()

Unnamed: 0_level_0,go,go_summary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"{'GO:0001540': 1, 'GO:0004175': 2, 'GO:0004190...","{'GO:0004190': {'Num. Protein': 10, 'Name': 'a..."
1,{},{}
2,"{'GO:0003723': 1, 'GO:0005198': 2, 'GO:0005509...","{'GO:0005509': {'Num. Protein': 5, 'Name': 'ca..."
3,"{'GO:0003674': 1, 'GO:0004252': 5, 'GO:0005515...","{'GO:0004252': {'Num. Protein': 5, 'Name': 'se..."
4,"{'GO:0003674': 3, 'GO:0005515': 4}","{'GO:0005515': {'Num. Protein': 4, 'Name': 'pr..."


In [220]:
# Check that the shortest-path pickle can be fetched through the gcs helper.
# NOTE(review): this rebinds `test`, which the cells above use for the per-cluster GO
# pivot table; after this cell `test` holds the downloaded object instead. Consider a
# distinct name so re-running the earlier cells out of order does not give confusing results.
test = gcs.download_pkl('functional_sim/shortest_from_root.pkl')

In [221]:
test

{'GO:0000001': {'GO:0000001': 0},
 'GO:0000002': {'GO:0000002': 0,
  'GO:0032042': 1,
  'GO:0110166': 2,
  'GO:1905951': 2,
  'GO:0032043': 2,
  'GO:0043504': 2},
 'GO:0000003': {'GO:0000003': 0,
  'GO:0075325': 1,
  'GO:0019953': 1,
  'GO:0022414': 1,
  'GO:0032505': 1,
  'GO:0019954': 1,
  'GO:0032504': 1,
  'GO:0051321': 2,
  'GO:0000747': 2,
  'GO:0060781': 2,
  'GO:0000743': 2,
  'GO:0061948': 2,
  'GO:0060469': 2,
  'GO:0060468': 2,
  'GO:0060478': 2,
  'GO:0060738': 2,
  'GO:0018985': 2,
  'GO:0022602': 2,
  'GO:0000742': 2,
  'GO:0035471': 2,
  'GO:0060476': 2,
  'GO:1903046': 2,
  'GO:0000909': 2,
  'GO:0060011': 2,
  'GO:0048573': 2,
  'GO:1905261': 2,
  'GO:0032219': 2,
  'GO:0035039': 2,
  'GO:0007343': 2,
  'GO:0048497': 2,
  'GO:0060474': 2,
  'GO:0007340': 2,
  'GO:0043093': 2,
  'GO:0003006': 2,
  'GO:0007618': 2,
  'GO:0060722': 2,
  'GO:0000905': 2,
  'GO:0007117': 2,
  'GO:0035040': 2,
  'GO:0000752': 2,
  'GO:0046595': 2,
  'GO:0009566': 2,
  'GO:0071515': 2,
  'GO:

In [226]:
pd.read_csv( io.StringIO(gcs.download_text('functional_sim/data/yeastmine_results_goa_goid_mf.tsv')),
             sep='\t'
            )

Unnamed: 0,GO Term > Identifier,GO Term > Name,GO Term > Namespace,GO Term > Description
0,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...
1,GO:0000010,trans-hexaprenyltranstransferase activity,molecular_function,Catalysis of the reaction: all-trans-hexapreny...
2,GO:0000014,single-stranded DNA endodeoxyribonuclease acti...,molecular_function,Catalysis of the hydrolysis of ester linkages ...
3,GO:0000016,lactase activity,molecular_function,Catalysis of the reaction: lactose + H2O = D-g...
4,GO:0000026,"alpha-1,2-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...
...,...,...,...,...
4426,GO:1990955,G-rich single-stranded DNA binding,molecular_function,"Binding to G-rich, single-stranded DNA."
4427,GO:1990984,tRNA demethylase activity,molecular_function,Catalysis of the removal of a methyl group fro...
4428,GO:2001065,mannan binding,molecular_function,Binding to mannan.
4429,GO:2001069,glycogen binding,molecular_function,Binding to glycogen.


In [30]:
from datetime import datetime

In [65]:
class funsim_evaluator():
    """Score how functionally coherent each protein cluster is.

    Functional similarity ("funsim") between two proteins is an
    information-content-weighted Jaccard index over their GO annotation terms
    (see ``jaccard_sim_protein_go``).  Only GO terms found under
    ``shortest_from_root['GO:0003674']`` are considered.

    Typical use::

        evaluator = funsim_evaluator(all_protein_combos_per_cluster)
        evaluator.funsim()
        evaluator.cluster_funsim        # one row per cluster
        evaluator.protein_pair_funsim   # one row per unordered protein pair

    Attributes set by ``__init__``:
        all_protein_combos_per_cluster: private copy of the input pairs, with
            helper columns ``protein_A``/``protein_B`` added.
        goa: GO annotation table (GAF columns).
        go_term_names / go_term_names_dict: GO id -> name/description lookup.
        IC_t: dict GO id -> information content, ``-log(m / N)``.
        goa_by_protein: dict protein id -> set of GO ids.
        protein_pair_funsim: de-duplicated (protein_A, protein_B, cluster) rows.
    """

    # Prefix carried by the headers of the GO term-name table
    # (e.g. "GO Term > Identifier"); stripped so columns become "Identifier", ...
    _GO_TERM_COLUMN_PREFIX = "GO Term > "

    def __init__(self, all_protein_combos_per_cluster, goa=None, goa_skiprows=41):
        """Load the lookup tables and prepare the de-duplicated protein pairs.

        Args:
            all_protein_combos_per_cluster: DataFrame with at least the columns
                ``query_protein``, ``target_protein`` and ``cluster``.
            goa: optional GO annotation DataFrame with ``"DB Object ID"`` and
                ``"GO ID"`` columns.  When ``None`` the human GAF file is
                downloaded from GCS.
            goa_skiprows: number of header lines to skip when the GAF file is
                downloaded (41 for the bundled file; other GAF files may
                differ).  Ignored when ``goa`` is given.
        """
        # Work on a private copy: helper columns are added below and must not
        # leak into the caller's DataFrame.
        self.all_protein_combos_per_cluster = all_protein_combos_per_cluster.copy()
        self.goa = goa

        self.go_term_names = pd.read_csv(
            io.StringIO(gcs.download_text('functional_sim/data/yeastmine_results_goa_goid_mf.tsv')),
            sep='\t'
        )
        prefix = self._GO_TERM_COLUMN_PREFIX
        self.go_term_names.columns = [
            col[len(prefix):] if col.startswith(prefix) else col
            for col in self.go_term_names.columns
        ]
        # Shape: {"Name": {go_id: name}, "Description": {go_id: description}, ...}
        self.go_term_names_dict = self.go_term_names.set_index("Identifier").to_dict()

        # If goa df is not provided, download from GCS
        if self.goa is None:
            self._log("No GO annotations provided. Downloading from google cloud.")
            self.goa = pd.read_csv(
                io.BytesIO(gcs.download_blob("functional_sim/data/goa_human.gaf.gz")),
                compression='gzip',
                header=None,
                skiprows=goa_skiprows,
                sep='\t'
            )
            self.goa.columns = ["DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID", "Reference",
                                "Evidence Code", "With or From", "Aspect", "Name", "Synonym", "Type",
                                "Taxon", "Date", "Assigned By", "Annotation Extension", "Gene Product Form ID"]

        ##################
        # Calculate IC (information content) of each term

        # Keep only the GO ids listed under the 'GO:0003674' entry of the lookup.
        self.shortest_from_root = gcs.download_pkl('functional_sim/shortest_from_root.pkl')
        mf_terms = self.shortest_from_root['GO:0003674']
        self.goa_goid_mf = [goid for goid in set(self.goa["GO ID"]) if goid in mf_terms]

        # Filter the annotation table once and reuse it for M, N and the lookup.
        goa_mf = self.goa[self.goa['GO ID'].isin(self.goa_goid_mf)]

        # M[t]: number of distinct proteins annotated with term t.
        self.M = goa_mf.pivot_table(
            index='GO ID',
            values='DB Object ID',
            aggfunc=pd.Series.nunique
        ).to_dict()['DB Object ID']

        # N: number of distinct proteins with at least one retained annotation.
        self.N = len(goa_mf['DB Object ID'].unique())
        self._log("Total number of proteins in GO annotations:", self.N)

        self.IC_t = {t: -np.log(m / self.N) for t, m in self.M.items()}
        self._log("IC_t created")

        ##################
        # Lookup dictionary of proteins and their GO terms
        self.goa_by_protein = goa_mf.pivot_table(
            index=["DB Object ID"],
            values=["GO ID"],
            aggfunc=lambda x: set(x)
        ).to_dict()['GO ID']
        self._log("Dictionary of proteins and their GO terms lookup created")

        ##################
        # Eliminate duplicates in the pairs of proteins from cluster output,
        # since the jaccard pairwise metric is symmetrical: order each pair as
        # (min, max) so (A, B) and (B, A) collapse to the same row.
        pair_columns = self.all_protein_combos_per_cluster[['query_protein', 'target_protein']]
        self.all_protein_combos_per_cluster['protein_A'] = pair_columns.min(axis=1)
        self.all_protein_combos_per_cluster['protein_B'] = pair_columns.max(axis=1)

        self.protein_pair_funsim = self.all_protein_combos_per_cluster[
            ['protein_A', 'protein_B', 'cluster']].drop_duplicates()

    @staticmethod
    def _log(*parts):
        """Print a timestamped progress message."""
        print(datetime.now().strftime("%Y-%b-%d %H:%M:%S"), *parts)

    def funsim(self):
        """
        Find functional similarities for all protein pairs in each cluster.

        Reads ``self.protein_pair_funsim``, ``self.goa_by_protein`` and
        ``self.IC_t``.  Results are stored on the instance (nothing is returned):

            - ``self.protein_pair_funsim`` gains a float ``funsim`` column
              (NaN where the similarity is undefined).
            - ``self.cluster_funsim`` is created, indexed by cluster, with
              ``num_pairs``, ``num_pairs_with_funsim``, ``funsim`` (mean),
              ``perc_pairs_w_funsim``, ``cluster``, ``go`` and ``go_summary``.
        """

        ##################
        # Find Jaccard sim
        pairs = self.protein_pair_funsim
        scores = [
            self.jaccard_sim_protein_go(protein_a, protein_b)
            for protein_a, protein_b in zip(pairs['protein_A'], pairs['protein_B'])
        ]
        # Force a float column so undefined scores (None) become NaN even when
        # every pair is undefined; otherwise the mean below would see objects.
        self.protein_pair_funsim['funsim'] = pd.Series(scores, index=pairs.index, dtype=float)
        self._log("Funsim calculated.")

        # Summarise by cluster.  Explicit size/count/mean keeps all three
        # columns even when every score is NaN.
        by_cluster = self.protein_pair_funsim.groupby("cluster")["funsim"]
        self.cluster_funsim = pd.DataFrame({
            "num_pairs": by_cluster.size(),
            "num_pairs_with_funsim": by_cluster.count(),
            "funsim": by_cluster.mean(),
        })
        self.cluster_funsim["perc_pairs_w_funsim"] = (
            self.cluster_funsim.num_pairs_with_funsim / self.cluster_funsim.num_pairs
        )
        self._log("Funsim summary by cluster done.")

        #################
        # Identify top common GO terms per cluster
        cluster_common_go = self.common_go_in_cluster()
        self._log("Common GO term sumary per cluster processed.")

        # common_go_in_cluster() returns `cluster` as a regular column with a
        # positional index, so re-index by cluster id before the index join.
        # Joining on the positional index only lines up when cluster ids are
        # exactly 0..n-1.  drop=False keeps the `cluster` column in the result.
        self.cluster_funsim = self.cluster_funsim.merge(
            cluster_common_go.set_index("cluster", drop=False),
            left_index=True,
            right_index=True
        )
        self._log("Merged cluster-level funsim score with GO summary.")

    def jaccard_sim_protein_go(self, protein_A, protein_B):
        """Calculate the GIC or the Jaccard index of terms between two proteins.
        https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S5-S4

        The score is ``sum(IC of shared terms) / sum(IC of all terms of either protein)``.

        - self.goa_by_protein: Dictionary where key is protein ID and value is the set of GO annotation terms for that protein
        - self.IC_t: Dictionary where key is GO term and value is its information content (IC)

        Returns:
            The similarity, or ``None`` when either protein has no annotations
            or when the total IC of the union is zero (score undefined).
        """
        terms_a = self.goa_by_protein.get(protein_A)
        terms_b = self.goa_by_protein.get(protein_B)
        if terms_a is None or terms_b is None:
            return None

        go_intersection = terms_a.intersection(terms_b)
        go_union = terms_a.union(terms_b)

        numerator = 0
        for goid in go_intersection:
            numerator += self.IC_t[goid]

        # The union's IC is the shared IC plus the IC of the non-shared terms.
        denominator = numerator
        for goid in go_union - go_intersection:
            denominator += self.IC_t[goid]

        if denominator == 0:
            # Every term involved carries zero information; avoid 0/0.
            return None

        return numerator / denominator

    def common_go_in_cluster(self):
        """
        For each cluster, count how many of its proteins carry each GO term and
        attach the GO name/description to those counts.

        Proteins are taken from the ``query_protein`` column.
        NOTE(review): proteins that only ever appear as ``target_protein`` are
        not counted - confirm the input always lists both orderings.

        Returns:
            DataFrame with columns ``cluster``, ``go`` (dict GO id -> count)
            and ``go_summary`` (see ``map_go_desc``), one row per cluster.
        """
        self.unique_proteins = self.all_protein_combos_per_cluster[["query_protein", "cluster"]].drop_duplicates()

        self._log("Get NP Arr of GO terms for each protein")
        self.unique_proteins["go"] = self.unique_proteins["query_protein"].apply(self.get_nparr_of_go_terms)

        self._log("Turn GO terms into dict")
        go_counts_by_cluster = {
            cluster: self.make_go_ct_dict(go_arrays)
            for cluster, go_arrays in self.unique_proteins.groupby("cluster")["go"]
        }
        cluster_info = pd.Series(
            list(go_counts_by_cluster.values()),
            index=pd.Index(list(go_counts_by_cluster.keys()), name="cluster"),
            name="go",
            dtype=object,
        ).to_frame()

        self._log("Map GO desc...")
        cluster_info["go_summary"] = cluster_info["go"].map(self.map_go_desc)
        self._log("Mapping GO desc done.")

        return cluster_info.reset_index()

    def get_nparr_of_go_terms(self, protein_id):
        '''
        Looks up protein_id from self.goa_by_protein and returns the GO terms as a numpy array.
        Returns an empty array when the protein has no entry.
        '''
        return np.array(list(self.goa_by_protein.get(protein_id, ())))

    def make_go_ct_dict(self, go_terms):
        '''
        Given an iterable of GO-term arrays (one per protein; plain GO id
        strings are accepted too), pool them and return a dictionary where the
        key is the GO id and the value is how many times it occurs.
        Keys are inserted in sorted GO id order.
        '''
        counts = {}
        for terms in go_terms:
            # atleast_1d lets a bare string behave like a one-element array.
            for go_id in np.atleast_1d(terms).tolist():
                counts[go_id] = counts.get(go_id, 0) + 1

        return {go_id: counts[go_id] for go_id in sorted(counts)}

    def map_go_desc(self, go_ct_dict):
        '''
        Given a dictionary containing just go_id and count of proteins,
        pull in GO name and description as well. Return a dictionary where
        the key is GO ID and the value is a dictionary containing
        num. proteins, GO name, and GO desc, ordered by num. proteins
        (highest first). Name/description are None for GO ids that are
        missing from the name table.
        '''
        names = self.go_term_names_dict["Name"]
        descriptions = self.go_term_names_dict["Description"]

        new_go_ct_dict = {
            go_identifier: {
                "Num. Protein": ct_protein,
                "Name": names.get(go_identifier),
                "Description": descriptions.get(go_identifier),
            }
            for go_identifier, ct_protein in go_ct_dict.items()
        }

        return {k: v for k, v in
            sorted(new_go_ct_dict.items(), key=lambda item: item[1]["Num. Protein"], reverse=True)}


In [66]:
# Build the evaluator, then run the scoring step. `cluster_funsim` and the
# `funsim` column of `protein_pair_funsim` only exist after funsim() has run,
# so later cells that read them depend on this call.
testclass = funsim_evaluator(all_protein_combos_per_cluster)
testclass.funsim()

2021-Nov-07 15:34:49 No GO annotations provided. Downloading from google cloud.


  testclass = funsim_evaluator(all_protein_combos_per_cluster)


2021-Nov-07 15:34:53 Total number of proteins in GO annotations: 18240
2021-Nov-07 15:34:53 IC_t created
2021-Nov-07 15:34:53 Dictionary of proteins and their GO terms lookup created


2021-Nov-07 15:34:56 Funsim calculated.
2021-Nov-07 15:34:56 Funsim summary by cluster done.
2021-Nov-07 15:34:56 Get NP Arr of GO terms for each protein
2021-Nov-07 15:34:56 Turn GO terms into dict
2021-Nov-07 15:34:56 Map GO desc...
2021-Nov-07 15:34:56 Mapping GO desc done.
2021-Nov-07 15:34:56 Common GO term sumary per cluster processed.
2021-Nov-07 15:34:56 Merged cluster-level funsim score with GO summary.


(         num_pairs  num_pairs_with_funsim    funsim  perc_pairs_w_funsim  \
 cluster                                                                    
 0               45                     45  0.477738             1.000000   
 1              276                      0       NaN             0.000000   
 2               15                     15  0.408240             1.000000   
 3               10                     10  0.724022             1.000000   
 4               15                     15  0.407159             1.000000   
 ...            ...                    ...       ...                  ...   
 752             28                     28  0.127296             1.000000   
 753            276                    153  0.167738             0.554348   
 754             45                     45  0.397762             1.000000   
 755           8256                   6555  0.076184             0.793968   
 756             15                     15  0.242177             1.000000   

In [68]:
# Pull the cluster-level and pair-level result tables off the evaluator.
cluster_funsim = testclass.cluster_funsim
protein_pair_funsim = testclass.protein_pair_funsim

In [69]:
# Display the per-cluster summary table taken from testclass.cluster_funsim.
cluster_funsim

Unnamed: 0_level_0,num_pairs,num_pairs_with_funsim,funsim,perc_pairs_w_funsim,cluster,go,go_summary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,45,45,0.477738,1.000000,0,"{'GO:0001540': 1, 'GO:0004175': 2, 'GO:0004190...","{'GO:0004190': {'Num. Protein': 10, 'Name': 'a..."
1,276,0,,0.000000,1,{},{}
2,15,15,0.408240,1.000000,2,"{'GO:0003723': 1, 'GO:0005198': 2, 'GO:0005509...","{'GO:0005509': {'Num. Protein': 5, 'Name': 'ca..."
3,10,10,0.724022,1.000000,3,"{'GO:0003674': 1, 'GO:0004252': 5, 'GO:0005515...","{'GO:0004252': {'Num. Protein': 5, 'Name': 'se..."
4,15,15,0.407159,1.000000,4,"{'GO:0003674': 3, 'GO:0005515': 4}","{'GO:0005515': {'Num. Protein': 4, 'Name': 'pr..."
...,...,...,...,...,...,...,...
752,28,28,0.127296,1.000000,752,"{'GO:0000822': 1, 'GO:0003682': 2, 'GO:0005219...","{'GO:0005515': {'Num. Protein': 6, 'Name': 'pr..."
753,276,153,0.167738,0.554348,753,"{'GO:0000049': 1, 'GO:0001784': 1, 'GO:0003674...","{'GO:0005515': {'Num. Protein': 14, 'Name': 'p..."
754,45,45,0.397762,1.000000,754,"{'GO:0003674': 1, 'GO:0005085': 10, 'GO:000509...","{'GO:0005085': {'Num. Protein': 10, 'Name': 'g..."
755,8256,6555,0.076184,0.793968,755,"{'GO:0000049': 2, 'GO:0000339': 1, 'GO:0000340...","{'GO:0005515': {'Num. Protein': 98, 'Name': 'p..."


In [70]:
# Display the per-pair table taken from testclass.protein_pair_funsim.
protein_pair_funsim

Unnamed: 0,protein_A,protein_B,cluster,funsim
1,O96009,P00797,0,0.428994
2,O96009,P07339,0,0.383279
3,O96009,P0DJD7,0,0.396631
4,O96009,P0DJD8,0,0.374980
5,O96009,P0DJD9,0,0.374980
...,...,...,...,...
16,Q8IWV7,Q9BVR0,756,0.000000
17,Q8IWV7,Q9Y4D8,756,0.019608
22,Q8IWV8,Q9BVR0,756,0.000000
23,Q8IWV8,Q9Y4D8,756,0.019608
