# Creating a Molecular Function Reference for OmicsIntegrator2

In [1]:
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

import mygene

In [2]:
# Load the gene -> GO molecular-function annotation table (gene symbol, GO ID, GO term, evidence code).
molecular_function_csv = '../../GONN/GO/molecular_function.csv'
genes = pd.read_csv(molecular_function_csv)
genes.head()

Unnamed: 0,GeneSymbol,GO_ID,GO_term,Evidence
0,A1BG,GO:0003674,molecular_function,ND
1,A2M,GO:0002020,protease binding,IPI
2,A2M,GO:0004867,serine-type endopeptidase inhibitor activity,IDA
3,A2M,GO:0005096,GTPase activator activity,TAS
4,A2M,GO:0005102,signaling receptor binding,IMP


## I. Evidence Codes

Copied from http://geneontology.org/page/guide-go-evidence-codes

### Experimental Evidence codes
Use of an experimental evidence code in a GO annotation indicates that the cited paper displayed results from a physical characterization of a gene or gene product that has supported the association of a GO term. The Experimental Evidence codes are:

- Inferred from Experiment (EXP)
- Inferred from Direct Assay (IDA)
- Inferred from Physical Interaction (IPI)
- Inferred from Mutant Phenotype (IMP)
- Inferred from Genetic Interaction (IGI)
- Inferred from Expression Pattern (IEP)

### High Throughput (HTP) evidence codes
High throughput (HTP) evidence codes may be used to make annotations based upon high throughput methodologies. Use of HTP evidence codes should be carefully considered and follow the GOC's guidelines for their use. The High Throughput Evidence Codes are:

- Inferred from High Throughput Experiment (HTP)
- Inferred from High Throughput Direct Assay (HDA)
- Inferred from High Throughput Mutant Phenotype (HMP)
- Inferred from High Throughput Genetic Interaction (HGI)
- Inferred from High Throughput Expression Pattern (HEP)

### Computational Analysis evidence codes
Use of the computational analysis evidence codes indicates that the annotation is based on an in silico analysis of the gene sequence and/or other data as described in the cited reference. The evidence codes in this category also indicate a varying degree of curatorial input. The Computational Analysis evidence codes are:

- Inferred from Sequence or structural Similarity (ISS)
- Inferred from Sequence Orthology (ISO)
- Inferred from Sequence Alignment (ISA)
- Inferred from Sequence Model (ISM)
- Inferred from Genomic Context (IGC)
- Inferred from Biological aspect of Ancestor (IBA)
- Inferred from Biological aspect of Descendant (IBD)
- Inferred from Key Residues (IKR)
- Inferred from Rapid Divergence (IRD)
- Inferred from Reviewed Computational Analysis (RCA)

### Author statement evidence codes
Author statement codes indicate that the annotation was made on the basis of a statement made by the author(s) in the reference cited. The Author Statement evidence codes are:

- Traceable Author Statement (TAS)
- Non-traceable Author Statement (NAS)

### Curator statement evidence codes
Use of the curatorial statement evidence codes indicates an annotation made on the basis of a curatorial judgement that does not fit into one of the other evidence code classifications. The Curatorial Statement codes:

- Inferred by Curator (IC)
- No biological Data available (ND)

### Electronic Annotation evidence code
All of the above evidence codes are assigned by curators. However, GO also uses one evidence code that is assigned by automated methods, without curatorial judgement. The Automatically-Assigned evidence code is:

- Inferred from Electronic Annotation (IEA)

In [3]:
# Evidence codes grouped into three trust tiers:
#   solid   - experimental codes plus author statements
#   sketchy - high-throughput codes plus "inferred by curator"
#   bad     - computational-analysis codes, electronic annotation, and "no data"
solid_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'NAS']
sketchy_codes = ['HTP', 'HDA', 'HMP', 'HGI', 'HEP', 'IC']
bad_codes = ['ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD', 'RCA', 'IEA', 'ND']

# Number of annotation rows that fall into each tier.
code_tiers = {'solid': solid_codes, 'sketchy': sketchy_codes, 'bad': bad_codes}
{tier: int(genes['Evidence'].isin(codes).sum()) for tier, codes in code_tiers.items()}

{'bad': 36554, 'sketchy': 1744, 'solid': 39181}

#### We'll try with all the codes first, and then try with just the solid codes if we're dissatisfied. 

## II. Find good ontology depth

In [4]:
# Load the GO molecular-function ontology graph (a networkx DiGraph) from disk.
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; the standard
# pickle module reads the same file on every NetworkX version.
# NOTE(security): unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open('../../GONN/GO/GO_molecular_function.pickle', 'rb') as graph_file:
    g = pickle.load(graph_file)
g

<networkx.classes.digraph.DiGraph at 0x10300e7f0>

In [5]:
# One row per GO term, one column per node attribute stored on the graph.
node_attributes = dict(g.nodes(data=True))
df = pd.DataFrame.from_dict(node_attributes).T
df.head()

Unnamed: 0,depth,name,namespace
GO:0000006,7,high-affinity zinc transmembrane transporter a...,molecular_function
GO:0000007,7,low-affinity zinc ion transmembrane transporte...,molecular_function
GO:0000009,6,"alpha-1,6-mannosyltransferase activity",molecular_function
GO:0000010,4,trans-hexaprenyltranstransferase activity,molecular_function
GO:0000014,5,single-stranded DNA endodeoxyribonuclease acti...,molecular_function


In [6]:
# Terms at depth 0 of the ontology.
df.loc[df['depth'] == 0]

Unnamed: 0,depth,name,namespace
GO:0003674,0,molecular_function,molecular_function


In [7]:
# Terms at depth 1 of the ontology.
df.loc[df['depth'] == 1]

Unnamed: 0,depth,name,namespace
GO:0003824,1,catalytic activity,molecular_function
GO:0005198,1,structural molecule activity,molecular_function
GO:0005215,1,transporter activity,molecular_function
GO:0005488,1,binding,molecular_function
GO:0016209,1,antioxidant activity,molecular_function
GO:0031386,1,protein tag,molecular_function
GO:0038024,1,cargo receptor activity,molecular_function
GO:0045182,1,translation regulator activity,molecular_function
GO:0045735,1,nutrient reservoir activity,molecular_function
GO:0060089,1,molecular transducer activity,molecular_function


In [8]:
# Terms at depth 2 of the ontology.
df.loc[df['depth'] == 2]

Unnamed: 0,depth,name,namespace
GO:0000035,2,acyl binding,molecular_function
GO:0000036,2,acyl carrier activity,molecular_function
GO:0000156,2,phosphorelay response regulator activity,molecular_function
GO:0000988,2,"transcription factor activity, protein binding",molecular_function
GO:0001070,2,RNA-binding transcription regulator activity,molecular_function
GO:0001072,2,"transcription antitermination factor activity,...",molecular_function
GO:0001073,2,"transcription antitermination factor activity,...",molecular_function
GO:0001618,2,virus receptor activity,molecular_function
GO:0001871,2,pattern binding,molecular_function
GO:0003682,2,chromatin binding,molecular_function


#### Depth 1 seems good.

## III. Build a mapping from genes to terms via subterms

#### We need to find a list of terms, and for each term, all subterms. Then we can map all genes to the list of terms

In [9]:
# GO IDs of every depth-1 term; these become the function categories.
depth1_rows = df.loc[df['depth'] == 1]
level1_terms = depth1_rows.index.tolist()

In [10]:
# For each depth-1 term, the sorted, de-duplicated list of every term reachable
# from it by following successor edges in the graph (the term itself is excluded).
terms_and_subterms = {
    term: sorted(set().union(*nx.dfs_successors(g, term).values()))
    for term in level1_terms
}

In [11]:
# Every depth-1 term together with its subterms, concatenated across all depth-1 terms.
terms = [
    item
    for term, subterms in terms_and_subterms.items()
    for item in subterms + [term]
]
# (total entries, distinct terms covered, number of terms in the ontology);
# total > distinct means some terms are reachable from more than one depth-1 term.
len(terms), len(set(terms)), len(df)

(11470, 11130, 11131)

In [12]:
# For each depth-1 term, the [gene symbol, evidence code] pairs of every annotation
# made to that term or to any of its subterms.
terms_and_genes = {
    term: genes.loc[
        genes['GO_ID'].isin(subterms + [term]),
        ['GeneSymbol', 'Evidence'],
    ].values.tolist()
    for term, subterms in terms_and_subterms.items()
}

In [13]:
# Flatten the per-term annotation lists into (gene, depth-1 term, evidence code) triples.
genes_and_terms = [
    (symbol, term, code)
    for term, annotations in terms_and_genes.items()
    for symbol, code in annotations
]

In [14]:
# Collect, for every (gene, depth-1 term) pair, the list of evidence codes supporting it.
annotation_table = pd.DataFrame(genes_and_terms, columns=['gene', 'GO_ID', 'Evidence'])
evidence = (
    annotation_table
    .groupby(['gene', 'GO_ID'])['Evidence']
    .apply(list)
    .to_frame()
)
evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1CF,GO:0005488,"[TAS, IDA, IDA, IPI]"
A2M,GO:0005488,"[IPI, IMP, IPI, IDA, IPI, IPI, IDA, IDA, IPI]"
A2M,GO:0098772,"[IDA, TAS]"
A2ML1,GO:0098772,"[IEA, IDA]"
A3GALT2,GO:0003824,"[IDA, IBA, IEA]"


#### A gene can map to more than one depth-1 term, so we score the evidence for each (gene, term) pair in order to pick the best-supported term per gene

In [15]:
# Numeric weight of each evidence code: 3 for solid, 2 for sketchy, 1 for bad codes.
score = {
    code: weight
    for weight, codes in ((3, solid_codes), (2, sketchy_codes), (1, bad_codes))
    for code in codes
}


def evidence_list_to_score_list(evidence_list):
    """Translate a list of evidence codes into their numeric scores.

    Returns the scores wrapped in an outer one-element list, i.e. [[s1, s2, ...]].
    NOTE(review): the extra nesting is presumably there so that a row-wise
    DataFrame.apply keeps the scores in a single column - confirm.

    Raises KeyError for a code that is missing from `score`.
    """
    scores = []
    for code in evidence_list:
        scores.append(score[code])
    return [scores]

In [16]:
# Translate each (gene, term) evidence list into its list of numeric scores.
# The helper is applied to the 'Evidence' column directly (and its one-element wrapper
# unwrapped) instead of going through DataFrame.apply(axis=1): the way row-wise apply
# treats list-valued results differs between pandas versions, so the column-wise form
# is the one that reliably yields a frame with a single list-valued 'Evidence' column.
evidence_scores = (
    evidence['Evidence']
    .apply(lambda codes: evidence_list_to_score_list(codes)[0])
    .to_frame()
)
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1CF,GO:0005488,"[3, 3, 3, 3]"
A2M,GO:0005488,"[3, 3, 3, 3, 3, 3, 3, 3, 3]"
A2M,GO:0098772,"[3, 3]"
A2ML1,GO:0098772,"[1, 3]"
A3GALT2,GO:0003824,"[3, 1, 1]"


In [17]:
# Total evidence score for every (gene, term) pair.
# Computed from `evidence` and `score` rather than from the previous value of
# `evidence_scores`, so the cell gives the same result when it is re-run and does not
# depend on how the per-code score lists were laid out by the preceding cell.
evidence_scores = (
    evidence['Evidence']
    .apply(lambda codes: sum(score[code] for code in codes))
    .to_frame()
)
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1CF,GO:0005488,12
A2M,GO:0005488,27
A2M,GO:0098772,6
A2ML1,GO:0098772,4
A3GALT2,GO:0003824,5


In [18]:
# Keep, for every gene, the single term with the highest total evidence score.
gene_max_score = evidence_scores.groupby(level='gene')['Evidence'].transform('max')
best_evidence = evidence_scores[evidence_scores['Evidence'] == gene_max_score]

# Several terms can tie for a gene's top score; without this step such a gene would
# appear more than once in the final gene -> term mapping. Keep the first tied row
# (in the existing index order) so every gene maps to exactly one term.
is_first_for_gene = ~best_evidence.index.get_level_values('gene').duplicated(keep='first')
best_evidence = best_evidence[is_first_for_gene]
assert best_evidence.index.get_level_values('gene').is_unique

best_evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1CF,GO:0005488,12
A2M,GO:0005488,27
A2ML1,GO:0098772,4
A3GALT2,GO:0003824,5
A4GALT,GO:0003824,4


In [19]:
# Drop the score and keep only the chosen GO term, indexed by gene.
gene_to_function_term = best_evidence.reset_index(level='GO_ID')[['GO_ID']]
gene_to_function_term.head()

Unnamed: 0_level_0,GO_ID
gene,Unnamed: 1_level_1
A1CF,GO:0005488
A2M,GO:0005488
A2ML1,GO:0098772
A3GALT2,GO:0003824
A4GALT,GO:0003824


In [20]:
# Human-readable names of the depth-1 terms, indexed by GO ID.
is_depth1 = df['depth'] == 1
functions = df.loc[is_depth1, ['name']].copy()
functions

Unnamed: 0,name
GO:0003824,catalytic activity
GO:0005198,structural molecule activity
GO:0005215,transporter activity
GO:0005488,binding
GO:0016209,antioxidant activity
GO:0031386,protein tag
GO:0038024,cargo receptor activity
GO:0045182,translation regulator activity
GO:0045735,nutrient reservoir activity
GO:0060089,molecular transducer activity


In [21]:
# Shorten the labels by removing the text ' activity' wherever it occurs in a name.
short_names = functions['name'].str.replace(' activity', '')
functions['name'] = short_names

In [22]:
# Attach the function name to each gene's chosen GO term (left join on the GO ID).
gene_terms_with_names = pd.merge(
    gene_to_function_term,
    functions,
    how='left',
    left_on='GO_ID',
    right_index=True,
)
molecular_functions = gene_terms_with_names[['GO_ID', 'name']]
molecular_functions.head()

Unnamed: 0_level_0,GO_ID,name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
A1CF,GO:0005488,binding
A2M,GO:0005488,binding
A2ML1,GO:0098772,molecular function regulator
A3GALT2,GO:0003824,catalytic
A4GALT,GO:0003824,catalytic


In [23]:
# Persist the gene -> molecular function annotation table next to the notebook.
annotation_pickle_path = 'molecular_function_gene_annotation.pickle'
molecular_functions.to_pickle(annotation_pickle_path)