# Creating a Biological Process Reference for OmicsIntegrator2

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict

def flatten(list_of_lists):
    """Concatenate the items of every sublist into one flat list.

    Only one level of nesting is removed; the items themselves are kept as-is.
    """
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

import mygene

In [2]:
# Gene-to-GO-term annotation table; the cells below use its
# GeneSymbol, GO_ID and Evidence columns.
go_annotations_path = '../../GONN/GO/biological_process.csv'
genes = pd.read_csv(go_annotations_path)
genes.head()

Unnamed: 0,GeneSymbol,GO_ID,GO_term,Evidence
0,A1BG,GO:0002576,platelet degranulation,TAS
1,A1BG,GO:0008150,biological_process,ND
2,A1BG,GO:0043312,neutrophil degranulation,TAS
3,A2M,GO:0001869,"negative regulation of complement activation, ...",IDA
4,A2M,GO:0002576,platelet degranulation,TAS


## I. Evidence Codes

Copied from http://geneontology.org/page/guide-go-evidence-codes

### Experimental Evidence codes
Use of an experimental evidence code in a GO annotation indicates that the cited paper displayed results from a physical characterization of a gene or gene product that has supported the association of a GO term. The Experimental Evidence codes are:

- Inferred from Experiment (EXP)
- Inferred from Direct Assay (IDA)
- Inferred from Physical Interaction (IPI)
- Inferred from Mutant Phenotype (IMP)
- Inferred from Genetic Interaction (IGI)
- Inferred from Expression Pattern (IEP)

### High Throughput (HTP) evidence codes
High throughput (HTP) evidence codes may be used to make annotations based upon high throughput methodologies. Use of HTP evidence codes should be carefully considered and follow the GOC's guidelines for their use. The High Throughput Evidence Codes are:

- Inferred from High Throughput Experiment (HTP)
- Inferred from High Throughput Direct Assay (HDA)
- Inferred from High Throughput Mutant Phenotype (HMP)
- Inferred from High Throughput Genetic Interaction (HGI)
- Inferred from High Throughput Expression Pattern (HEP)

### Computational Analysis evidence codes
Use of the computational analysis evidence codes indicates that the annotation is based on an in silico analysis of the gene sequence and/or other data as described in the cited reference. The evidence codes in this category also indicate a varying degree of curatorial input. The Computational Analysis evidence codes are:

- Inferred from Sequence or structural Similarity (ISS)
- Inferred from Sequence Orthology (ISO)
- Inferred from Sequence Alignment (ISA)
- Inferred from Sequence Model (ISM)
- Inferred from Genomic Context (IGC)
- Inferred from Biological aspect of Ancestor (IBA)
- Inferred from Biological aspect of Descendant (IBD)
- Inferred from Key Residues (IKR)
- Inferred from Rapid Divergence (IRD)
- Inferred from Reviewed Computational Analysis (RCA)

### Author statement evidence codes
Author statement codes indicate that the annotation was made on the basis of a statement made by the author(s) in the reference cited. The Author Statement evidence codes are:

- Traceable Author Statement (TAS)
- Non-traceable Author Statement (NAS)

### Curator statement evidence codes
Use of the curatorial statement evidence codes indicates an annotation made on the basis of a curatorial judgement that does not fit into one of the other evidence code classifications. The Curatorial Statement codes:

- Inferred by Curator (IC)
- No biological Data available (ND)

### Electronic Annotation evidence code
All of the above evidence codes are assigned by curators. However, GO also uses one evidence code that is assigned by automated methods, without curatorial judgement. The Automatically-Assigned evidence code is

- Inferred from Electronic Annotation (IEA)

In [3]:
# Evidence codes grouped into three trust tiers, used later to weight annotations.
solid_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'NAS']
sketchy_codes = ['HTP', 'HDA', 'HMP', 'HGI', 'HEP', 'IC']
bad_codes = ['ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD', 'RCA', 'IEA', 'ND']

# Number of annotation rows falling into each tier.
{
    tier: len(genes[genes['Evidence'].isin(codes)])
    for tier, codes in (('solid', solid_codes), ('sketchy', sketchy_codes), ('bad', bad_codes))
}

{'bad': 82024, 'sketchy': 1103, 'solid': 66059}

## II. Find good ontology depth

In [4]:
import pickle

# Load the GO biological-process graph (a networkx DiGraph).
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; the file is
# an ordinary pickle, so the standard library reads it on every NetworkX version.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
with open('../../GONN/GO/GO_biological_process.pickle', 'rb') as graph_file:
    g = pickle.load(graph_file)
g

<networkx.classes.digraph.DiGraph at 0x10fd22cf8>

In [5]:
# Tabulate the node attributes of the graph: one row per GO term,
# one column per attribute.
node_attributes = dict(g.nodes(data=True))
df = pd.DataFrame.from_dict(node_attributes).transpose()
df.head()

Unnamed: 0,depth,name,namespace
GO:0000001,5,mitochondrion inheritance,biological_process
GO:0000002,5,mitochondrial genome maintenance,biological_process
GO:0000003,1,reproduction,biological_process
GO:0000011,5,vacuole inheritance,biological_process
GO:0000012,6,single strand break repair,biological_process


In [6]:
# Terms at depth 0.
df.loc[df['depth'] == 0]

Unnamed: 0,depth,name,namespace
GO:0008150,0,biological_process,biological_process


In [7]:
# Terms at depth 1.
df.loc[df['depth'] == 1]

Unnamed: 0,depth,name,namespace
GO:0000003,1,reproduction,biological_process
GO:0001906,1,cell killing,biological_process
GO:0002376,1,immune system process,biological_process
GO:0006791,1,sulfur utilization,biological_process
GO:0006794,1,phosphorus utilization,biological_process
GO:0007610,1,behavior,biological_process
GO:0008152,1,metabolic process,biological_process
GO:0008283,1,cell proliferation,biological_process
GO:0009758,1,carbohydrate utilization,biological_process
GO:0009987,1,cellular process,biological_process


In [8]:
# Terms at depth 2.
df.loc[df['depth'] == 2]

Unnamed: 0,depth,name,namespace
GO:0000075,2,cell cycle checkpoint,biological_process
GO:0000728,2,"gene conversion at mating-type locus, DNA doub...",biological_process
GO:0000734,2,"gene conversion at mating-type locus, DNA repa...",biological_process
GO:0000742,2,karyogamy involved in conjugation with cellula...,biological_process
GO:0000743,2,nuclear migration involved in conjugation with...,biological_process
GO:0000920,2,cell separation after cytokinesis,biological_process
GO:0001502,2,cartilage condensation,biological_process
GO:0001503,2,ossification,biological_process
GO:0001545,2,primary ovarian follicle growth,biological_process
GO:0001546,2,preantral ovarian follicle growth,biological_process


#### Depth 1 seems good.

## III. Build a mapping from genes to terms via subterms

#### We need to find a list of terms, and for each term, all subterms. Then we can map all genes to the list of terms

In [9]:
# GO IDs of all depth-1 terms.
level1_terms = df.loc[df['depth'] == 1].index.tolist()

In [10]:
# For each depth-1 term, the sorted list of every term reachable from it
# by following the graph's edges (its subterms).
terms_and_subterms = {
    term: np.unique(flatten(nx.dfs_successors(g, term).values())).tolist()
    for term in level1_terms
}

In [11]:
# Every depth-1 term together with its subterms, duplicates included.
terms = flatten([subterms + [term] for term, subterms in terms_and_subterms.items()])
# (total entries, distinct terms covered, number of terms in the ontology table)
len(terms), len(np.unique(terms)), len(df)

(57113, 29617, 29618)

In [12]:
# For each depth-1 term, the [gene, evidence code] pairs of every annotation
# made to that term or to any of its subterms.
terms_and_genes = {
    term: genes.loc[genes['GO_ID'].isin(subterms + [term]), ['GeneSymbol', 'Evidence']].values.tolist()
    for term, subterms in terms_and_subterms.items()
}

In [13]:
# Flatten the per-term annotation lists into (gene, term, evidence code) records.
genes_and_terms = [
    (gene, term, code)
    for term, annotations in terms_and_genes.items()
    for gene, code in annotations
]

In [14]:
# Collect, for every (gene, depth-1 term) pair, the list of evidence codes
# supporting it.
annotation_records = pd.DataFrame(genes_and_terms, columns=['gene', 'GO_ID', 'Evidence'])
evidence = annotation_records.groupby(['gene', 'GO_ID'])['Evidence'].apply(list).to_frame()
evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0002376,[TAS]
A1BG,GO:0009987,"[TAS, TAS]"
A1BG,GO:0050896,[TAS]
A1BG,GO:0051179,"[TAS, TAS]"
A1CF,GO:0008152,"[IEA, TAS, IEA]"


#### We need to score the evidence for each term for each gene, so that we can pick a single term when a gene maps to more than one

In [15]:
# Numeric weight of each evidence code: solid = 3, sketchy = 2, bad = 1.
score = {
    code: points
    for points, codes in ((3, solid_codes), (2, sketchy_codes), (1, bad_codes))
    for code in codes
}


def evidence_list_to_score_list(evidence_list):
    """Map a list of evidence codes to their scores, wrapped in an outer list.

    The result is a one-element list holding the list of scores, e.g.
    ['TAS', 'IEA'] -> [[3, 1]].  (Presumably the extra nesting keeps the scores
    together as a single cell under a row-wise DataFrame.apply -- confirm.)
    Raises KeyError for a code missing from `score`.
    """
    scores = []
    for code in evidence_list:
        scores.append(score[code])
    return [scores]

In [16]:
# Translate each (gene, term) evidence-code list into its list of numeric scores.
# This applies the helper to the 'Evidence' column (Series.apply) instead of
# row-wise over the frame: how a row-wise DataFrame.apply packs a list-like
# return value differs between pandas releases, whereas a column-wise apply
# always produces one list per cell in a column still named 'Evidence'.
# evidence_list_to_score_list wraps its scores in an outer list, hence the [0].
evidence_scores = evidence['Evidence'].apply(
    lambda codes: evidence_list_to_score_list(codes)[0]
).to_frame()
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0002376,[3]
A1BG,GO:0009987,"[3, 3]"
A1BG,GO:0050896,[3]
A1BG,GO:0051179,"[3, 3]"
A1CF,GO:0008152,"[1, 3, 1]"


In [17]:
# Collapse each list of per-annotation scores into one total per (gene, term).
# The row-wise apply yields an unnamed Series, which to_frame() turns into a
# column labelled 0; the rename restores the 'Evidence' column name.
evidence_scores = evidence_scores.apply(lambda row: sum(row['Evidence']), axis=1).to_frame().rename(columns={0:'Evidence'})
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0002376,3
A1BG,GO:0009987,6
A1BG,GO:0050896,3
A1BG,GO:0051179,6
A1CF,GO:0008152,5


In [18]:
# Keep, for every gene, only the term(s) whose total evidence score equals that
# gene's maximum.  The reduction is passed by name: handing the Python builtin
# `max` to groupby.transform is deprecated in recent pandas releases.
top_score_per_gene = evidence_scores.groupby(['gene'])['Evidence'].transform('max')
best_evidence = evidence_scores[evidence_scores['Evidence'] == top_score_per_gene]
best_evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0009987,6
A1BG,GO:0051179,6
A1CF,GO:0008152,5
A1CF,GO:0009987,5
A2M,GO:0065007,11


#### The output above already shows ties (e.g. A1BG and A1CF each have two terms with the same top score), so we need to deal with them

In [19]:
# Rows among the best-scoring terms vs. rows left when keeping one per gene.
one_row_per_gene = best_evidence.reset_index().drop_duplicates('gene')
len(best_evidence), len(one_row_per_gene)

(27418, 18306)

In [20]:
# Break ties by keeping only the first listed term for each gene.
first_term_per_gene = best_evidence.reset_index().drop_duplicates('gene')
best_evidence = first_term_per_gene.set_index(['gene', 'GO_ID'])
best_evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0009987,6
A1CF,GO:0008152,5
A2M,GO:0065007,11
A2ML1,GO:0065007,4
A3GALT2,GO:0008152,3


In [21]:
# Gene-indexed table holding just the chosen term ID.
gene_to_process_term = best_evidence.reset_index().set_index('gene')[['GO_ID']]
gene_to_process_term.head()

Unnamed: 0_level_0,GO_ID
gene,Unnamed: 1_level_1
A1BG,GO:0009987
A1CF,GO:0008152
A2M,GO:0065007
A2ML1,GO:0065007
A3GALT2,GO:0008152


In [22]:
# Names of the depth-1 terms, indexed by GO ID.
processes = df.loc[df['depth'] == 1, ['name']]
processes

Unnamed: 0,name
GO:0000003,reproduction
GO:0001906,cell killing
GO:0002376,immune system process
GO:0006791,sulfur utilization
GO:0006794,phosphorus utilization
GO:0007610,behavior
GO:0008152,metabolic process
GO:0008283,cell proliferation
GO:0009758,carbohydrate utilization
GO:0009987,cellular process


In [23]:
# Attach the term name to each gene's chosen depth-1 term.
named_terms = gene_to_process_term.merge(
    processes,
    how='left',
    left_on='GO_ID',
    right_index=True,
)
biological_processes = named_terms[['GO_ID', 'name']]
biological_processes.head()

Unnamed: 0_level_0,GO_ID,name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,GO:0009987,cellular process
A1CF,GO:0008152,metabolic process
A2M,GO:0065007,biological regulation
A2ML1,GO:0065007,biological regulation
A3GALT2,GO:0008152,metabolic process


## IV. Add "Specific Process" information to each gene

In [24]:
# GO IDs of all depth-2 terms.
level2_terms = df.loc[df['depth'] == 2].index.tolist()

In [25]:
# For each depth-2 term, the sorted list of every term reachable from it
# by following the graph's edges (its subterms).
terms_and_subterms = {
    term: np.unique(flatten(nx.dfs_successors(g, term).values())).tolist()
    for term in level2_terms
}

In [26]:
# Every depth-2 term together with its subterms, duplicates included.
terms = flatten([subterms + [term] for term, subterms in terms_and_subterms.items()])
# (total entries, distinct terms covered, number of terms in the ontology table)
len(terms), len(np.unique(terms)), len(df)

(84432, 29589, 29618)

In [27]:
# For each depth-2 term, the [gene, evidence code] pairs of every annotation
# made to that term or to any of its subterms.
terms_and_genes = {
    term: genes.loc[genes['GO_ID'].isin(subterms + [term]), ['GeneSymbol', 'Evidence']].values.tolist()
    for term, subterms in terms_and_subterms.items()
}

In [28]:
# Flatten the per-term annotation lists into (gene, term, evidence code) records.
genes_and_terms = [
    (gene, term, code)
    for term, annotations in terms_and_genes.items()
    for gene, code in annotations
]

In [29]:
# Collect, for every (gene, depth-2 term) pair, the list of evidence codes
# supporting it.
annotation_records = pd.DataFrame(genes_and_terms, columns=['gene', 'GO_ID', 'Evidence'])
evidence = annotation_records.groupby(['gene', 'GO_ID'])['Evidence'].apply(list).to_frame()
evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0001775,[TAS]
A1BG,GO:0002252,[TAS]
A1BG,GO:0006955,[TAS]
A1BG,GO:0032940,"[TAS, TAS]"
A1BG,GO:0045321,[TAS]


#### We've made a commitment for each gene to belong to a single level1 term, so let's remove all the level2 terms which aren't subterms of the previously selected level1 term for each gene

In [30]:
# Direct predecessors (parents) in the graph of every depth-2 term.
predecessors = {
    level2_term: list(g.predecessors(level2_term))
    for level2_term in level2_terms
}

In [31]:
# Keep only those parents that are themselves depth-1 terms.
predecessors = {
    level2_term: [candidate for candidate in candidates if candidate in level1_terms]
    for level2_term, candidates in predecessors.items()
}

In [32]:
# Assign each depth-2 term a single depth-1 parent: the first in its filtered
# list.  (A term left with no depth-1 parent would raise IndexError here.)
predecessors = {
    level2_term: level1_parents[0]
    for level2_term, level1_parents in predecessors.items()
}

In [33]:
# Turn the mapping into a one-column frame: index 'level2_term', column 'level1_term'.
predecessors = (
    pd.Series(predecessors, name='level1_term')
    .rename_axis('level2_term')
    .to_frame()
)
predecessors.head()

Unnamed: 0_level_0,level1_term
level2_term,Unnamed: 1_level_1
GO:0000075,GO:0009987
GO:0000728,GO:0022414
GO:0000734,GO:0022414
GO:0000742,GO:0022414
GO:0000743,GO:0022414


In [34]:
# Annotate every (gene, depth-2 term) row with that term's depth-1 parent.
evidence_with_parent = evidence.reset_index().merge(
    predecessors,
    how='left',
    left_on='GO_ID',
    right_index=True,
)
evidence = evidence_with_parent.set_index(['gene', 'level1_term', 'GO_ID'])
evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Evidence
gene,level1_term,GO_ID,Unnamed: 3_level_1
A1BG,GO:0009987,GO:0001775,[TAS]
A1BG,GO:0002376,GO:0002252,[TAS]
A1BG,GO:0050896,GO:0006955,[TAS]
A1BG,GO:0009987,GO:0032940,"[TAS, TAS]"
A1BG,GO:0002376,GO:0045321,[TAS]


In [35]:
# Keep only the depth-2 terms whose depth-1 parent is the depth-1 term already
# chosen for that gene.
chosen_terms = biological_processes['GO_ID'].rename('chosen_level1').to_frame()
with_choice = evidence.reset_index().merge(chosen_terms, how='left', left_on='gene', right_index=True)
matching = with_choice[with_choice['level1_term'] == with_choice['chosen_level1']]
evidence = matching.set_index(['gene', 'GO_ID'])[['Evidence']]
evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0001775,[TAS]
A1BG,GO:0032940,"[TAS, TAS]"
A1CF,GO:0006807,"[IEA, TAS, IEA]"
A1CF,GO:0044237,"[IEA, TAS, IEA]"
A1CF,GO:0044238,"[IEA, TAS, IEA]"


In [36]:
# Translate each (gene, term) evidence-code list into its list of numeric scores.
# This applies the helper to the 'Evidence' column (Series.apply) instead of
# row-wise over the frame: how a row-wise DataFrame.apply packs a list-like
# return value differs between pandas releases, whereas a column-wise apply
# always produces one list per cell in a column still named 'Evidence'.
# evidence_list_to_score_list wraps its scores in an outer list, hence the [0].
evidence_scores = evidence['Evidence'].apply(
    lambda codes: evidence_list_to_score_list(codes)[0]
).to_frame()
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0001775,[3]
A1BG,GO:0032940,"[3, 3]"
A1CF,GO:0006807,"[1, 3, 1]"
A1CF,GO:0044237,"[1, 3, 1]"
A1CF,GO:0044238,"[1, 3, 1]"


In [37]:
# Collapse each list of per-annotation scores into one total per (gene, term).
# The row-wise apply yields an unnamed Series, which to_frame() turns into a
# column labelled 0; the rename restores the 'Evidence' column name.
evidence_scores = evidence_scores.apply(lambda row: sum(row['Evidence']), axis=1).to_frame().rename(columns={0:'Evidence'})
evidence_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0001775,3
A1BG,GO:0032940,6
A1CF,GO:0006807,5
A1CF,GO:0044237,5
A1CF,GO:0044238,5


In [38]:
# Keep, for every gene, only the term(s) whose total evidence score equals that
# gene's maximum.  The reduction is passed by name: handing the Python builtin
# `max` to groupby.transform is deprecated in recent pandas releases.
top_score_per_gene = evidence_scores.groupby(['gene'])['Evidence'].transform('max')
best_evidence = evidence_scores[evidence_scores['Evidence'] == top_score_per_gene]
best_evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0032940,6
A1CF,GO:0006807,5
A1CF,GO:0044237,5
A1CF,GO:0044238,5
A1CF,GO:0071704,5


In [39]:
# Rows among the best-scoring terms vs. rows left when keeping one per gene.
one_row_per_gene = best_evidence.reset_index().drop_duplicates('gene')
len(best_evidence), len(one_row_per_gene)

(33255, 18153)

In [40]:
# Break ties by keeping only the first listed term for each gene.
first_term_per_gene = best_evidence.reset_index().drop_duplicates('gene')
best_evidence = first_term_per_gene.set_index(['gene', 'GO_ID'])
best_evidence.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evidence
gene,GO_ID,Unnamed: 2_level_1
A1BG,GO:0032940,6
A1CF,GO:0006807,5
A2M,GO:0050789,7
A2ML1,GO:0050789,4
A3GALT2,GO:0044238,3


In [41]:
# Gene-indexed table holding just the chosen term ID.
gene_to_process_term = best_evidence.reset_index().set_index('gene')[['GO_ID']]
gene_to_process_term.head()

Unnamed: 0_level_0,GO_ID
gene,Unnamed: 1_level_1
A1BG,GO:0032940
A1CF,GO:0006807
A2M,GO:0050789
A2ML1,GO:0050789
A3GALT2,GO:0044238


In [42]:
# Names of the depth-2 terms, indexed by GO ID.
processes = df.loc[df['depth'] == 2, ['name']]
processes

Unnamed: 0,name
GO:0000075,cell cycle checkpoint
GO:0000728,"gene conversion at mating-type locus, DNA doub..."
GO:0000734,"gene conversion at mating-type locus, DNA repa..."
GO:0000742,karyogamy involved in conjugation with cellula...
GO:0000743,nuclear migration involved in conjugation with...
GO:0000920,cell separation after cytokinesis
GO:0001502,cartilage condensation
GO:0001503,ossification
GO:0001545,primary ovarian follicle growth
GO:0001546,preantral ovarian follicle growth


In [43]:
# Attach the term name to each gene's chosen depth-2 term.
named_terms = gene_to_process_term.merge(
    processes,
    how='left',
    left_on='GO_ID',
    right_index=True,
)
specific_biological_processes = named_terms[['GO_ID', 'name']]
specific_biological_processes.head()

Unnamed: 0_level_0,GO_ID,name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,GO:0032940,secretion by cell
A1CF,GO:0006807,nitrogen compound metabolic process
A2M,GO:0050789,regulation of biological process
A2ML1,GO:0050789,regulation of biological process
A3GALT2,GO:0044238,primary metabolic process


In [44]:
# Label the depth-1 columns as the "general" process before adding the
# depth-2 columns.
general_column_names = {'GO_ID': 'general_process_GO_ID', 'name': 'general_process'}
biological_processes = biological_processes.rename(columns=general_column_names)
biological_processes.head()

Unnamed: 0_level_0,general_process_GO_ID,general_process
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,GO:0009987,cellular process
A1CF,GO:0008152,metabolic process
A2M,GO:0065007,biological regulation
A2ML1,GO:0065007,biological regulation
A3GALT2,GO:0008152,metabolic process


In [45]:
# Join the depth-2 ("specific") term of each gene onto its general process,
# matching on the gene index, and label the new columns accordingly.
with_specific = biological_processes.merge(
    specific_biological_processes,
    how='left',
    left_index=True,
    right_index=True,
)
biological_processes = with_specific.rename(
    columns={'GO_ID': 'specific_process_GO_ID', 'name': 'specific_process'}
)
biological_processes.head()

Unnamed: 0_level_0,general_process_GO_ID,general_process,specific_process_GO_ID,specific_process
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1BG,GO:0009987,cellular process,GO:0032940,secretion by cell
A1CF,GO:0008152,metabolic process,GO:0006807,nitrogen compound metabolic process
A2M,GO:0065007,biological regulation,GO:0050789,regulation of biological process
A2ML1,GO:0065007,biological regulation,GO:0050789,regulation of biological process
A3GALT2,GO:0008152,metabolic process,GO:0044238,primary metabolic process


In [46]:
# Save the per-gene general/specific process table as a pickle file
# in the current working directory.
biological_processes.to_pickle('biological_processes_gene_annotation.pickle')