In [1]:
import collections
import os

import pandas
import networkx

import utilities

In [2]:
# Input downloads and generated annotation files live in sibling directories
# of the one this notebook runs from.
download_dir = os.path.join(os.pardir, 'download')
annotation_dir = os.path.join(os.pardir, 'annotations')

In [3]:
# GO subsets whose member terms are left out of the exported annotations.
# Every entry must end with a comma: Python silently concatenates adjacent
# string literals, which would fuse two subset names into a single string
# that matches nothing.
remove_subsets = {
    'goantislim_grouping', # Grouping classes that can be excluded
    'gocheck_do_not_annotate', # Term not to be used for direct annotation
    'gocheck_do_not_manually_annotate', # Term not to be used for direct manual annotation
}

# Edge keys (relationship types) that annotations are propagated along;
# edges with any other key are removed from the graph.
propagate_along = {'is_a', 'part_of'}

# Evidence codes retained for the experimental-evidence ('expev') export
experimental_codes = {
    'EXP', # Inferred from Experiment
    'IDA', # Inferred from Direct Assay
    'IPI', # Inferred from Physical Interaction
    'IMP', # Inferred from Mutant Phenotype
    'IGI', # Inferred from Genetic Interaction
    'IEP', # Inferred from Expression Pattern
}

## Read Gene Ontology graph

In [4]:
# Load the Gene Ontology from the download directory using the project helper.
# The summary printed below reports the result as a networkx MultiDiGraph.
graph = utilities.read_go(download_dir)
# Print a node/edge count summary as a sanity check on the loaded ontology
print(networkx.info(graph))

Name: go
Type: MultiDiGraph
Number of nodes: 42716
Number of edges: 89596
Average in degree:   2.0975
Average out degree:   2.0975


In [5]:
# dataframe of GO terms
go_df = utilities.graph_to_dataframe(graph)
go_df.head(2)

  go_df = go_df.sort('go_id')


Unnamed: 0,go_id,go_name,go_domain
30585,GO:0000001,mitochondrion inheritance,biological_process
42542,GO:0000002,mitochondrial genome maintenance,biological_process


In [6]:
# Collect the terms that belong to any excluded subset. They are NOT deleted
# from the graph here; extract_annotation_df skips them when building output.
remove_nodes = {
    node
    for node, data in graph.nodes(data=True)
    if remove_subsets & set(data.get('subset', []))
}

# Drop every edge whose key is not a relationship we propagate along.
# The edges are listed first so the graph is not modified while iterating it.
remove_edges = [
    (u, v, key)
    for u, v, key in graph.edges(data=False, keys=True)
    if key not in propagate_along
]
for edge in remove_edges:
    graph.remove_edge(*edge)

# Propagation relies on a topological sort, which requires an acyclic graph
assert networkx.is_directed_acyclic_graph(graph)
print(networkx.info(graph))

Name: go
Type: MultiDiGraph
Number of nodes: 42716
Number of edges: 81079
Average in degree:   1.8981
Average out degree:   1.8981


## Read Entrez Gene and annotations

In [7]:
# Read Entrez Gene info
gene_df = utilities.read_gene_info(download_dir)
gene_df = gene_df[['GeneID', 'Symbol', 'type_of_gene', 'tax_id']]
gene_df.head(2)

Unnamed: 0,GeneID,Symbol,type_of_gene,tax_id
0,5692769,NEWENTRY,other,7
1,1246500,repA1,protein-coding,9


In [8]:
# Read annotations
goa_df = utilities.read_gene2go(download_dir)
goa_df.head(2)

Unnamed: 0,tax_id,GeneID,GO_ID,Evidence,Qualifier,GO_term,PubMed,Category
0,3702,814629,GO:0003676,IEA,,nucleic acid binding,,Function
1,3702,814629,GO:0005634,ISM,,nucleus,,Component


## Add and propagate annotations

In [9]:
def annotate_graph(graph, goa_df):
    """Add direct annotations to graph

    Works on a copy obtained from ``graph.copy()`` and returns that copy.
    Every node's data dictionary receives three sets:
    'direct_annotations', 'direct_not_annotations' and
    'inferred_annotations' (the last is left empty here).

    Parameters
    ----------
    graph : networkx graph whose nodes are GO term identifiers
    goa_df : pandas.DataFrame with 'GO_ID', 'GeneID' and 'Qualifier' columns

    Returns
    -------
    The annotated copy of `graph`. Rows whose GO_ID is not a node of the
    graph are ignored.
    """
    graph = graph.copy()

    # Add dictionary items for storing annotations
    for data in graph.nodes.values():
        for key in 'direct_annotations', 'direct_not_annotations', 'inferred_annotations':
            data[key] = set()

    # Populate direct annotations.
    # Iterate the three needed columns in lockstep rather than using
    # DataFrame.iterrows(), which builds a Series for every row.
    columns = goa_df['GO_ID'], goa_df['GeneID'], goa_df['Qualifier']
    for go_id, gene, qualifier in zip(*columns):
        if go_id not in graph:
            continue

        # NOT-qualified annotations are stored separately so that propagation
        # can subtract them instead of adding them.
        key = 'direct_not_annotations' if utilities.is_NOT_qaulifier(qualifier) else 'direct_annotations'

        # `graph.nodes[...]` rather than the legacy `graph.node[...]`
        # attribute, which newer NetworkX releases no longer provide.
        graph.nodes[go_id][key].add(gene)

    return graph

In [10]:
def propagate_annotations(graph):
    """Infer annotations through propagations

    Modifies the node data of `graph` in place and returns None. Each node
    must already carry the 'direct_annotations', 'direct_not_annotations'
    and 'inferred_annotations' sets created by annotate_graph.

    Nodes are visited in topological order, so a node's inferred set is
    complete (all of its predecessors have contributed) before it is passed
    on to the node's successors.
    """
    for node in networkx.topological_sort(graph):
        # `graph.nodes[...]` rather than the legacy `graph.node[...]`
        # attribute, which newer NetworkX releases no longer provide.
        data = graph.nodes[node]
        inferred = data['inferred_annotations']
        # Order matters: genes NOT-annotated to this term are removed from
        # what was propagated in, then direct annotations are added, so a
        # gene that is in both direct sets is kept.
        inferred -= data['direct_not_annotations']
        inferred |= data['direct_annotations']
        for subsuming_node in graph.successors(node):
            graph.nodes[subsuming_node]['inferred_annotations'] |= inferred

In [11]:
def joiner(x):
    """Return the items of `x`, each converted with str, joined by '|'."""
    return '|'.join(str(item) for item in x)

def extract_annotation_df(graph):
    """Create an annotation dataframe

    Reads the 'direct_annotations' and 'inferred_annotations' sets stored on
    each node of `graph`, skipping nodes listed in the notebook-level
    `remove_nodes`, and returns one row per (taxon, GO term, annotation type)
    with the gene count plus '|'-joined gene IDs and symbols. Relies on the
    notebook-level `gene_df` and `go_df` tables for gene and term details.
    """
    # Long format: one (term, kind, gene) record per annotation
    long_rows = [
        (go_id, kind, gene)
        for go_id, data in graph.nodes.items()
        if go_id not in remove_nodes
        for kind in ('direct', 'inferred')
        for gene in data['{}_annotations'.format(kind)]
    ]
    long_columns = ['go_id', 'kind', 'GeneID']
    annotation_df = pandas.DataFrame(long_rows, columns=long_columns)
    # Default merge joins on the shared column(s) and keeps matching genes only
    annotation_df = annotation_df.merge(gene_df)

    # Wide format: collapse the genes of each term within each taxon and kind
    wide_rows = []
    for (tax_id, kind), taxon_df in annotation_df.groupby(['tax_id', 'kind']):
        for go_id, term_df in taxon_df.groupby('go_id'):
            ordered_df = term_df.sort_values('GeneID')
            gene_ids = joiner(ordered_df['GeneID'])
            gene_symbols = joiner(ordered_df['Symbol'])
            wide_rows.append((tax_id, go_id, kind, len(ordered_df), gene_ids, gene_symbols))

    wide_columns = ['tax_id', 'go_id', 'annotation_type', 'size', 'gene_ids', 'gene_symbols']
    wide_df = pandas.DataFrame(wide_rows, columns=wide_columns)
    # Attach the GO term details from go_df
    return go_df.merge(wide_df)

## Extract and save annotations

In [12]:
for ev_type in 'allev', 'expev':
    # 'allev' uses every annotation; 'expev' keeps experimental evidence only
    if ev_type == 'expev':
        goa_subset_df = goa_df[goa_df.Evidence.isin(experimental_codes)]
    else:
        goa_subset_df = goa_df

    # annotate_graph returns a fresh copy, so each evidence type starts clean
    graph_annot = annotate_graph(graph, goa_subset_df)
    propagate_annotations(graph_annot)
    annotation_df = extract_annotation_df(graph_annot)

    # Write one tab-separated file per (taxon, annotation type) pair
    grouped = annotation_df.groupby(['tax_id', 'annotation_type'])
    for (tax_id, annotation_type), df in grouped:
        path = utilities.get_annotation_path(
            annotation_dir, tax_id, annotation_type, ev_type, mkdir=True)
        print(path)
        df.to_csv(path, sep='\t', index=False)

../annotations/taxid_3702/GO_annotations-3702-direct-allev.tsv
../annotations/taxid_3702/GO_annotations-3702-inferred-allev.tsv
../annotations/taxid_4528/GO_annotations-4528-direct-allev.tsv
../annotations/taxid_4528/GO_annotations-4528-inferred-allev.tsv
../annotations/taxid_4529/GO_annotations-4529-direct-allev.tsv
../annotations/taxid_4529/GO_annotations-4529-inferred-allev.tsv
../annotations/taxid_4532/GO_annotations-4532-direct-allev.tsv
../annotations/taxid_4532/GO_annotations-4532-inferred-allev.tsv
../annotations/taxid_4535/GO_annotations-4535-direct-allev.tsv
../annotations/taxid_4535/GO_annotations-4535-inferred-allev.tsv
../annotations/taxid_4536/GO_annotations-4536-direct-allev.tsv
../annotations/taxid_4536/GO_annotations-4536-inferred-allev.tsv
../annotations/taxid_4537/GO_annotations-4537-direct-allev.tsv
../annotations/taxid_4537/GO_annotations-4537-inferred-allev.tsv
../annotations/taxid_4538/GO_annotations-4538-direct-allev.tsv
../annotations/taxid_4538/GO_annotations-