# Gene ontology analysis

In [1]:
import pandas as pd

## Download ontologies

Download ontologies, a dictionary that maps GO IDs to GO terms. In most cases, we should use the basic OBO file.

In [2]:
from goatools.base import download_go_basic_obo
from goatools.obo_parser import GODag

# Obtain the basic GO OBO file; the call returns the filename that GODag parses next.
obo_fname = download_go_basic_obo()
# Parse the OBO file into `ontologies`, which later cells index by GO ID.
ontologies = GODag(obo_fname)

  EXISTS: go-basic.obo
go-basic.obo: fmt(1.2) rel(2019-02-14) 47,395 GO Terms


In [16]:
# Sanity check: take the first GO ID in `ontologies` and display it with its GOTerm record.
first = list(ontologies.keys())[0]
first, ontologies[first]

('GO:0000001', GOTerm('GO:0000001'):
   id:GO:0000001
   item_id:GO:0000001
   name:mitochondrion inheritance
   namespace:biological_process
   _parents: 2 items
     GO:0048311
     GO:0048308
   parents: 2 items
     GO:0048308	level-04	depth-04	organelle inheritance [biological_process]
     GO:0048311	level-05	depth-05	mitochondrion distribution [biological_process]
   children: 0 items
   level:5
   depth:6
   is_obsolete:False
   alt_ids: 0 items)

## Download associations

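Download associations, a dictionary that maps each gene ID to a set of GO IDs.
We can use either the associations from NCBI (first cell below) or from GeneOntology (second cell below). Both cells assign to `associations`, so when both are run the GeneOntology associations are the ones used in the rest of the notebook.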

In [6]:
from goatools.base import download_ncbi_associations
from goatools.anno.genetogo_reader import Gene2GoReader

# Read NCBI's gene2go. Store annotations in a list of namedtuples
file_gene2go = download_ncbi_associations()
taxids = [7955] # zebrafish
objanno = Gene2GoReader(file_gene2go, taxids=taxids)

# get_ns2assc() yields one {gene id -> GO IDs} mapping per namespace (BP, MF, CC in the output below).
# NOTE: `associations` is reassigned by the GAF cell further down, so this per-namespace
# mapping is only used for the summary printed here.
associations = objanno.get_ns2assc()
for nspc, id2gos in associations.items():
    print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

  EXISTS: gene2go
HMS:0:00:02.476439 157,752 annotations READ: gene2go 
1 taxids stored: 7955
BP 15,051 annotated genes
MF 15,218 annotated genes
CC 15,050 annotated genes


In [12]:
from goatools.associations import read_gaf
from goatools.base import dnld_gaf

# see http://current.geneontology.org/products/pages/downloads.html
species = 'zfin' # choices are 'goa_human', 'mgi' etc
NS = 'BP' # choices are 'BP', 'CC' or 'MF'

# Download the GAF file for the species selected above and keep its filename,
# which is reused later to build the symbol -> gene id mapping.
gaf_filename = dnld_gaf(species)
# Load {gene id -> set of GO IDs} restricted to the chosen namespace.
# This replaces any `associations` value assigned by an earlier cell.
associations = read_gaf(gaf_filename, namespace=NS, go2geneids=False)
print("{N:,} annotated genes".format(N=len(associations)))

HMS:0:00:03.118535 227,647 annotations READ: C:\Users\joewa\Work\git\WebOmics\web_omics\notebooks\gene_ontology\zfin.gaf 
17785 IDs in association branch, BP
17,785 annotated genes


In [15]:
first = list(associations.keys())[0]
first, associations[first]

('ZDB-MIRNAG-081210-6', {'GO:0030182', 'GO:0035195'})

## Load background genes

In [17]:
def gaf_symbol_to_id(gaf_filename):
    """Build a gene symbol -> gene id dictionary from a GAF file.

    Parameters
    ----------
    gaf_filename : str or path-like
        Path of the tab-separated GAF file to read. Lines starting with '!'
        are treated as comments and skipped.

    Returns
    -------
    dict
        Maps each gene symbol (GAF column 3) to its gene id (GAF column 2),
        e.g. {'mir26b': 'ZDB-MIRNAG-081210-6'}. When a symbol occurs on
        several rows, the id from the last row wins.
    """
    # Read the file that was actually passed in, and only the two columns we
    # need, as strings. Restricting the columns avoids parsing (and guessing
    # dtypes for) the remaining annotation columns.
    df = pd.read_csv(
        gaf_filename,
        comment='!',
        sep='\t',
        header=None,
        usecols=[1, 2],
        dtype=str,
    )

    # With header=None the columns keep their positional labels:
    # 1 is the gene id, 2 is the gene symbol.
    gene_ids = df[1]
    symbols = df[2]
    symbol_to_id = dict(zip(symbols, gene_ids))
    return symbol_to_id

In [18]:
def load_background_symbols(symbol_filename):
    """Read a header-less CSV of gene symbols and return them as a flat array.

    Every cell of the file is kept, row by row, in a one-dimensional array.
    """
    symbol_table = pd.read_csv(symbol_filename, header=None)
    return symbol_table.values.flatten()

In [19]:
def to_id(background_symbols, mapping=None):
    """Translate gene symbols into gene ids, dropping symbols with no match.

    Parameters
    ----------
    background_symbols : iterable
        Gene symbols to translate. Each symbol is lower-cased before lookup.
        Entries that are not strings (for example missing values) are skipped.
    mapping : dict, optional
        Symbol -> gene id dictionary. When omitted, the notebook-level
        `symbol_to_id` dictionary is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    list
        Gene ids for the symbols that were found, in input order.
    """
    if mapping is None:
        # Backward-compatible default: fall back to the notebook global.
        mapping = symbol_to_id

    background_ids = []
    for symbol in background_symbols:
        if not isinstance(symbol, str):
            # Non-string entries have no .lower(); treat them as unmatched.
            continue
        key = symbol.lower()
        if key in mapping:
            background_ids.append(mapping[key])
    return background_ids

In [20]:
# here we use all genes in the study as the background, see https://www.biostars.org/p/17628/
# Build the symbol -> gene id lookup from the GAF file downloaded earlier.
symbol_to_id = gaf_symbol_to_id(gaf_filename)
# Load the study's gene symbols, then keep the ids of those found in the lookup
# (to_id silently drops symbols that have no match).
background_symbols = load_background_symbols('zebrafish.txt')
background_ids = to_id(background_symbols)

  if (await self.run_code(code, result,  async_=asy)):


In [21]:
# Number of background symbols that could be mapped to a gene id.
len(background_ids)

8242

## Initialise GOEA object

In [None]:
from goatools.go_enrichment import GOEnrichmentStudy
# Set up the enrichment study: background population, gene -> GO associations,
# and the GO DAG loaded at the top of the notebook.
goeaobj = GOEnrichmentStudy(
        background_ids,
        associations,
        ontologies, 
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # default multipletest correction method

In [None]:
# Study set: the first 400 background gene ids.
# NOTE(review): this looks like a placeholder for a real list of genes of interest - confirm.
geneids_study = background_ids[0:400]

In [None]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
# Run the enrichment on the study genes, then keep only results whose
# FDR-corrected p-value is below 0.05 (the same cut-off as `alpha` above).
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

In [None]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
# Plot the significant results to PNG.
# NOTE(review): the "{NS}" placeholder is passed through unformatted; goatools' plot_results
# appears to fill it in per namespace itself - confirm against the goatools documentation.
plot_results("zebrafish_{NS}.png", goea_results_sig)