# Gene ontology analysis

In [1]:
import pandas as pd

## Download ontologies

Download ontologies, a dictionary that maps GO IDs to GO terms. In most cases, we should use the basic OBO file.

In [2]:
from goatools.base import download_go_basic_obo
from goatools.obo_parser import GODag

obo_fname = download_go_basic_obo()
ontologies = GODag(obo_fname)

  EXISTS: go-basic.obo
go-basic.obo: fmt(1.2) rel(2019-07-01) 47,413 GO Terms


In [3]:
first = list(ontologies.keys())[0]
first, ontologies[first]

('GO:0000001', GOTerm('GO:0000001'):
   id:GO:0000001
   item_id:GO:0000001
   name:mitochondrion inheritance
   namespace:biological_process
   _parents: 2 items
     GO:0048308
     GO:0048311
   parents: 2 items
     GO:0048308	level-04	depth-04	organelle inheritance [biological_process]
     GO:0048311	level-05	depth-05	mitochondrion distribution [biological_process]
   children: 0 items
   level:5
   depth:6
   is_obsolete:False
   alt_ids: 0 items)

## Download associations

Download associations, a dictionary that maps each gene ID to a set of GOs.
We can use either the associations from NCBI or from GeneOntology.

In [None]:
# from goatools.base import download_ncbi_associations
# from goatools.anno.genetogo_reader import Gene2GoReader

# # Read NCBI's gene2go. Store annotations in a list of namedtuples
# file_gene2go = download_ncbi_associations()
# taxids = [7955] # zebrafish
# objanno = Gene2GoReader(file_gene2go, taxids=taxids)

# associations = objanno.get_ns2assc()
# for nspc, id2gos in associations.items():
#     print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

In [5]:
from goatools.associations import read_gaf
from goatools.base import dnld_gaf

# see http://current.geneontology.org/products/pages/downloads.html
species = 'zfin' # choices are 'goa_human', 'mgi' etc
NS = 'BP' # choices are 'BP', 'CC' or 'MF'

gaf_filename = dnld_gaf(species)
associations = read_gaf(gaf_filename, namespace=NS, go2geneids=False)
print("{N:,} annotated genes".format(N=len(associations)))

HMS:0:00:03.447289 227,647 annotations READ: C:\Users\joewa\Work\git\WebOmics\web_omics\notebooks\gene_ontology\zfin.gaf 
17785 IDs in loaded association branch, BP
17,785 annotated genes


In [6]:
first = list(associations.keys())[0]
first, associations[first]

('ZDB-MIRNAG-081210-6', {'GO:0030182', 'GO:0035195'})

## Load background genes

In [15]:
def gaf_symbol_to_id(gaf_filename):
    df = pd.read_csv(gaf_filename, comment='!', sep='\t', header=None)
    
    # temp has 2 columns. First is the gene id, next is the gene symbol
    # example:
    # 'ZDB-MIRNAG-081210-6', 'mir26b'
    temp = df.iloc[:, 1:3].values 
    symbol_to_id = {symbol: my_id for my_id, symbol in temp}
    return symbol_to_id

In [16]:
def load_background_symbols(symbol_filename):
    df = pd.read_csv(symbol_filename, header=None)
    background_symbols = df.values.flatten()
    return background_symbols

In [17]:
def to_id(symbols):
    ids = []
    for x in symbols:
        try:
            my_id = symbol_to_id[x.lower()]
            ids.append(my_id)
        except KeyError as e:
            # print(e)
            pass
    return ids

In [18]:
symbol_to_id = gaf_symbol_to_id('zfin.gaf')

In [24]:
# here we use all genes in the study as the background, see https://www.biostars.org/p/17628/
background_symbols = load_background_symbols('zebrafish.txt')
background_ids = to_id(background_symbols)
len(background_ids)

8242

#### Initialise GOEA object

In [28]:
from goatools.go_enrichment import GOEnrichmentStudy
goeaobj = GOEnrichmentStudy(
        background_ids,
        associations,
        ontologies, 
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # defult multipletest correction method


Load GOEA Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 89%  7,295 of  8,242 population items found in association


In [29]:
geneids_study = background_ids[0:400]
geneids_study

['ZDB-GENE-060824-3',
 'ZDB-GENE-030912-4',
 'ZDB-GENE-040426-903',
 'ZDB-GENE-130530-713',
 'E7F5G8',
 'ZDB-GENE-081105-101',
 'ZDB-GENE-091118-25',
 'ZDB-GENE-040329-1',
 'ZDB-GENE-991019-6',
 'ZDB-GENE-050913-36',
 'ZDB-GENE-061220-8',
 'ZDB-GENE-031006-4',
 'ZDB-GENE-030131-9790',
 'ZDB-GENE-031006-12',
 'ZDB-GENE-050517-1',
 'ZDB-GENE-050517-2',
 'ZDB-GENE-050517-3',
 'ZDB-GENE-050517-4',
 'ZDB-GENE-050517-5',
 'ZDB-GENE-040525-2',
 'ZDB-GENE-050517-14',
 'ZDB-GENE-080204-52',
 'ZDB-GENE-050517-9',
 'ZDB-GENE-070912-584',
 'ZDB-GENE-050517-10',
 'ZDB-GENE-050410-6',
 'ZDB-GENE-050517-12',
 'ZDB-GENE-050517-15',
 'ZDB-GENE-050517-25',
 'ZDB-GENE-040426-1523',
 'ZDB-GENE-050517-16',
 'ZDB-GENE-030616-511',
 'ZDB-GENE-050517-17',
 'ZDB-GENE-050517-22',
 'E7F108',
 'ZDB-GENE-050517-23',
 'ZDB-GENE-050517-27',
 'ZDB-GENE-050517-28',
 'ZDB-GENE-040426-2868',
 'ZDB-GENE-050517-29',
 'ZDB-GENE-050517-30',
 'ZDB-GENE-050517-31',
 'ZDB-GENE-070424-84',
 'ZDB-GENE-050517-35',
 'ZDB-GENE-0505

#### Load actual study genes from the Zebrafish paper

In [33]:
df = pd.read_pickle('C:\\Users\\joewa\\Work\\git\\WebOmics\\web_omics\\notebooks\\gene_ontology\\selection_df.p')
df.head()

Unnamed: 0_level_0,obs,gene_pk,US_1584693,US_1584700,US_1584706,US_1584712,US_1584722,US_1584724,US_1584725,US_1584732,...,US_1584753,US_1584754,US_1584758,US_1584765,padj_Distal_vs_Middle,FC_Distal_vs_Middle,significant_all,significant_any,padj_Proximal_vs_Middle,FC_Proximal_vs_Middle
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Myh10,True,ENSDARG00000000103,2214,1245,1307,849,674,2277,1345,2316,...,1313,738,2332,780,8.019571e-18,0.567173,True,True,2.561093e-35,-0.780711
Me3,True,ENSDARG00000002305,930,1020,1061,744,666,913,1468,978,...,1076,684,914,848,1.472293e-05,0.370251,True,True,0.01263625,0.246983
Ak4,True,ENSDARG00000006546,1409,842,990,578,616,1222,946,1177,...,823,688,1182,593,0.003571892,0.285288,True,True,1.167698e-06,-0.465605
Fgb,True,ENSDARG00000008969,0,0,0,1,0,0,0,0,...,0,0,2,0,,-0.800992,False,False,,-1.182585
Vim,True,ENSDARG00000010008,832,240,304,166,149,667,319,572,...,298,202,803,225,0.007563848,0.393427,True,True,1.705657e-24,-1.310299


In [34]:
genesymbols_study = df.index.values
geneids_study = to_id(genesymbols_study)
len(geneids_study)

48

#### Run GO Analysis

In [35]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]


Run GOEA Gene Ontology Analysis: current study set of 48 IDs ...
 88%     42 of     48 study items found in association
100%     48 of     48 study items found in population(8242)
Calculating 4,226 uncorrected p-values using fisher_scipy_stats
   4,226 GO terms are associated with  7,230 of  8,242 population items
     179 GO terms are associated with     42 of     48 study items
  METHOD fdr_bh:
      15 GO terms found significant (< 0.05=alpha) ( 15 enriched +   0 purified): statsmodels fdr_bh
       8 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)


In [36]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
plot_results("zebrafish_{NS}.png", goea_results_sig)

   15 usr 132 GOs  WROTE: zebrafish_BP.png


In [37]:
goea_results_sig

[GOEnrichmentRecord(GO:0006559),
 GOEnrichmentRecord(GO:0010873),
 GOEnrichmentRecord(GO:0046889),
 GOEnrichmentRecord(GO:0034380),
 GOEnrichmentRecord(GO:0032374),
 GOEnrichmentRecord(GO:0009072),
 GOEnrichmentRecord(GO:0043691),
 GOEnrichmentRecord(GO:0033700),
 GOEnrichmentRecord(GO:0042157),
 GOEnrichmentRecord(GO:0019433),
 GOEnrichmentRecord(GO:0034372),
 GOEnrichmentRecord(GO:0070328),
 GOEnrichmentRecord(GO:0033344),
 GOEnrichmentRecord(GO:0006695),
 GOEnrichmentRecord(GO:0006869)]

In [50]:
goea_results_sig[0]

GOEnrichmentRecord(GO:0006559)

In [None]:
prt_txt(prt, nts, prtfmt)

#### Try using GOEnrichmentStudyNS

In [40]:
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [41]:
goeaobj = GOEnrichmentStudyNS(
        background_ids,
        associations,
        ontologies, 
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # defult multipletest correction method


Load A0A0A0MPF8 Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact


AttributeError: 'set' object has no attribute 'items'

In [45]:
def get_goeaobj_nbt3102(method='fdr_bh'):
    """Return GOEA Object ready to run Nature data."""
    from goatools.obo_parser import GODag
    from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
    from goatools.base import download_go_basic_obo, download_ncbi_associations
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    # Load Ontologies
    obo_fname = download_go_basic_obo()
    obodag = GODag("go-basic.obo")
    # Load Associations
    download_ncbi_associations() # Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader("gene2go", taxids=[10090])
    # Get associations for each branch of the GO DAG (BP, MF, CC)
    ns2assoc = objanno.get_ns2assc()
    # GOE Object holds Ontologies, Associations, and Background gene set
    return GOEnrichmentStudyNS(
        GeneID2nt_mus.keys(), # Background gene set: mouse protein-coding genes
        ns2assoc, # geneid/GO Associations for BP, MF, anc CC GODAG branches
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = [method]) # defult multipletest correction method

In [46]:
goea = get_goeaobj_nbt3102()

  EXISTS: go-basic.obo
go-basic.obo: fmt(1.2) rel(2019-07-01) 47,413 GO Terms
FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz


Traceback (most recent call last):
  File "c:\users\joewa\.virtualenvs\web_omics-makc_z5x\lib\site-packages\goatools\base.py", line 220, in dnld_file
    rsp = http_get(src_ftp, dst_wget) if src_ftp[:4] == 'http' else ftp_get(src_ftp, dst_wget)
  File "c:\users\joewa\.virtualenvs\web_omics-makc_z5x\lib\site-packages\goatools\base.py", line 203, in ftp_get
    ftp.retrbinary(cmd, open(fout, 'wb').write)  #           /usr/home/gene2go.gz
  File "C:\Users\joewa\AppData\Local\Programs\Python\Python37\Lib\ftplib.py", line 442, in retrbinary
    with self.transfercmd(cmd, rest) as conn:
  File "C:\Users\joewa\AppData\Local\Programs\Python\Python37\Lib\ftplib.py", line 399, in transfercmd
    return self.ntransfercmd(cmd, rest)[0]
  File "C:\Users\joewa\AppData\Local\Programs\Python\Python37\Lib\ftplib.py", line 361, in ntransfercmd
    source_address=self.source_address)
  File "C:\Users\joewa\AppData\Local\Programs\Python\Python37\Lib\socket.py", line 727, in create_connection
    raise err

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
