# Gene Ontology (GO) Enrichment Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import  hypergeom
from statsmodels.stats.multitest import multipletests

### 0. Generate a parsed GO base file

In [None]:
# Minimal parser for the "tag: value" lines of an OBO file.
# Assumption (unchanged from the first version): every GO stanza lists `id`,
# `name` and `namespace` before its `def` line, so the `def` line is treated as
# the end of a record.
records = []   # one dict per parsed GO stanza, in file order
current = {}   # fields collected so far for the stanza being read

# `with` closes the file handle even if parsing raises part-way through.
# assumes the OBO file is UTF-8 encoded -- TODO confirm for other sources
with open('go-basic.obo', encoding="utf-8") as obo_file:
    for row in obo_file:
        # Split on the FIRST ": " only. Names and definitions regularly contain
        # ": " themselves (e.g. "Catalysis of the reaction: ..."); splitting on
        # every occurrence silently truncated those values.
        tag, sep, value = row.rstrip("\n").partition(": ")
        if not sep:
            continue  # not a tag-value line (stanza header, blank line, ...)
        if tag in ("id", "name", "namespace"):
            current[tag] = value
        elif tag == "def":
            # The definition text is the first double-quoted span on the line.
            # NOTE(review): a definition containing an escaped quote (\") would
            # still be cut at that quote -- confirm whether the file has any.
            current["def"] = value.split('"')[1]
            records.append(current)
            current = {}

# Build the frame once from the collected records instead of concatenating a
# one-row frame per term inside the loop (quadratic in the number of terms).
obo_df = pd.DataFrame(records, columns=["id", "name", "namespace", "def"])
# The row index is written as an unnamed first column, which read_csv reports
# as "Unnamed: 0" when the file is loaded again.
obo_df.to_csv("go-basic-parsed.csv")

### 1. Load the go-basic-parsed.csv file

In [1]:
# An external OBO library (https://github.com/cmungall/obo) was tried first but
# raised an error, so the table comes from the home-made parser's CSV instead.
obo_df = pd.read_csv("go-basic-parsed.csv")
# The CSV was saved together with its row index, which comes back as a junk
# "Unnamed: 0" column. DataFrame.drop returns a NEW frame, so the result must be
# assigned back -- otherwise the junk column stays in obo_df and is carried into
# the enrichment results by the later merge. errors="ignore" keeps this cell
# working if the CSV is ever regenerated without the index column.
obo_df = obo_df.drop(columns="Unnamed: 0", errors="ignore")
obo_df

Unnamed: 0,id,name,namespace,def
0,GO:0000001,mitochondrion inheritance,biological_process,"The distribution of mitochondria, including th..."
1,GO:0000002,mitochondrial genome maintenance,biological_process,The maintenance of the structure and integrity...
2,GO:0000003,reproduction,biological_process,The production of new individuals that contain...
3,GO:0000005,obsolete ribosomal chaperone activity,molecular_function,OBSOLETE. Assists in the correct assembly of r...
4,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,Enables the transfer of zinc ions (Zn2+) from ...
...,...,...,...,...
47412,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,The chemical reactions and pathways involving ...
47413,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,The chemical reactions and pathways resulting ...
47414,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,The chemical reactions and pathways resulting ...
47415,GO:2001316,kojic acid metabolic process,biological_process,The chemical reactions and pathways involving ...


### 2. Create a genome specific GO term dictionary 
Note: this works with Phytozome annotation files that have all GO terms for each gene in a single cell

In [3]:
# Tab-separated gene-to-GO annotation table; preview the first five rows
gene_go_terms = pd.read_csv("GOs_agpv4.tsv", delimiter="\t")
gene_go_terms.head(n=5)

Unnamed: 0,genes,term,Unnamed: 2
0,Zm00001d000001,"GO:0004097,GO:0004503,GO:0046872,GO:0052716,GO...",
1,Zm00001d000002,"GO:0004373,GO:0009011,GO:2001070,GO:0005739,GO...",
2,Zm00001d000003,"GO:0005488,GO:0044237",
3,Zm00001d000004,"GO:0005730,GO:0005737,GO:0044463,GO:0008017,GO...",
4,Zm00001d000005,"GO:0007155,GO:0009913,GO:0010090,GO:0044237,GO...",


In [6]:
# Invert the annotation table into {GO term ID: [gene IDs annotated with it]}.
go_dict = {}

# Columns are addressed by position, as before: first column holds the gene ID,
# second column holds that gene's comma-separated GO IDs. Pulling the two
# columns out once avoids building a Series per row with iloc, and avoids the
# deprecated integer-as-position lookup `series[1]`.
gene_ids = gene_go_terms.iloc[:, 0]
term_cells = gene_go_terms.iloc[:, 1]

for gene_id, term_cell in zip(gene_ids, term_cells):
    if pd.isna(term_cell):
        continue  # gene without any GO annotation; nothing to record
    # dict.fromkeys removes repeated IDs within one cell (order preserved) so a
    # gene is never counted twice for the same term; strip() tolerates
    # "GO:1, GO:2"-style spacing.
    for go_term in dict.fromkeys(part.strip() for part in term_cell.split(",")):
        if go_term:
            go_dict.setdefault(go_term, []).append(gene_id)

# Hypergeometric test in Python

https://gist.github.com/fbrundu/cfa675c1d79b4ade4724
* M = Total number of genes
* n = Number of genes having GO term
* N = Number of DEGs
* k = Number of DEGs having GO term

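#### To calculate the hypergeometric p value:
* hpd = hypergeom(M, n, N)
* hpd.pmf(k) is the probability of observing exactly k DEGs with the GO term
* hpd.sf(k - 1) is the probability of observing k or more DEGs with the GO term, i.e. the over-representation (enrichment) p-value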

### 3. Prepare the DEG gene lists

In [22]:
# Population size: one row per gene in the GO annotation table
M = gene_go_terms.shape[0]

# Gene IDs come from the supplementary table; work on a copy and keep only the
# rows whose category is "treatment1"
degs_all = pd.read_csv("DEG_list.csv")
degs = degs_all.copy().loc[lambda table: table["category"] == "treatment1"]

# Sample size: number of DEG rows kept for this treatment
# NOTE(review): this counts every DEG row, including any gene that is absent
# from the annotation table or listed more than once -- confirm that is intended
N = degs.shape[0]
print("M:", M, "N:", N)

M: 39323 N: 275


### 4. Start the GO term enrichment analysis using hypergeom

In [23]:
# Over-representation test for every GO term in go_dict
# (go_dict maps a GO term ID to the list of genes annotated with it).
#   M = population size (all annotated genes)   N = number of DEGs
#   n = genes annotated with the term           k = DEGs annotated with the term
# One entry per GO term is appended to each list, in go_dict order.
n_list = []
k_list = []
go_list = []
pval_list = []
gene_list = []
for key, term_genes in go_dict.items():
    # DEGs annotated with this term; computed once and reused for k and the gene string
    deg_hits = degs.loc[degs["gene"].isin(term_genes), "gene"]
    n = len(term_genes)
    k = len(deg_hits)
    # The enrichment p-value is the upper tail P(X >= k), i.e. the survival
    # function evaluated at k - 1. pmf(k) is only the probability of observing
    # EXACTLY k hits, which is not a p-value and overstates significance.
    p = hypergeom.sf(k - 1, M, n, N)
    n_list.append(n)
    k_list.append(k)
    go_list.append(key)
    pval_list.append(p)
    gene_list.append(",".join(list(deg_hits)))

### 5. Prepare the GO term enrichment analysis results in a dataframe

In [24]:
# Benjamini-Hochberg FDR correction across all tested GO terms.
# fdr_results[0] is the boolean reject mask (default alpha of 0.05),
# fdr_results[1] holds the adjusted p-values.
fdr_results = multipletests(pval_list, method="fdr_bh")
reject_mask, adjusted_pvals = fdr_results[0], fdr_results[1]

# One row per GO term; M and N are constants repeated on every row
hg_columns = {
    "id": go_list,
    "pval": pval_list,
    "n": n_list,
    "k": k_list,
    "M": [M for _ in n_list],
    "N": [N for _ in n_list],
    "genes": gene_list,
}
df_hg = pd.DataFrame(hg_columns).assign(adj=adjusted_pvals)

# Keep only the terms that pass the FDR threshold
df_hg = df_hg.loc[reject_mask]

### 6. Calculate the GO term fold-enrichment

In [25]:
# Attach the ontology columns of obo_df to each significant term, matching on
# the GO ID. how="left" keeps every significant term: the default inner join
# silently drops any GO ID that appears in the gene annotation but has no row
# in obo_df. Such terms are kept with empty ontology columns instead.
df_hg = df_hg.merge(obo_df, on="id", how="left")
# Fold enrichment: share of the term's genes that are DEGs (k / n) relative to
# the share of all background genes that are DEGs (N / M)
df_hg["FC_enrichment"] = (df_hg["k"] / df_hg["n"]) / (N / M)

### 7. Write the GO term enrichment results to a CSV file

In [26]:
# df_hg inherits every column of obo_df through the merge, so a stray
# "Unnamed: 0" index column read back from go-basic-parsed.csv can end up in the
# results. Drop it when present (errors="ignore" makes this a no-op otherwise)
# and write the table as tab-separated text.
df_hg.drop(columns="Unnamed: 0", errors="ignore").to_csv("treatment1_go.tsv", sep="\t")