In [1]:
import pandas as pd
import numpy as np
from collections import Counter

from scipy.stats import binom
from statsmodels.stats.multitest import multipletests

### Parse H37Rv GO MF data

In [4]:
h37rv_data = pd.read_csv("input/H37Rv_pantherGeneList.txt", sep="\t", header=None)
h37rv_data.columns = ["id", "id1", "gene_name", "panther_family", "panther_subclass", "panther_MF", "GO_MF"]
h37rv_data = h37rv_data[["id", "gene_name", "panther_MF", "GO_MF"]].fillna("")

# extract the locus ids
h37rv_data["gene_id"] = [x.split("|")[1].split("=")[-1] for x in h37rv_data.id]

# split the GO MF's into parseable lists
h37rv_data["GO_MF_split"] = [x.split(";") for x in h37rv_data.GO_MF]

def get_unique_GO_MFs(data):
    all_unique_GO_MF = []
    for x in data.GO_MF_split:
        all_unique_GO_MF += x
    all_unique_GO_MF = list(set(all_unique_GO_MF))
    return all_unique_GO_MF

all_unique_GO_MF = get_unique_GO_MFs(h37rv_data)


def count_GO_MFs(all_unique_GO_MF, data):
    # make a dictionary where each GO MF points to the number of occurences of that GO MF in the dataset
    N_per_GO_MF = {}
    for go_mf in all_unique_GO_MF:
        count = 0
        for x in data.GO_MF_split:
            if go_mf in x:
                count+=1
        N_per_GO_MF[go_mf] = count
    return N_per_GO_MF

len(all_unique_GO_MF)

1163

### GO analysis of "after-antibiotic" hits

In [5]:
hits = pd.read_csv("output/05.antibiotic/antibiotic_hits.csv", index_col=0)

gene_identifiers = pd.read_csv("output/03.annotation/snps_with_all_annotation.csv")[["pos","gene_id"]]
hits = hits.merge(gene_identifiers, left_on="position_i", right_on="pos", how='left')

all_hits = hits

In [6]:
# Get each GO MF in the test set

unique_hit_table = all_hits[["gene_id", "position_i"]]

all_genes = list(unique_hit_table.gene_id) 
print(len(all_genes))
all_genes = pd.DataFrame(all_genes)
all_genes.columns = ["gene_id"]

all_genes= all_genes.merge(h37rv_data[["gene_id", "GO_MF_split"]], how="left")

# Removes genes with no GO mapping
all_genes = all_genes.dropna()

# Count the occurences in the test set
all_unique_GO_MF_test = get_unique_GO_MFs(all_genes)
N_per_GO_MF_test = count_GO_MFs(all_unique_GO_MF_test, all_genes)

# Occurences in dataset
N_per_GO_MF_database = count_GO_MFs(all_unique_GO_MF, h37rv_data)

# Binomial n, p, k -> N = number of draws, p = probability of success, k = number of draws with that GO category
# p = # of genes with that GO category / total number of genes
# N = test set size
# K = N_per_GO_MF_test[# of genes with that GO category]

# Dataframe of GO_MF, N, p, k, binomial cdf

binomial_df = pd.DataFrame()
binomial_df["GO_MF"] = all_unique_GO_MF_test
binomial_df["k"] = [N_per_GO_MF_test[x] for x in binomial_df.GO_MF]
binomial_df["N"] = len(all_genes)
binomial_df["n_per_database"] = [N_per_GO_MF_database[x] for x in binomial_df.GO_MF]
total_number_of_genes = len(h37rv_data)
binomial_df["p"] = binomial_df.n_per_database/total_number_of_genes

binomial_df["binomial_probability"] = [1-binom.cdf(x, n, p) for x, n, p in zip(binomial_df.k, binomial_df.N, binomial_df.p)]

# Note that we're only testing for enrichment, not depletion here

binomial_df["binomial_fdr"] = multipletests(binomial_df.binomial_probability, alpha=0.01, method="fdr_bh")[1]
binomial_df["binomial_is_sig"] = multipletests(binomial_df.binomial_probability, alpha=0.01, method="fdr_bh")[0]
binomial_df.sort_values("binomial_probability").head(10)


1003


Unnamed: 0,GO_MF,k,N,n_per_database,p,binomial_probability,binomial_fdr,binomial_is_sig
258,fatty acid biosynthetic process(GO:0006633),54,785,73,0.018287,1.110223e-16,9.695948e-15,True
253,DNA-templated transcription(GO:0006351),23,785,5,0.001253,1.110223e-16,9.695948e-15,True
140,response to host immune response(GO:0052572),76,785,106,0.026553,1.110223e-16,9.695948e-15,True
125,D-alanine biosynthetic process(GO:0030632),10,785,1,0.000251,3.330669e-16,1.745271e-14,True
133,alanine metabolic process(GO:0006522),10,785,1,0.000251,3.330669e-16,1.745271e-14,True
77,response to antibiotic(GO:0046677),48,785,73,0.018287,3.042011e-13,1.328345e-11,True
95,ADP biosynthetic process(GO:0006172),8,785,1,0.000251,9.714451e-13,2.827985e-11,True
138,regulation of DNA replication(GO:0006275),8,785,1,0.000251,9.714451e-13,2.827985e-11,True
85,fatty-acyl-CoA biosynthetic process(GO:0046949),8,785,1,0.000251,9.714451e-13,2.827985e-11,True
112,NAD catabolic process(GO:0019677),7,785,1,0.000251,4.500977e-11,1.072051e-09,True


## Analysis of non-antibiotic, non-antigen kits

In [10]:
all_hits_sequential = pd.read_csv("output/04A.pair_annotation/results_significant_annotated.csv", index_col = 0)
all_simultaneous = pd.read_csv("output/04B.pair_annotation/results_significant_annotated.csv", index_col = 0)

sequential_merged = all_hits_sequential.merge(all_simultaneous[["position_i", "position_j", "pval_beta_i_on_j", 'pval_beta_i_on_j_BH_sig']], 
                                                on=["position_i", "position_j"], how="left", suffixes=["_sequential", "_simultaneous"])

simultaneous_merged = all_simultaneous.merge(all_hits_sequential[["position_i", "position_j", "pval_beta_i_on_j", 'pval_beta_i_on_j_BH_sig']], 
                                                on=["position_i", "position_j"], how="left", suffixes=[ "_simultaneous", "_sequential"])

all_hits = pd.concat([simultaneous_merged, sequential_merged])
all_hits = all_hits.drop_duplicates(subset=["position_i", "position_j"]).reset_index(drop=True)

gene_identifiers = pd.read_csv("output/03.annotation/snps_with_all_annotation.csv")[["pos","gene_id"]]
all_hits = all_hits.merge(gene_identifiers, left_on="position_i", right_on="pos", how='left')



In [11]:
# Get each GO MF in the test set

unique_hit_table = all_hits[["gene_id", "position_i"]]

all_genes = list(unique_hit_table.gene_id) 
print(len(all_genes))
all_genes = pd.DataFrame(all_genes)
all_genes.columns = ["gene_id"]

all_genes= all_genes.merge(h37rv_data[["gene_id", "GO_MF_split"]], how="left")

# Removes genes with no GO mapping
all_genes = all_genes.dropna()

# Count the occurences in the test set
all_unique_GO_MF_test = get_unique_GO_MFs(all_genes)
N_per_GO_MF_test = count_GO_MFs(all_unique_GO_MF_test, all_genes)

# Occurences in dataset
N_per_GO_MF_database = count_GO_MFs(all_unique_GO_MF, h37rv_data)

# Binomial n, p, k -> N = number of draws, p = probability of success, k = number of draws with that GO category
# p = # of genes with that GO category / total number of genes
# N = test set size
# K = N_per_GO_MF_test[# of genes with that GO category]

# Dataframe of GO_MF, N, p, k, binomial cdf

binomial_df = pd.DataFrame()
binomial_df["GO_MF"] = all_unique_GO_MF_test
binomial_df["k"] = [N_per_GO_MF_test[x] for x in binomial_df.GO_MF]
binomial_df["N"] = len(all_genes)
binomial_df["n_per_database"] = [N_per_GO_MF_database[x] for x in binomial_df.GO_MF]
total_number_of_genes = len(h37rv_data)
binomial_df["p"] = binomial_df.n_per_database/total_number_of_genes

binomial_df["binomial_probability"] = [1-binom.cdf(x, n, p) for x, n, p in zip(binomial_df.k, binomial_df.N, binomial_df.p)]

# Note that we're only testing for enrichment, not depletion here

binomial_df["binomial_fdr"] = multipletests(binomial_df.binomial_probability, alpha=0.01, method="fdr_bh")[1]
binomial_df["binomial_is_sig"] = multipletests(binomial_df.binomial_probability, alpha=0.01, method="fdr_bh")[0]
binomial_df.sort_values("binomial_probability").head(10)


88830


Unnamed: 0,GO_MF,k,N,n_per_database,p,binomial_probability,binomial_fdr,binomial_is_sig
553,cellular oxidant detoxification(GO:0098869),1026,67120,28,0.007014,1.110223e-16,9.1285e-16,True
552,DNA-templated transcription(GO:0006351),4195,67120,5,0.001253,1.110223e-16,9.1285e-16,True
549,dephosphorylation(GO:0016311),602,67120,8,0.002004,1.110223e-16,9.1285e-16,True
545,amino acid activation for nonribosomal peptide...,134,67120,3,0.000752,1.110223e-16,9.1285e-16,True
197,positive regulation of growth(GO:0045927),931,67120,28,0.007014,1.110223e-16,9.1285e-16,True
199,DNA topological change(GO:0006265),1278,67120,3,0.000752,1.110223e-16,9.1285e-16,True
540,RNA phosphodiester bond hydrolysis(GO:0090501),3526,67120,52,0.013026,1.110223e-16,9.1285e-16,True
187,heterophilic cell-cell adhesion via plasma mem...,256,67120,2,0.000501,1.110223e-16,9.1285e-16,True
532,nucleic acid phosphodiester bond hydrolysis(GO...,6371,67120,115,0.028808,1.110223e-16,9.1285e-16,True
526,glucan biosynthetic process(GO:0009250),143,67120,3,0.000752,1.110223e-16,9.1285e-16,True
