In [1]:
import pandas as pd
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

In [2]:
# Absolute paths to the input tables read in the loading cell below.
# GWAS gene-to-trait table; its `Gene` and `Mapped_Trait` columns are used by the enrichment function.
gwas_file = "/data5/deepro/ukbiobank/papers/bmi_project/0_data_download/published_studies/GWAS_Catalog/data/gwas_genes_to_traits.csv"
# Parsed gene-combination tables; the enrichment call pairs them with combination sizes 2 and 3.
combinations2_file = "/data5/deepro/ukbiobank/papers/bmi_project/3_run_rarecomb/white_british/data/parsed_tables/combo_2.csv"
combinations3_file = "/data5/deepro/ukbiobank/papers/bmi_project/3_run_rarecomb/white_british/data/parsed_tables/combo_3.csv"
# Parsed GENCODE gene table; provides the `gene_id_stripped` -> `gene_name` lookup.
gencode_file = "/data5/deepro/ukbiobank/papers/bmi_project/1_parse_data/prepare_gencode_genes/data/gencode.v39.parsed.genes.csv"

In [3]:
# Read the GWAS table and both combination tables with identical parser settings.
gwas_df, combinations2_df, combinations3_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (gwas_file, combinations2_file, combinations3_file)
)
# Keep the first row per stripped gene ID and index by that ID; drop=False
# leaves `gene_id_stripped` available as a regular column too.
gencode_df = (
    pd.read_csv(gencode_file)
    .drop_duplicates('gene_id_stripped')
    .set_index('gene_id_stripped', drop=False)
)

In [4]:
# get genes being studied
# nrows=0 parses only the header; every column after the first is one gene column
genes = pd.read_csv("/data5/deepro/ukbiobank/papers/bmi_project/1_parse_data/annotate_vcf/data/variants_by_gene/lof_missense_pred_freq_0.01_format2.tsv", sep='\t', nrows=0).columns[1:]
# the second underscore-separated field of each column name is the key into gencode_df
# (presumably the stripped gene ID -- verify against the table header)
genes = gencode_df.loc[[column.split('_')[1] for column in genes], 'gene_name'].tolist()

In [5]:
def create_gwas_enrichment_table(combo_dfs, ncombos_list, gwas_df, genes_study, save_file, min_genes_per_trait=10):
    """Test each GWAS trait for over-representation of combination genes.

    The gene universe is the set of genes present in both ``gwas_df`` and
    ``genes_study``. For every trait that has at least ``min_genes_per_trait``
    distinct genes in that universe, a 2x2 table (combo / non-combo gene versus
    has / lacks the trait) is tested with a one-sided Fisher's exact test
    (``alternative='greater'``). P-values are then adjusted across all tested
    traits with ``multipletests(..., method='fdr_bh')``.

    Parameters
    ----------
    combo_dfs : sequence of pandas.DataFrame
        Combination tables. The i-th table must have the columns
        ``Item_1_symbol`` ... ``Item_<n>_symbol`` where ``n = ncombos_list[i]``.
    ncombos_list : sequence of int
        Number of items per combination for each table in ``combo_dfs``.
    gwas_df : pandas.DataFrame
        Table with at least the columns ``Gene`` and ``Mapped_Trait``.
    genes_study : iterable
        Gene symbols analysed in the study.
    save_file : str or path-like or None
        Destination CSV for the table sorted by ``FDR``. If ``None`` nothing is
        written.
    min_genes_per_trait : int, default 10
        Minimum number of distinct universe genes a trait needs to be tested.

    Returns
    -------
    pandas.DataFrame
        One row per tested trait, in order of first appearance in ``gwas_df``
        (not sorted by FDR), with the odds ratio, p-value, the four cell counts
        of the contingency table and the ``FDR`` column.

    Raises
    ------
    ValueError
        If ``combo_dfs`` and ``ncombos_list`` differ in length.
    """
    combo_dfs = list(combo_dfs)
    ncombos_list = list(ncombos_list)
    # zip() would silently ignore the surplus tables/sizes, so fail loudly instead
    if len(combo_dfs) != len(ncombos_list):
        raise ValueError(
            f"combo_dfs has {len(combo_dfs)} tables but ncombos_list has {len(ncombos_list)} sizes"
        )

    # restrict to genes present both in GWAS and all analyzed in our study
    gwas_df = gwas_df.loc[gwas_df.Gene.isin(set(genes_study))]
    gwas_genes = set(gwas_df.Gene)

    # get list of unique genes in combos
    unique_combo_genes = set()
    for combo_df, ncombo in zip(combo_dfs, ncombos_list):
        symbol_columns = [f"Item_{i}_symbol" for i in range(1, ncombo + 1)]
        unique_combo_genes.update(combo_df[symbol_columns].to_numpy().ravel())
    # keep only those present in gwas database
    unique_combo_genes &= gwas_genes

    records = []
    # sort=False keeps traits in order of first appearance; grouping once avoids
    # re-scanning the whole frame for every trait
    for trait, trait_rows in gwas_df.groupby("Mapped_Trait", sort=False):
        all_genes_with_trait = set(trait_rows.Gene)
        # only keep phenotypes with enough *distinct* genes; counting rows would
        # let repeated (gene, trait) rows push a trait over the threshold
        if len(all_genes_with_trait) < min_genes_per_trait:
            continue
        all_genes_without_trait = gwas_genes - all_genes_with_trait
        combo_genes_with_trait = unique_combo_genes & all_genes_with_trait
        combo_genes_without_trait = unique_combo_genes - combo_genes_with_trait
        noncombo_genes_with_trait = all_genes_with_trait - unique_combo_genes
        noncombo_genes_without_trait = all_genes_without_trait - unique_combo_genes

        contingency_table = [[len(combo_genes_with_trait), len(combo_genes_without_trait)],
                             [len(noncombo_genes_with_trait), len(noncombo_genes_without_trait)]]

        oddsratio, pvalue = fisher_exact(contingency_table, alternative='greater')
        records.append([trait, oddsratio, pvalue,
                        len(combo_genes_with_trait), len(combo_genes_without_trait),
                        len(noncombo_genes_with_trait), len(noncombo_genes_without_trait)])

    stats = pd.DataFrame(records, columns=['gwas_phenotype', 'oddsratio', 'pvalue', 'Num_combo_genes_with_phenotype', 'Num_combo_genes_without_phenotype', 'Num_noncombo_genes_with_phenotype', 'Num_noncombo_genes_without_phenotype'])
    # multiple testing for stats df
    if stats.empty:
        # no trait passed the filter: there is nothing to correct
        stats['FDR'] = pd.Series(dtype=float)
    else:
        stats['FDR'] = multipletests(stats.pvalue, method='fdr_bh')[1]
    # save file
    if save_file is not None:
        stats.sort_values("FDR").to_csv(save_file, index=False)
    return stats

In [6]:
# Run the enrichment over both combination tables (sizes 2 and 3) and write the result to disk.
save_file = "/data5/deepro/ukbiobank/papers/bmi_project/4_characterization/white_british/data/enrichment/gwas_enrichment.csv"
stats = create_gwas_enrichment_table(
    combo_dfs=[combinations2_df, combinations3_df],
    ncombos_list=[2, 3],
    gwas_df=gwas_df,
    genes_study=genes,
    save_file=save_file,
)

In [7]:
# Traits whose adjusted p-value is below 0.05
stats[stats["FDR"] < 0.05]

Unnamed: 0,gwas_phenotype,oddsratio,pvalue,Num_combo_genes_with_phenotype,Num_combo_genes_without_phenotype,Num_noncombo_genes_with_phenotype,Num_noncombo_genes_without_phenotype,FDR
3,body height,2.187713,0.0001969847,32,127,1496,12989,0.033226
57,intraocular pressure measurement,3.501546,0.0003584181,12,147,330,14155,0.035692
70,urate measurement,2.803782,4.317373e-05,23,136,824,13661,0.010319
165,antibody measurement,27.836538,0.0003316356,3,156,10,14475,0.035692
167,gut microbiome measurement,4.13573,6.098362e-07,20,139,487,13998,0.000729
286,bone density,3.05905,0.0003040483,15,144,477,14008,0.035692
356,periodontitis,6.330441,2.878884e-05,9,150,136,14349,0.010319
397,executive function measurement,27.836538,0.0003316356,3,156,10,14475,0.035692
769,Epstein-Barr virus infection,10.191276,0.0002224329,5,154,46,14439,0.033226
826,BMI-adjusted waist circumference,3.162185,4.986423e-06,24,135,771,13714,0.002979
