In [1]:
import pandas as pd
import numpy as np
from scipy.stats.contingency import odds_ratio
import forestplot as fp
from scipy.stats import fisher_exact

import matplotlib
import matplotlib.pyplot as plt
# Font settings for exported figures. fonttype 42 makes the PDF/PS backends
# embed TrueType fonts (see the matplotlib rcParams documentation).
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # missing fonts:: https://alexanderlabwhoi.github.io/post/2021-03-missingfont/
    'font.sans-serif': "Arial",
    # Then, "ALWAYS use sans-serif fonts"
    'font.family': "sans-serif",
})
# Text size plus axes-spine and major-tick dimensions used by every figure.
matplotlib.rcParams.update({
    'font.size': 14,
    'axes.linewidth': 2,
    'xtick.major.width': 1.5,
    'xtick.major.size': 7,
    'ytick.major.width': 1.5,
    'ytick.major.size': 7,
})
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
def save_pdf(save_file, fig):
    """Save one matplotlib figure as a PDF file.

    Parameters
    ----------
    save_file : str or path-like
        Destination of the PDF; handed directly to ``PdfPages``.
    fig : matplotlib.figure.Figure
        Figure to write; it becomes the only page of the document.

    Returns
    -------
    None
    """
    # Using PdfPages as a context manager guarantees the file handle is closed
    # even when savefig raises (e.g. a rendering error), instead of leaking it.
    with PdfPages(save_file) as pdf:
        pdf.savefig(fig, bbox_inches='tight', dpi=300)

In [3]:
# Root directory shared by every input path below. Keeping it in one place
# means relocating the data only requires editing this single value.
PROJECT_ROOT = "/data6/deepro/ukb_bmi"
# Sub-tree that holds the gene lists and the genotype table.
PREP_DIR = f"{PROJECT_ROOT}/0_data_preparation_and_download"

# CSV with the Sample_Name and Output_BMI columns read further down.
case_controls_file = f"{PROJECT_ROOT}/1_data_processing/data/british/case_controls.csv"

# Gene list files, one per study.
study_genes_list_files = [
    f"{PREP_DIR}/bmi_genes/akbari_2021/data/akbari_genes.list",
    f"{PREP_DIR}/bmi_genes/turcot_2018/data/turcot_genes.list",
]

# Combination result tables that get concatenated into a single frame later.
combo_files = [
    f"{PROJECT_ROOT}/2_rarecomb/data/british/combo2.csv",
    f"{PROJECT_ROOT}/2_rarecomb/data/british/combo3.csv",
]

# Gzip-compressed CSV loaded into genotype_df.
genotype_file = f"{PREP_DIR}/genotype/data/processed_burden/all_gene_burden.csv.gz"

In [4]:
# Load the case/control table; its Sample_Name and Output_BMI columns are
# used to build the case and control sample sets.
case_cont_df = pd.read_csv(filepath_or_buffer=case_controls_file)

In [5]:
# Split sample IDs by BMI label (1 -> case, 0 -> control). IDs are cast to str
# so they can be matched against the "|"-delimited ID strings in the combo tables.
case_samples, control_samples = (
    set(case_cont_df.loc[case_cont_df["Output_BMI"] == label, "Sample_Name"].astype("str").values)
    for label in (1, 0)
)

In [6]:
def get_gene_set(gene_file):
    """Read a gene list file into a set of gene names.

    Each line is stripped of surrounding whitespace and treated as one entry.
    Blank or whitespace-only lines are skipped so the result never contains
    an empty-string "gene".

    Parameters
    ----------
    gene_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    set of str
        The distinct, non-empty stripped lines of the file.
    """
    with open(gene_file, "r") as f:
        # Iterate the handle lazily instead of materialising readlines().
        stripped_lines = (line.strip() for line in f)
        return {gene for gene in stripped_lines if gene}

In [7]:
# One gene set per study list file and one DataFrame per combo CSV, each in
# the same order as its list of paths.
study_genes = list(map(get_gene_set, study_genes_list_files))
combo_dfs = list(map(pd.read_csv, combo_files))

In [8]:
# Load the table at genotype_file. The path ends in ".gz"; read_csv's default
# compression="infer" decompresses it based on that suffix.
genotype_df = pd.read_csv(filepath_or_buffer=genotype_file)

In [38]:
def _split_sample_ids(sample_field):
    """Return the set of non-empty IDs in a "|"-delimited field (empty set if missing)."""
    if pd.isnull(sample_field):
        return set()
    return {sample_id for sample_id in sample_field.split("|") if sample_id}


def create_contingency_tables_combos(combo_row, case_samples, control_samples):
    """Compute the odds ratio of carrying a combo in cases versus controls.

    The 2x2 table is laid out as::

        [[cases with combo,    cases without combo],
         [controls with combo, controls without combo]]

    Parameters
    ----------
    combo_row : row-like object
        Must expose ``Case_Samples`` and ``Control_Samples`` attributes, each a
        "|"-delimited string of sample IDs or a missing value (NaN/None).
    case_samples : set of str
        IDs of all case samples.
    control_samples : set of str
        IDs of all control samples.

    Returns
    -------
    float
        ``statistic`` of ``scipy.stats.contingency.odds_ratio`` for the table,
        using scipy's default estimator.
    """
    # A missing value in either column is treated as "no samples"; previously
    # only Control_Samples was guarded and a missing Case_Samples raised.
    combo_samples = _split_sample_ids(combo_row.Case_Samples) | _split_sample_ids(combo_row.Control_Samples)

    n_case_with = len(case_samples.intersection(combo_samples))
    n_control_with = len(control_samples.intersection(combo_samples))

    # The "without" counts are the complements, so derive them by subtraction
    # rather than building two large set differences just to measure them.
    contingency_table = np.array([
        [n_case_with, len(case_samples) - n_case_with],
        [n_control_with, len(control_samples) - n_control_with],
    ])
    return odds_ratio(contingency_table).statistic

In [47]:
# Stack the per-file combo tables, keeping only the columns used downstream.
# Each file's original row labels are retained (no ignore_index), so index
# values may repeat across files.
combo_df = pd.concat(
    objs=[
        cdf[["uniq_items", "Case_Samples", "Control_Samples", "Effect_Size", "Case_Adj_Pval_BH"]]
        for cdf in combo_dfs
    ]
)

In [48]:
# Row-wise odds ratio for every combo. Despite the name, the result is a
# Series (one value per row of combo_df), not a DataFrame.
odd_df = combo_df.apply(
    lambda combo_row: create_contingency_tables_combos(combo_row, case_samples, control_samples),
    axis=1,
)

In [52]:
# Preview the 20 rows with the largest Effect_Size; ties are ordered by
# Case_Adj_Pval_BH, also descending.
combo_df.sort_values(by=["Effect_Size", "Case_Adj_Pval_BH"], ascending=[False, False]).iloc[:20]

Unnamed: 0,uniq_items,Case_Samples,Control_Samples,Effect_Size,Case_Adj_Pval_BH
362,Input_DNAH5|Input_ST3GAL5,1343718|2264340|3544655|1224348|1000867|442603...,,0.022041,0.048798
178,Input_FAHD1|Input_SLC45A4,1807381|2803018|5337806|5121356|4020475|177600...,,0.022041,0.038261
278,Input_ABCB6|Input_LIPE,5313055|1596708|3448008|2043246|3106915|108336...,,0.022041,0.015488
237,Input_ADAMTSL3|Input_HFM1,4137274|1708301|2040349|2459979|1814897|238745...,,0.022041,0.006728
111,Input_ANK2|Input_FTCD,1543135|3608051|5980076|3422681|1158222|481438...,1739240|1411388,0.021515,0.044444
392,Input_GLIS1|Input_NPHP4,5980939|3939865|3479599|5839266|5263044|411891...,,0.02091,0.025076
381,Input_ATP2B2|Input_PEX5,3761613|3444798|4179106|5225080|5629177|314501...,,0.02091,0.016121
17,Input_SPINK5|Input_UGT1A9,3491522|1194757|1374898|2100292|2647505|517728...,,0.02091,0.00361
72,Input_ALPI|Input_INCA1,1526472|5855850|2488466|5778990|5378349|169597...,,0.02091,0.002064
363,Input_MTMR3|Input_TENM2,5231591|5855850|4563958|4461527|4998327|264334...,5868031|1429251|3497814|2615124|3240132|4607929,0.020192,0.027898
