In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
from scipy.stats import kstest,ttest_ind,fisher_exact
from scipy.stats.contingency import odds_ratio
import itertools as it
from collections import Counter
import seaborn as sns
from matplotlib.ticker import MultipleLocator
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams.update({'font.size': 14, 'axes.linewidth': 2, 'xtick.major.width': 1.5, 'xtick.major.size': 7, 'ytick.major.width': 1.5, 'ytick.major.size': 7})
from matplotlib.backends.backend_pdf import PdfPages
from functools import reduce
from scipy.stats import kstest,ttest_ind

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Input locations: the training-cohort phenotype table and the two discovery combo tables.
phenotype_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/british/train_cohort_bmi.csv.gz"
combo_files = [
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo2.csv",
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo3.csv",
]

phenotype_df = pd.read_csv(phenotype_file)

# Stack the combo tables into one frame with a fresh 0..n-1 index.
# NOTE(review): the preview printed later in this notebook lists an "Unnamed: 0" column,
# which suggests these CSVs were written with their index - confirm whether that column is wanted.
combo_frames = [pd.read_csv(combo_file) for combo_file in combo_files]
combo_df = pd.concat(combo_frames, ignore_index=True)

# Each combo_samples entry is a "|"-separated string of sample IDs; collect the
# union of all IDs (kept as strings) across every combo.
all_combo_samples = {
    sample_id
    for sample_field in combo_df.combo_samples.values
    for sample_id in sample_field.split("|")
}

In [3]:
# Covariate columns passed to get_scaled_bmi below.
categorical_cols = ["genetic_sex"]
# NOTE(review): range(1, 40) yields genetic_pca1 .. genetic_pca39 - confirm that stopping at 39 is intended.
numerical_cols = ["age", *(f"genetic_pca{pc_index}" for pc_index in range(1, 40))]
# Columns used as regressors without re-scaling; currently none (previously considered: ["bmi_prs"]).
scaled_numerical_cols = []

def get_scaled_bmi(df, categorical_cols, numerical_cols, scaled_numerical_cols):
    """Standardize BMI and compute its residuals after regressing out covariates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "bmi" column plus every column named in the three lists.
    categorical_cols : list of str
        Columns that are label-encoded to integer codes before the regression.
    numerical_cols : list of str
        Columns that are standardized (StandardScaler) before the regression.
    scaled_numerical_cols : list of str
        Columns used as regressors as-is, with no further transformation.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the categorical columns are label-encoded, the
        numerical columns are standardized, and two columns are added:
        "bmi_scaled" (standardized BMI) and "bmi_residuals" (bmi_scaled minus
        the linear-regression prediction from the covariates).
    """
    # Work on a copy so the caller's frame keeps its raw (un-encoded, un-scaled)
    # values; previously the input frame was overwritten in place.
    df = df.copy()
    # define encoders
    en = LabelEncoder()
    scaler = StandardScaler()
    # transform the categorical columns to integer values (encoder is re-fit per column)
    for cat_col in categorical_cols:
        df[cat_col] = en.fit_transform(df[cat_col])
    # scale the numerical columns; skipped when the list is empty because the
    # scaler cannot be fit on a frame with zero columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df.loc[:, numerical_cols])
    # scale bmi separately (the scaler is re-fit on the bmi column alone)
    df["bmi_scaled"] = scaler.fit_transform(df.loc[:, ["bmi"]])
    # Create the target variable (bmi_residuals) using linear regression
    X = df.loc[:, categorical_cols + numerical_cols + scaled_numerical_cols]
    y = df.loc[:, 'bmi_scaled']
    model = LinearRegression()
    model.fit(X, y)
    # save the residuals for bmi
    df['bmi_residuals'] = y - model.predict(X)
    return df

# Add bmi_scaled / bmi_residuals (BMI with the covariates regressed out).
phenotype_df = get_scaled_bmi(phenotype_df, categorical_cols, numerical_cols, scaled_numerical_cols)

# Decile bins of the BMI residuals and of the BMI PRS: the interval-labelled
# version and the integer-coded version (0 = lowest decile, 9 = highest).
phenotype_df["bmi_res_decile"] = pd.qcut(phenotype_df.bmi_residuals, q=10)
phenotype_df["bmi_res_decile_num"] = pd.qcut(phenotype_df.bmi_residuals, q=10, labels=False)
phenotype_df["bmi_prs_decile"] = pd.qcut(phenotype_df.bmi_prs, q=10)
phenotype_df["bmi_prs_decile_num"] = pd.qcut(phenotype_df.bmi_prs, q=10, labels=False)

# Split samples by whether they carry any combo. IDs are compared as strings on
# both sides; the membership mask is computed once and reused for both halves.
is_combo_sample = phenotype_df.sample_names.astype(str).isin(list(map(str, all_combo_samples)))
# Take explicit copies: assigning a new column onto a .loc slice is what raised
# pandas' SettingWithCopyWarning here.
phenotype_combo_samples_df = phenotype_df.loc[is_combo_sample].copy()
phenotype_other_samples_df = phenotype_df.loc[~is_combo_sample].copy()

phenotype_other_samples_df["description"] = "Non Combo"
phenotype_combo_samples_df["description"] = "Combo"
phenotype_samples_df = pd.concat((phenotype_combo_samples_df, phenotype_other_samples_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_other_samples_df["description"] = "Non Combo"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_combo_samples_df["description"] = "Combo"


In [4]:
def get_table_info_helper(combo_samples, other_combo_samples, top_decile_samples):
    """Test one sample group against another for membership in a top decile.

    Builds the 2x2 contingency table

        [[combo in top,       combo not in top],
         [other combo in top, other combo not in top]]

    and evaluates it with scipy's ``fisher_exact`` (p-value) and
    ``odds_ratio`` (point estimate and 95% confidence interval).

    Parameters
    ----------
    combo_samples : set
        Sample IDs of the group of interest.
    other_combo_samples : set
        Sample IDs of the comparison group.
    top_decile_samples : iterable
        Sample IDs that fall in the top decile.

    Returns
    -------
    tuple
        (combo_in_top, combo_not_in_top, other_combo_in_top,
        other_combo_not_in_top, odds_ratio, ci_low, ci_high, p_value)
    """
    def _count_in_and_out(samples):
        # Number of samples inside / outside the top decile.
        inside = len(samples.intersection(top_decile_samples))
        outside = len(samples.difference(top_decile_samples))
        return inside, outside

    combo_counts = _count_in_and_out(combo_samples)
    other_counts = _count_in_and_out(other_combo_samples)
    contingency = [list(combo_counts), list(other_counts)]

    fisher_result = fisher_exact(contingency)
    or_estimate = odds_ratio(contingency)
    ci_low, ci_high = or_estimate.confidence_interval(confidence_level=0.95)

    return (
        *combo_counts,
        *other_counts,
        or_estimate.statistic,
        ci_low,
        ci_high,
        fisher_result.pvalue,
    )

def get_table_info(ser, all_combo_samples, top_decile_samples_bmi, top_decile_samples_bmi_prs):
    """Compute top-decile enrichment statistics for one combo row.

    The samples of this combo (``ser.combo_samples``, a "|"-separated string)
    are compared against the samples of all remaining combos, once for the
    BMI top-decile set and once for the BMI-PRS top-decile set.

    Parameters
    ----------
    ser : pandas.Series
        A row exposing a ``combo_samples`` string attribute.
    all_combo_samples : set
        Sample IDs (strings) belonging to any combo.
    top_decile_samples_bmi : set
        Sample IDs in the top BMI decile.
    top_decile_samples_bmi_prs : set
        Sample IDs in the top BMI-PRS decile.

    Returns
    -------
    pandas.Series
        The eight statistics returned by ``get_table_info_helper`` for BMI
        (keys ending in ``_bmi``), followed by the same eight for BMI PRS
        (keys ending in ``_bmi_prs``).
    """
    this_combo = set(ser.combo_samples.split("|"))
    remaining_combos = all_combo_samples.difference(this_combo)

    bmi_stats = get_table_info_helper(this_combo, remaining_combos, top_decile_samples_bmi)
    prs_stats = get_table_info_helper(this_combo, remaining_combos, top_decile_samples_bmi_prs)

    # Key order mirrors the order of the helper's return tuple.
    bmi_keys = (
        "combo_in_top_bmi", "combo_not_in_top_bmi",
        "other_combo_in_top_bmi", "other_combo_not_in_top_bmi",
        "odds_ratio_bmi", "ci_low_bmi", "ci_high_bmi", "p_value_bmi",
    )
    prs_keys = (
        "combo_in_top_bmi_prs", "combo_not_in_top_bmi_prs",
        "other_combo_in_top_bmi_prs", "other_combo_not_in_top_bmi_prs",
        "odds_ratio_bmi_prs", "ci_low_bmi_prs", "ci_high_bmi_prs", "p_value_bmi_prs",
    )
    return pd.Series(dict(zip(bmi_keys + prs_keys, tuple(bmi_stats) + tuple(prs_stats))))
    
    

# Sample IDs (as strings) in the highest decile (code 9 of 0-9) of the BMI
# residuals and of the BMI PRS, respectively.
in_top_bmi_decile = phenotype_samples_df.bmi_res_decile_num == 9
in_top_prs_decile = phenotype_samples_df.bmi_prs_decile_num == 9
top_decile_samples_bmi = set(
    phenotype_samples_df.loc[in_top_bmi_decile, "sample_names"].astype(str).values
)
top_decile_samples_bmi_prs = set(
    phenotype_samples_df.loc[in_top_prs_decile, "sample_names"].astype(str).values
)

In [5]:
# Compute the enrichment statistics row by row, then attach them to the combo
# table by matching on the shared row index.
enrichment_stats_df = combo_df.apply(
    get_table_info,
    axis=1,
    args=(all_combo_samples, top_decile_samples_bmi, top_decile_samples_bmi_prs),
)
combo_info_df = combo_df.merge(enrichment_stats_df, left_index=True, right_index=True)

In [6]:
# Preview the ten combos with the smallest BMI enrichment p-values.
top_hits_preview = combo_info_df.sort_values(by="p_value_bmi").head(10)
top_hits_preview

Unnamed: 0,uniq_items,combo_samples,combo_in_top_bmi,combo_not_in_top_bmi,other_combo_in_top_bmi,other_combo_not_in_top_bmi,odds_ratio_bmi,ci_low_bmi,ci_high_bmi,p_value_bmi,combo_in_top_bmi_prs,combo_not_in_top_bmi_prs,other_combo_in_top_bmi_prs,other_combo_not_in_top_bmi_prs,odds_ratio_bmi_prs,ci_low_bmi_prs,ci_high_bmi_prs,p_value_bmi_prs
102,Input_DUOX1|Input_MYH6,1003024|1011876|1062216|1240434|1272627|128030...,19.0,127.0,3180.0,6867.0,0.323092,0.187963,0.526787,3.248036e-07,16.0,130.0,1160.0,8887.0,0.942923,0.521517,1.596984,1.0
199,Input_LRP1B|Input_WWOX,1003321|1055145|1057484|1112246|1200560|121285...,17.0,80.0,3182.0,6914.0,0.46176,0.255952,0.78791,0.002791401,7.0,90.0,1169.0,8927.0,0.59397,0.231612,1.279026,0.203873
395,Input_ITGA6|Input_PDCD11,2105596|2145508|2640171|4944820|5305621,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,1.0,4.0,1175.0,9013.0,1.917503,0.038907,19.398016,0.458318
391,Input_ALAS2|Input_ANKRD16,1485402|1987487|4154752|5314047|5771642,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,2.0,3.0,1174.0,9014.0,5.117281,0.427001,44.71962,0.104929
344,Input_AFM|Input_SHPRH,1523412|1988675|2250580|3538759|5880788,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,1.0,4.0,1175.0,9013.0,1.917503,0.038907,19.398016,0.458318
912,Input_LRP2|Input_PKLR|Input_RYR3,1515431|1625646|1761141|4349008|4594650,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,1.0,4.0,1175.0,9013.0,1.917503,0.038907,19.398016,0.458318
756,Input_COL4A2|Input_MYH13|Input_SACS,2377194|2796902|4300021|4684127|5041821,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,3.0,2.0,1173.0,9015.0,11.522755,1.318553,138.091541,0.012799
766,Input_LRP1B|Input_MYO15A|Input_TENM2,2227744|2669838|4540934|5594465|5733061,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,1.0,4.0,1175.0,9013.0,1.917503,0.038907,19.398016,0.458318
724,Input_CAPN3|Input_COL5A1|Input_NAV2,1611646|2248003|2358834|4333681|5771078,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,0.0,5.0,1176.0,9012.0,0.0,0.0,8.372088,1.0
1150,Input_ABCA7|Input_LRP5|Input_TSC2,1155754|1370315|2343456|3537203|4181518,5.0,0.0,3194.0,6994.0,inf,2.004991,inf,0.003038288,2.0,3.0,1174.0,9014.0,5.117281,0.427001,44.71962,0.104929


In [7]:
# Write the per-combo enrichment table to disk; the row index is not written.
enrichment_output_file = "/data6/deepro/ukb_bmi/3_characterization/data/pilot/combo_top_decile_enrich.csv"
combo_info_df.to_csv(enrichment_output_file, index=False)