In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
from scipy.stats import kstest,ttest_ind,fisher_exact
from scipy.stats.contingency import odds_ratio
import itertools as it
from collections import Counter
import seaborn as sns
from matplotlib.ticker import MultipleLocator
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams.update({'font.size': 14, 'axes.linewidth': 2, 'xtick.major.width': 1.5, 'xtick.major.size': 7, 'ytick.major.width': 1.5, 'ytick.major.size': 7})
from matplotlib.backends.backend_pdf import PdfPages
from functools import reduce
from scipy.stats import kstest,ttest_ind

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
phenotype_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/british/train_cohort_bmi.csv.gz"
combo_files = [
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo2.csv",
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo3.csv"
]

phenotype_df = pd.read_csv(phenotype_file)


combo_df = pd.concat([pd.read_csv(cf) for cf in combo_files]).reset_index(drop=True)
all_combo_samples = set("|".join(combo_df.combo_samples.values).split("|"))

In [3]:
categorical_cols = ["genetic_sex"]
numerical_cols = ["age"] + [f"genetic_pca{i}" for i in range(1, 40)]
scaled_numerical_cols = []#["bmi_prs"]

def get_scaled_bmi(df, categorical_cols, numerical_cols, scaled_numerical_cols):
    # define encoders
    en = LabelEncoder()
    scaler = StandardScaler()
    # select the categorical and numerical columns
    # transform the categorical columns to integer values
    for cat_col in categorical_cols:
        df[cat_col] = en.fit_transform(df[cat_col])
    # scale the numerical columns
    df[numerical_cols] = scaler.fit_transform(df.loc[:, numerical_cols])
    # scale bmi separately
    df["bmi_scaled"] = scaler.fit_transform(df.loc[:, ["bmi"]])
    # Create the target variable (bmi_residuals) using linear regression
    X = df.loc[:, categorical_cols + numerical_cols + scaled_numerical_cols]
    y = df.loc[:, 'bmi_scaled']
    model = LinearRegression()
    model.fit(X, y)
    # save the residuals for bmi
    df['bmi_residuals'] = y - model.predict(X)
    return df

phenotype_df = get_scaled_bmi(phenotype_df, categorical_cols, numerical_cols, scaled_numerical_cols)
phenotype_df["bmi_res_decile"] = pd.qcut(phenotype_df.bmi_residuals, q=10)
phenotype_df["bmi_res_decile_num"] = pd.qcut(phenotype_df.bmi_residuals, q=10, labels=False)
phenotype_df["bmi_prs_decile"] = pd.qcut(phenotype_df.bmi_prs, q=10)
phenotype_df["bmi_prs_decile_num"] = pd.qcut(phenotype_df.bmi_prs, q=10, labels=False)
phenotype_combo_samples_df = phenotype_df.loc[phenotype_df.sample_names.astype(str).isin(list(map(str, all_combo_samples)))]
phenotype_other_samples_df = phenotype_df.loc[~phenotype_df.sample_names.astype(str).isin(list(map(str, all_combo_samples)))]

phenotype_other_samples_df["description"] = "Non Combo"
phenotype_combo_samples_df["description"] = "Combo"
phenotype_samples_df = pd.concat((phenotype_combo_samples_df, phenotype_other_samples_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_other_samples_df["description"] = "Non Combo"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_combo_samples_df["description"] = "Combo"


In [4]:
def get_table_info_helper(combo_samples, other_combo_samples, top_decile_samples):
    combo_in_bottom = len(combo_samples.intersection(top_decile_samples))
    combo_not_in_bottom = len(combo_samples.difference(top_decile_samples))
    other_combo_in_bottom = len(other_combo_samples.intersection(top_decile_samples))
    other_combo_not_in_bottom = len(other_combo_samples.difference(top_decile_samples))
    table = [
        [combo_in_bottom, combo_not_in_bottom],
        [other_combo_in_bottom, other_combo_not_in_bottom]
    ]
    res = fisher_exact(table, alternative="greater")
    or_res = odds_ratio(table)
    ci_low, ci_high = or_res.confidence_interval(confidence_level=0.95)
    return combo_in_bottom, combo_not_in_bottom, other_combo_in_bottom, other_combo_not_in_bottom, or_res.statistic, ci_low, ci_high, res.pvalue

def get_table_info(ser, all_combo_samples, top_decile_samples_bmi_prs):
    combo_samples = set(ser.combo_samples.split("|"))
    other_combo_samples = all_combo_samples.difference(combo_samples)
    combo_in_bottom_bmi_prs, combo_not_in_bottom_bmi_prs, other_combo_in_bottom_bmi_prs, other_combo_not_in_bottom_bmi_prs, odds_ratio_bmi_prs, ci_low_bmi_prs, ci_high_bmi_prs, pvalue_bmi_prs = get_table_info_helper(combo_samples, other_combo_samples, top_decile_samples_bmi_prs)
    
    return pd.Series({
        "combo_in_bottom_bmi_prs": combo_in_bottom_bmi_prs, "combo_not_in_bottom_bmi_prs": combo_not_in_bottom_bmi_prs, 
        "other_combo_in_bottom_bmi_prs": other_combo_in_bottom_bmi_prs, "other_combo_not_in_bottom_bmi_prs": other_combo_not_in_bottom_bmi_prs,
        "odds_ratio_bmi_prs": odds_ratio_bmi_prs, "ci_low_bmi_prs": ci_low_bmi_prs, "ci_high_bmi_prs": ci_high_bmi_prs, "p_value_bmi_prs": pvalue_bmi_prs
        })
    
    

top_decile_samples_bmi_prs = set(phenotype_samples_df.loc[phenotype_samples_df.bmi_prs_decile_num==0, "sample_names"].astype(str).values)

In [5]:
combo_info_df = combo_df.merge(
    combo_df.apply(get_table_info, args=(all_combo_samples, top_decile_samples_bmi_prs), axis=1), left_index=True, right_index=True)

In [6]:
combo_df

Unnamed: 0,uniq_items,combo_samples
0,Input_BMPR1B|Input_SHC2,1096941|1121194|1997309|2079898|2120943|217223...
1,Input_BCHE|Input_TRPV4,1133669|1470103|2289357|2326174|2424337|249952...
2,Input_ABCA13|Input_DDX60L,1182163|1253768|1355755|1397710|1786412|216534...
3,Input_MYH14|Input_NR1D1,1010013|1062480|1156553|1363786|1544603|156336...
4,Input_ADAM19|Input_MMUT,1793868|1803901|2119938|2191681|2228888|235212...
...,...,...
1829,Input_CPT1B|Input_DRG1|Input_SFI1,1220595|2797301|3109180|3593971|4158131|432321...
1830,Input_ACAP3|Input_SLC7A8|Input_TAS1R3,1332204|1545778|3196670|4231707|5749390
1831,Input_F5|Input_NBEAL2|Input_SPINK8,2828581|4227055|4691233|4840853|5142560|579302...
1832,Input_GHDC|Input_KRTAP2-3|Input_TTN,1730047|3401377|4128798|4978416|5099625|523873...


In [7]:
combo_info_df.loc[combo_info_df.p_value_bmi_prs<0.05]

Unnamed: 0,uniq_items,combo_samples,combo_in_bottom_bmi_prs,combo_not_in_bottom_bmi_prs,other_combo_in_bottom_bmi_prs,other_combo_not_in_bottom_bmi_prs,odds_ratio_bmi_prs,ci_low_bmi_prs,ci_high_bmi_prs,p_value_bmi_prs
75,Input_STXBP2|Input_ZNF717,1033805|1323192|1332714|1348131|1350493|149763...,5.0,16.0,935.0,9237.0,3.086704,0.882346,8.843332,0.038573
99,Input_GADL1|Input_OXSM,1321129|2135478|2639332|3470286|4394271|481134...,3.0,6.0,937.0,9247.0,4.932969,0.796984,23.142457,0.043022
159,Input_CORO2A|Input_MRPL33,1143528|2613822|2979647|3263066|4051105|436977...,3.0,5.0,937.0,9248.0,5.919913,0.917882,30.481841,0.030749
210,Input_PTK2B|Input_SERPINB9,1626067|1710242|1783489|1911329|2030868|215671...,5.0,10.0,935.0,9243.0,4.941381,1.322384,15.906932,0.009033
228,Input_CARHSP1|Input_PCDHGB2,1040061|2086078|2649988|3366173|3564787|392993...,3.0,6.0,937.0,9247.0,4.932969,0.796984,23.142457,0.043022
282,Input_FOXK2|Input_SLC13A5,1399323|1898759|1947496|3558159|4372866|540985...,3.0,4.0,937.0,9249.0,7.400226,1.082398,43.813663,0.020611
285,Input_SDCBP|Input_SLC11A1,1054068|1373832|1515570|1913687|2317411|285504...,6.0,7.0,934.0,9246.0,8.48148,2.349606,29.541251,0.000588
365,Input_ACO1|Input_HELZ,1009734|1029239|1701543|1798881|2161390|217721...,6.0,22.0,934.0,9231.0,2.695058,0.891695,6.875965,0.039163
384,Input_APLP2|Input_DOCK1,1138665|1515906|3920972|4198066|4224877|505062...,3.0,6.0,937.0,9247.0,4.932969,0.796984,23.142457,0.043022
390,Input_FGD5|Input_VILL,1006065|1624648|1716132|2589209|2695754|361952...,3.0,6.0,937.0,9247.0,4.932969,0.796984,23.142457,0.043022


In [8]:
combo_info_df.to_csv("/data6/deepro/ukb_bmi/3_characterization/data/pilot/combo_bottom_prs_decile_enrich.csv", index=False)