In [None]:
pip install statsmodels

In [None]:
import pandas as pd
import numpy as np

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

matplotlib.rcParams['pdf.fonttype'] = 42

import os
import dxpy

In [None]:
# --- Input files ---
# BMI and lifestyle-factor table from https://github.com/deeprob/BMI_monogenic
# (relevant notebooks are in phenotypic_data_processing/).
# NOTE(review): the ".tsx.gz" extension looks like a typo for ".tsv.gz" - confirm against the real file name.
BMI_DATA = "/path/to/BMI/and/lifestyle/factors/data.tsx.gz"
# PheWAS covariates and genotypes: both are outputs of
# 3_Data preparation/UKB/7_gather_phewas.ipynb
PHEWAS_COV = "/path/to/PheWAS/covariate/data.csv"
PHEWAS_GENO = "/path/to/PheWAS/genotype/data.csv"

# --- Output location ---
# Receives (1) the PDF forest plot of the regression estimates and CIs shown in Figure 6B
# and (2) the file with the regression statistics shown in Table S5C.
OUTPUT_DIR = "/path/to/output/directory"

In [None]:
# Load the gzip-compressed, tab-separated BMI and covariate table (European samples)
bmi = pd.read_csv(
    BMI_DATA,
    sep='\t',
    compression='gzip',
)

In [None]:
# Restrict to the samples used in the PheWAS analysis:
# read the covariate and genotype tables and join them on the shared 'id' column
cov = pd.read_csv(PHEWAS_COV)
geno = pd.read_csv(PHEWAS_GENO)
df = cov.merge(geno, on='id')

In [None]:
# Give both tables a common 'Sample' key ('id' in the PheWAS table, 'IID' in the
# BMI table), then keep only the samples present in both.
df['Sample'] = df['id']
bmi['Sample'] = bmi['IID']
df = df.merge(bmi, how='inner', on='Sample')

In [None]:
def run_model(moddf, input_vars, output_col, mod_type='linear',
              geno_var='del_16p12', interaction_vars=None):
    """Fit a regression of `output_col` with genotype-by-factor interaction terms.

    The fitted formula is
    ``output_col ~ <input_vars> + geno_var * f1 + geno_var * f2 + ...``
    where ``f1, f2, ...`` are the entries of `interaction_vars`.

    Parameters
    ----------
    moddf : pandas.DataFrame
        Data holding every column named in the formula.
    input_vars : list of str
        Main-effect terms, joined with ``+`` in the formula.
    output_col : str
        Name of the dependent variable column.
    mod_type : {'linear', 'logistic'}, default 'linear'
        'linear' fits `smf.ols`, 'logistic' fits `smf.logit`.
    geno_var : str, default 'del_16p12'
        Term that is interacted with each entry of `interaction_vars`.
    interaction_vars : list of str, optional
        Terms interacted with `geno_var`. Defaults to
        ``['pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet', 'meds']``.

    Returns
    -------
    pandas.DataFrame
        One row per fitted coefficient (indexed by coefficient name) with the
        columns 'Phenotype', 'Test', 'N', 'Effect size', 'Error', 'p value',
        'R2', '95% C.I. lower', '95% C.I. upper' and 'Variable'.
        'R2' is `rsquared` for linear models and the pseudo R-squared
        (`prsquared`) for logistic models. 'N' is the number of observations
        reported by the fitted model.

    Raises
    ------
    ValueError
        If `mod_type` is neither 'linear' nor 'logistic'.
    """
    test_names = {'linear': 'Linear regression', 'logistic': 'Logistic regression'}
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if mod_type not in test_names:
        raise ValueError(
            f"Unsupported mod_type {mod_type!r}; expected one of {sorted(test_names)}")

    if interaction_vars is None:
        interaction_vars = ['pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet', 'meds']

    # Build the formula from a single list of terms so an empty interaction list
    # does not leave a dangling '+'.
    terms = list(input_vars) + [f'{geno_var} * {factor}' for factor in interaction_vars]
    form = f'{output_col} ~ ' + ' + '.join(terms)

    if mod_type == 'linear':
        mod = smf.ols(formula=form, data=moddf)
    else:
        mod = smf.logit(formula=form, data=moddf)
    res = mod.fit()

    r2 = res.rsquared if mod_type == 'linear' else res.prsquared

    ci = res.conf_int(alpha=0.05)
    ci.columns = ['95% C.I. lower', '95% C.I. upper']

    # Size the table from the fitted coefficients themselves rather than from the
    # number of '+' signs in the formula, which breaks as soon as a term expands
    # to more (or fewer) coefficients than it has formula terms.
    mod_res = pd.DataFrame(
        {'Phenotype': output_col,
         'Test': test_names[mod_type],
         'N': int(res.nobs),  # rows actually used in the fit
         'Effect size': res.params,
         'Error': res.bse,
         'p value': res.pvalues,
         'R2': r2},
        index=res.params.index)
    mod_res = mod_res.join(ci)

    mod_res['Variable'] = mod_res.index.to_list()

    return mod_res

In [None]:
# Model terms: demographic covariates, the first ten genetic PCs, the BMI PRS
# and the lifestyle factors
cov_cols = (
    ['sex', 'age', 'age_2']
    + [f'genetic_pca{pc}' for pc in range(1, 11)]
    + ['bmi_prs', 'pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet', 'meds']
)
pheno_col = ['bmi']
geno_col = ['del_16p12']

# Copy the genotype column under a name that does not start with a digit,
# since the name is used as a term in the model formula
df[geno_col[0]] = df['16p12_del']

# Keep only the model columns and drop every row with a missing value
moddf = df[cov_cols + geno_col + pheno_col].dropna()

In [None]:
# Fit the model, then draw a forest plot restricted to the variables of interest
output = run_model(moddf, cov_cols + geno_col, pheno_col[0])
# 'order' holds the coefficient names from the index and is used as the y position
output['order'] = output.index.to_list()

vars_of_interest = ['bmi_prs', 'pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet', 'meds', 'del_16p12']
plotdf2 = output[output['Variable'].isin(vars_of_interest)].reset_index(drop=True)

# Point estimates
ax = sns.scatterplot(data=plotdf2, x='Effect size', y='order', color='k')
# 95% confidence intervals as horizontal segments
for ci_low, ci_high, y_pos in zip(plotdf2['95% C.I. lower'],
                                  plotdf2['95% C.I. upper'],
                                  plotdf2['order']):
    ax.plot([ci_low, ci_high], [y_pos, y_pos], color='k')
# Reference line at zero effect, drawn behind the data
ax.axvline(0, color='grey', ls='--', zorder=0)
ax.set_yticks(plotdf2['order'].to_list())
ax.set_yticklabels(plotdf2['Variable'].to_list())
ax.set_ylabel('Variable')
ax.figure.tight_layout()
ax.figure.savefig('UKB_BMI_regression_subset_vars.pdf')

In [None]:
# Upload the forest plot to the output folder, then delete the local copy
plot_pdf = 'UKB_BMI_regression_subset_vars.pdf'
dxpy.upload_local_file(plot_pdf, folder=OUTPUT_DIR, parents=True)
os.remove(plot_pdf)

In [None]:
# Write the regression statistics to CSV, upload the file to the output folder,
# then delete the local copy
stats_csv = 'UKB_BMI_regression.csv'
output.to_csv(stats_csv, index=False)
dxpy.upload_local_file(stats_csv, folder=OUTPUT_DIR, parents=True)
os.remove(stats_csv)