In [None]:
##import libraries, establish connection
import csv
import warnings

import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
from scipy import stats

# Compatibility shim: restore scipy.stats.chisqprob as a thin wrapper around
# chi2.sf. Kept ahead of the statsmodels imports, as in the original cell.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import PerfectSeparationError

from sklearn import metrics
from sklearn.linear_model import LogisticRegression



In [None]:
# Read the tab-separated PRS file for the selected PGS identifier.
# NOTE(review): the path template has no `{}` field, so `.format(pgs_file)`
# returns it unchanged -- add a placeholder when filling in the real path.
pgs_file = 'PGS002037'
prs = pd.read_csv(
    "./path/to/pgs/file".format(pgs_file),
    delimiter='\t',
)
prs.head()

In [None]:
# Read the comma-separated phenotype table and preview the first rows.
pheno_df = pd.read_csv(
    './path/to/phenotype/file',
    delimiter=',',
)
pheno_df.head()

In [None]:
# Ancestry-specific PRS association testing.
# Change the ancestry label below and re-run the following cells for each GIA.
# NOTE(review): the path template has no `{}` field, so `.format(ancestry)`
# returns it unchanged -- add a placeholder when filling in the real path.
ancestry = 'EUR'
pc_df = pd.read_csv(
    './path/to/precomputed/pcs'.format(ancestry),
    delimiter='\t',
)
pc_df.head()

In [None]:
# Join the phenotype table with the ancestry-specific PCs (inner join, the
# pandas default) for the ancestry-specific PRS association tests.
# NOTE(review): left_on / right_on are blank placeholders -- fill in the
# sample-ID column of each table before running.
# NOTE(review): `prs` loaded above is not joined here, yet later cells read a
# 'PRS' column from this frame -- confirm pheno_df already carries it.
prs_pheno_eur = pd.merge(pheno_df, pc_df, left_on='', right_on='')
prs_pheno_eur.head()

In [None]:
# Standardise the PRS within this ancestry (subtract the mean, divide by the
# standard deviation) so model coefficients are expressed per SD of PRS.
prs_pheno_eur['PRS_Z_eur'] = (
    prs_pheno_eur['PRS']
    .pipe(lambda score: (score - score.mean()) / score.std())
)
prs_pheno_eur.head()

In [None]:
# Logistic regression of one phecode on the within-ancestry standardised PRS,
# adjusted for the covariates listed in the formula.
phecode = '318.0'
# Reset the result first so the final display never shows a stale table from
# an earlier run (or raises NameError) when the fit is aborted.
LRresult = None
try:
    # Copy the phecode column to a formula-safe name ('318.0' is not a valid
    # identifier inside a patsy formula).
    prs_pheno_eur['test'] = prs_pheno_eur[phecode]
    formula = 'test ~ PRS_Z_eur + Age + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + insurance'
    m1 = smf.logit(formula=formula, data=prs_pheno_eur).fit(
        start_params=None, maxiter=35, method='newton'
    )
    # Second summary2 table = per-term coefficient table; tag it with the
    # phecode and ancestry so results from repeated runs can be stacked.
    LRresult = m1.summary2().tables[1]
    LRresult['phecode'] = phecode
    LRresult['GIA'] = '{}'.format(ancestry)
except PerfectSeparationError:
    # The exception class is imported by name in the import cell; the original
    # `sm_tools.sm_exceptions...` reference was never bound and turned every
    # error raised in this block into a NameError.
    print('--PerfectSeparationError--')
LRresult

In [None]:
# Split the standardised PRS into five equal-frequency bins, labelled
# '1' (lowest) to '5' (highest), for the risk-quantile models below.
prs_pheno_eur['prs_cut'] = pd.qcut(
    prs_pheno_eur['PRS_Z_eur'],
    q=5,
    labels=[str(rank) for rank in range(1, 6)],
)
prs_pheno_eur.head()

In [None]:
# Logistic regression of one phecode on PRS quintile (prs_cut) within this
# ancestry, adjusted for the covariates listed in the formula.
# Uses prs_pheno_eur -- the frame that received `prs_cut` above; the
# previously referenced `prs_phenotype_eur` is not defined in this notebook.
# NOTE(review): this formula uses PatientAge / ins_class while the per-SD
# model on the same frame uses Age / insurance -- confirm the column names.
phecode = '318.0'
# Reset the result first so the final display never shows a stale table from
# an earlier run (or raises NameError) when the fit is aborted.
LRresult = None
try:
    # Copy the phecode column to a formula-safe name.
    prs_pheno_eur['test'] = prs_pheno_eur[phecode]
    formula = 'test ~ prs_cut + PatientAge + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + ins_class'
    m1 = smf.logit(formula=formula, data=prs_pheno_eur).fit(
        start_params=None, maxiter=35, method='newton'
    )
    # Second summary2 table = per-term coefficient table, tagged with the
    # phecode and ancestry.
    LRresult = m1.summary2().tables[1]
    LRresult['phecode'] = phecode
    LRresult['GIA'] = '{}'.format(ancestry)
except PerfectSeparationError:
    # Imported by name in the import cell; the original `sm_tools` reference
    # was never bound and raised NameError instead of being caught.
    print('--PerfectSeparationError--')
LRresult

In [None]:
#### Obesity, alcohol-related disorders and lung cancer across risk quantiles (pan-ancestry)

In [None]:
# Stack the per-ancestry frames row-wise for the pan-ancestry analysis.
# Names follow the `prs_pheno_<ancestry>` pattern used by the cells above.
# NOTE(review): only prs_pheno_eur is built in this notebook as written; the
# AMR / EAS / AFR frames must be created by re-running the per-ancestry cells
# with the ancestry label (and frame name) changed.
new_df = pd.concat(
    [prs_pheno_eur, prs_pheno_amr, prs_pheno_eas, prs_pheno_afr],
    axis=0,
)
# Preview only; avoid rendering the whole combined table.
new_df.head()

In [None]:
# Pan-ancestry logistic regression of one phecode on PRS quintile (prs_cut).
# Repeat for the obesity and lung cancer phecodes by changing `phecode`.
# NOTE(review): this formula uses `insurance` while the smoker / never-smoker
# models on subsets of the same frame use `ins_class` -- confirm the column.
phecode = '317.0'
# Reset the result first so the final display never shows a stale table from
# an earlier run (or raises NameError) when the fit is aborted.
LRresult = None
try:
    # Copy the phecode column to a formula-safe name.
    new_df['test'] = new_df[phecode]
    formula = 'test ~ prs_cut + PatientAge + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + insurance'
    m1 = smf.logit(formula=formula, data=new_df).fit(
        start_params=None, maxiter=35, method='newton'
    )
    # Second summary2 table = per-term coefficient table, tagged with the phecode.
    LRresult = m1.summary2().tables[1]
    LRresult['phecode'] = phecode
except PerfectSeparationError:
    # Imported by name in the import cell; the original `sm_tools` reference
    # was never bound and raised NameError instead of being caught.
    print('--PerfectSeparationError--')
LRresult

In [None]:
# Rows with smoking_behav > 0 (the smoker stratum used below).
# .copy() makes this an independent frame, so the column added in the next
# cell does not write into a slice of new_df (SettingWithCopyWarning).
smoker_df = new_df.loc[new_df['smoking_behav'] > 0].copy()
# Preview only; avoid rendering the whole subset.
smoker_df.head()

In [None]:
# Logistic regression of one phecode on PRS quintile among smoker_df rows.
phecode = '317.0'
# Reset the result first so the final display never shows a stale table from
# an earlier run (or raises NameError) when the fit is aborted.
LRresult = None
try:
    # .assign returns a new frame with the formula-safe outcome column, so
    # nothing is written into a slice of new_df.
    smoker_df = smoker_df.assign(test=smoker_df[phecode])
    formula = 'test ~ prs_cut + PatientAge + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + ins_class'
    m1 = smf.logit(formula=formula, data=smoker_df).fit(
        start_params=None, maxiter=35, method='newton'
    )
    # Second summary2 table = per-term coefficient table, tagged with the phecode.
    LRresult = m1.summary2().tables[1]
    LRresult['phecode'] = phecode
except PerfectSeparationError:
    # Imported by name in the import cell; the original `sm_tools` reference
    # was never bound and raised NameError instead of being caught.
    print('--PerfectSeparationError--')
LRresult

In [None]:
# Rows with smoking_behav == 0 (the never-smoker stratum used below).
# .copy() makes this an independent frame, so the column added in the next
# cell does not write into a slice of new_df (SettingWithCopyWarning).
never_smoker_df = new_df.loc[new_df['smoking_behav'] == 0].copy()
# Preview only; avoid rendering the whole subset.
never_smoker_df.head()

In [None]:
# Logistic regression of one phecode on PRS quintile among never_smoker_df rows.
phecode = '317.0'
# Reset the result first so the final display never shows a stale table from
# an earlier run (or raises NameError) when the fit is aborted.
LRresult = None
try:
    # .assign returns a new frame with the formula-safe outcome column, so
    # nothing is written into a slice of new_df.
    never_smoker_df = never_smoker_df.assign(test=never_smoker_df[phecode])
    formula = 'test ~ prs_cut + PatientAge + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + ins_class'
    m1 = smf.logit(formula=formula, data=never_smoker_df).fit(
        start_params=None, maxiter=35, method='newton'
    )
    # Second summary2 table = per-term coefficient table, tagged with the phecode.
    LRresult = m1.summary2().tables[1]
    LRresult['phecode'] = phecode
except PerfectSeparationError:
    # Imported by name in the import cell; the original `sm_tools` reference
    # was never bound and raised NameError instead of being caught.
    print('--PerfectSeparationError--')
LRresult