In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import glob

In [2]:
# van der Vej SNP list: space-delimited, with the column names supplied explicitly.
# 'p1', 'p2' and 'p3' receive the three whitespace-separated pieces of the
# reported p-value (e.g. "6.6", "×", "10−09").
snps_vandervej = pd.read_csv(
    'snps_vandervej.tsv',
    sep=' ',
    names=[
        'CHR', 'SNP', 'Position', 'EA(Freq)/NEA', 'Beta', 'SE',
        'p1', 'p2', 'p3', 'Candidate gene', 'Trait',
    ],
)

# Ramirez tables: the first two are space-delimited, the third is tab-delimited.
snps_ramirez_rec = pd.read_csv('snps_ramirez_rec.tsv', sep=' ')
snps_ramirez_ex = pd.read_csv('snps_ramirez_ex.tsv', sep=' ')
snps_ramirez_hr = pd.read_csv('snps_ramirez_hr.tsv', sep='\t')

# Tab-delimited M2PAM SNP table.
snps_m2pam = pd.read_csv('snps_m2pam.tsv', sep='\t')

In [3]:
# Collect every SNP identifier that appears in the literature tables and write
# them out one per line.
# De-duplicating with pandas (instead of iterating a Python set) keeps the rows
# in first-appearance order, so the output file is identical from run to run.
# NOTE(review): snps_ramirez_hr is not part of this union - confirm that is intended.
unique_snps = pd.concat(
    [
        snps_vandervej['SNP'],
        snps_ramirez_rec['SNP'],
        snps_ramirez_ex['SNP'],
        snps_m2pam['SNP'],
    ],
    ignore_index=True,
).drop_duplicates()

# Same names as before: a set for membership tests, and a one-column frame for export.
snps_literature = set(unique_snps)
snps_literature_df = pd.DataFrame(unique_snps.tolist())
snps_literature_df.to_csv('snps_literature.tsv', index=False, header=False)

In [4]:
# Load every phenotype table, keyed by the piece of the file name that sits
# just before the final extension.
phenos = glob.glob('hrr_phenotypes/phenotype*.tsv')
pheno_dic = {
    path.split('/')[-1].split('.')[-2]: pd.read_csv(path, sep='\t')
    for path in phenos
}

covariates = pd.read_csv('covariate_tables_fixed.csv', sep='\t')

# FIDs present in the covariate table AND in every phenotype table.
pheno_set = set(covariates['FID'])
for pheno_table in pheno_dic.values():
    pheno_set = pheno_set & set(pheno_table['FID'])

In [5]:
# Samples with a complete set of phenotypes; IID is an independent copy of FID.
pheno_yes = pd.DataFrame(list(pheno_set), columns=['FID'])
pheno_yes['IID'] = pheno_yes['FID'].copy(deep=True)

# Samples that are in the covariate table but not in pheno_yes.
pheno_no = pd.DataFrame(
    list(set(covariates['FID']) - set(pheno_yes['FID'])),
    columns=['FID'],
)
pheno_no['IID'] = pheno_no['FID'].copy(deep=True)

# Write both ID lists as header-less, tab-separated FID/IID pairs.
for id_table, out_path in ((pheno_yes, 'keep_ids_hrr.tsv'),
                           (pheno_no, 'remove_ids_hrr.tsv')):
    id_table[['FID', 'IID']].to_csv(out_path, sep='\t', index=False, header=False)

In [6]:
# Inner-join the covariates with pheno_yes on FID: only samples listed in
# pheno_yes survive, and pheno_yes' IID column is brought along.
# NOTE(review): this rebinds `covariates`, so re-running the cell merges the
# already-merged table again - restart the kernel before re-executing.
covariates = pd.merge(covariates, pheno_yes, on='FID')
covariates.to_csv('covariate_table_fixed_hrr.csv', sep='\t', index=False)

In [None]:
# # Filter GWAS results
# gwas_dirs = glob.glob('gwas_results/*')
# gwases = {}
# for gwas_dir in gwas_dirs:
#     gwas = ''.join(gwas_dir.split('/')[-1].split('_')[2:])
#     output = pd.read_csv(f'{gwas_dir}/bolt_output', sep='\t', 
#                          usecols=['SNP', 'CHR', 'BP', 'BETA', 'ALLELE1', 'ALLELE0', 'P_BOLT_LMM'])
#     imputed = pd.read_csv(f'{gwas_dir}/bolt_imputed_snps', sep='\t', 
#                           usecols=['SNP', 'CHR', 'BP', 'BETA', 'ALLELE1', 'ALLELE0', 'P_BOLT_LMM'])
#     gwases[gwas] = pd.concat([output[output['P_BOLT_LMM']<0.1], imputed[imputed['P_BOLT_LMM']<0.1]])
#     gwases[gwas].to_csv(f'{gwas_dir}/filtered_snps.tsv', sep='\t', index=False)

In [7]:
# Display the snps_m2pam table (read from 'snps_m2pam.tsv').
snps_m2pam

Unnamed: 0,SNP,Weight,A1,A2,MAF
0,rs2350782,0.137661,T,C,0.114009
1,rs6962027,0.132023,T,A,0.456869
2,rs73158705,0.146181,A,G,0.162847
3,rs58394792,0.118127,G,A,0.163854
4,rs28578872,-0.195589,T,C,0.433606
5,rs7800170,-0.06135,C,A,0.492374


In [8]:
# Read the pre-filtered GWAS results: one 'filtered_snps.tsv' per directory
# under gwas_results/. The dictionary key is the directory name with its first
# two '_'-separated pieces dropped and the remaining pieces concatenated.
gwas_dirs = glob.glob('gwas_results/*')
gwases = {}
for gwas_dir in gwas_dirs:
    dir_name = gwas_dir.split('/')[-1]
    gwas = ''.join(dir_name.split('_')[2:])
    filtered = pd.read_csv(f'{gwas_dir}/filtered_snps.tsv', sep='\t')
    # Keep only the first row for each SNP identifier.
    gwases[gwas] = filtered.drop_duplicates(subset='SNP')

In [9]:
# Display the snps_vandervej table (read from 'snps_vandervej.tsv').
snps_vandervej

Unnamed: 0,CHR,SNP,Position,EA(Freq)/NEA,Beta,SE,p1,p2,p3,Candidate gene,Trait
0,1,rs11589125,31894396,T(0.06)/C,0.075,0.013,6.6,×,10−09,"SERINC2n,c",HRR50
1,1,rs272564,45012273,A(0.71)/C,0.046,0.007,1.4,×,10−12,"RNF220n,h",HRR50
2,1,rs61765646,72723211,A(0.19)/T,0.056,0.008,1.1,×,10−13,NEGR1n,HRR10
3,2,rs1899492,60000304,T(0.47)/C,0.04,0.006,1.7,×,10−11,Gene_desert,HRR40
4,2,rs17362588,179721046,G(0.92)/A,0.062,0.011,3.1,×,10−09,"CCDC141n,c,TTNh",HRR10
5,2,rs35596070,179759692,C(0.86)/A,0.06,0.008,4.2,×,10−13,"CCDC141n,c,TTNh",HRR10
6,3,rs73043051,18883863,C(0.22)/T,0.041,0.007,7.8,×,10−09,KCNH8n,HRR50
7,3,rs34310778,74783408,C(0.43)/T,0.036,0.006,1.0,×,10−09,"CNTN3n,e",HRR30
8,5,rs4836027,121866990,T(0.68)/C,0.05,0.006,1.7,×,10−15,"SNCAIPn,PRDM6n,h",HRinc
9,5,rs151283,122446619,C(0.72)/A,0.042,0.007,1.6,×,10−10,PRDM6nh,HRR50


In [None]:
# Cross-reference every literature SNP table with every GWAS result table.
# gwases_cross[gwas][table_name] is the literature table left-joined with that
# GWAS on 'SNP': one row per literature SNP in the literature table's own order,
# with GWAS columns empty where the SNP is absent. indicator=True adds a
# '_merge' column recording whether a match was found.
#
# Each key is now paired with its own table. Previously 'ramirez_ex' and
# 'ramirez_hr' were both built from snps_ramirez_rec, so rows looked up by
# position in snps_ramirez_ex returned p-values of unrelated SNPs.
# NOTE(review): assumes snps_ramirez_hr has a 'SNP' column like the other tables - confirm.
literature_tables = {
    'ramirez_rec': snps_ramirez_rec,
    'ramirez_ex': snps_ramirez_ex,
    'ramirez_hr': snps_ramirez_hr,
    'vandervej': snps_vandervej,
    'm2pam': snps_m2pam,
}

gwases_cross = {}
for gwas, gwas_table in gwases.items():
    gwases_cross[gwas] = {
        table_name: table.merge(gwas_table, how='left', on='SNP', indicator=True)
        for table_name, table in literature_tables.items()
    }

In [None]:
import matplotlib

# Global font settings for the figures. 'normal' is not a font family that
# matplotlib knows (it only triggered "findfont: Font family ['normal'] not
# found" warnings before falling back), so a valid generic family is used.
font = {'family' : 'sans-serif',
        'size'   : 16}

matplotlib.rc('font', **font)

# Grouped bar chart: one group of bars per SNP in snps_ramirez_rec, one bar per
# GWAS, bar height = -log10(P_BOLT_LMM) of that SNP in that GWAS.
f, ax = plt.subplots()
xticklabels = []
xticks = []
# GWAS keys to plot (in bar order) and their legend labels.
gwases_keys = ['hrr', 'restinghr', 'deephrr', 'deephrrtransfer']
gwases_labels = ['HRR', 'Resting HR', 'HRR from pretest', 'HRR from pretest and resting']
bar_width = 3.0
group_stride = len(gwases_keys) * bar_width + 10  # group width plus a gap of 10
for i, locus in enumerate(snps_ramirez_rec['Locus']):
    xs = np.arange(len(gwases_keys)) * bar_width + i * group_stride
    xticks.append(np.mean(xs))
    xticklabels.append(locus)
    for j, gwas in enumerate(gwases_keys):
        # Row i of the left-merged table corresponds to row i of snps_ramirez_rec.
        p_value = gwases_cross[gwas]['ramirez_rec'].iloc[i]['P_BOLT_LMM']
        # Grey level rises with j so each GWAS gets its own shade.
        ax.bar(xs[j], -np.log10(p_value), label=gwases_labels[j], width=bar_width,
               edgecolor='black', color=j*np.array([0.3, 0.3, 0.3]))
    if i == 0:
        # Build the legend after the first group only, so each label appears once.
        ax.legend(loc='upper left')
# Horizontal reference lines at p = 0.05/16 and p = 0.05/1e6.
# NOTE(review): presumably Bonferroni-style cut-offs (16 = number of SNPs tested?) - confirm.
ax.plot([0, np.max(xs)], [-np.log10(0.05/16.0), -np.log10(0.05/16.0)])
ax.plot([0, np.max(xs)], [-np.log10(0.05/1e6), -np.log10(0.05/1e6)])
ax.set_ylabel('$-log_{10}(p-value)$')
f.set_size_inches(16, 9)
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, rotation=45, ha='right')
f.savefig('ramirez_rec.png', dpi=300)

In [None]:
# Display the snps_ramirez_rec table (read from 'snps_ramirez_rec.tsv').
snps_ramirez_rec

In [None]:
# Grouped bar chart: one group of bars per SNP in snps_ramirez_ex, one bar per
# GWAS, bar height = -log10(P_BOLT_LMM) of that SNP in that GWAS.
f, ax = plt.subplots()
xticklabels = []
xticks = []
# GWAS keys to plot (in bar order) and their legend labels. The labels are
# defined here so this cell does not depend on another plotting cell having run.
gwases_keys = ['hrr', 'restinghr', 'deephrr', 'deephrrtransfer']
gwases_labels = ['HRR', 'Resting HR', 'HRR from pretest', 'HRR from pretest and resting']
bar_width = 3.0
group_stride = len(gwases_keys) * bar_width + 10  # group width plus a gap of 10
for i, locus in enumerate(snps_ramirez_ex['Locus']):
    xs = np.arange(len(gwases_keys)) * bar_width + i * group_stride
    xticks.append(np.mean(xs))
    xticklabels.append(locus)
    for j, gwas in enumerate(gwases_keys):
        # Row i of the merged table is looked up by position, so it must line
        # up with row i of snps_ramirez_ex.
        p_value = gwases_cross[gwas]['ramirez_ex'].iloc[i]['P_BOLT_LMM']
        # Grey level rises with j so each GWAS gets its own shade.
        ax.bar(xs[j], -np.log10(p_value), label=gwases_labels[j], width=bar_width,
               edgecolor='black', color=j*np.array([0.3, 0.3, 0.3]))
    if i == 0:
        # Build the legend after the first group only, so each label appears once.
        ax.legend(loc='upper left')
# Horizontal reference lines at p = 0.05/16 and p = 0.05/1e6.
# NOTE(review): presumably Bonferroni-style cut-offs (16 = number of SNPs tested?) - confirm.
ax.plot([0, np.max(xs)], [-np.log10(0.05/16.0), -np.log10(0.05/16.0)])
ax.plot([0, np.max(xs)], [-np.log10(0.05/1e6), -np.log10(0.05/1e6)])
# Same y-axis label as the ramirez_rec figure.
ax.set_ylabel('$-log_{10}(p-value)$')
f.set_size_inches(16, 9)
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, rotation=45, ha='right')
f.savefig('ramirez_ex.png', dpi=300)

In [None]:
# Display the snps_ramirez_ex table (read from 'snps_ramirez_ex.tsv').
snps_ramirez_ex

In [None]:
# Grouped bar chart: one group of bars per SNP in snps_vandervej, one bar per
# GWAS, bar height = -log10(P_BOLT_LMM) of that SNP in that GWAS.
f, ax = plt.subplots()
xticklabels = []
xticks = []
# GWAS keys to plot (in bar order) and their legend labels. The labels are
# defined here so this cell does not depend on another plotting cell having run.
gwases_keys = ['hrr', 'restinghr', 'deephrr', 'deephrrtransfer']
gwases_labels = ['HRR', 'Resting HR', 'HRR from pretest', 'HRR from pretest and resting']
bar_width = 8.0
group_stride = len(gwases_keys) * bar_width + 10  # group width plus a gap of 10
for i, candidate_gene in enumerate(snps_vandervej['Candidate gene']):
    xs = np.arange(len(gwases_keys)) * bar_width + i * group_stride
    xticks.append(np.mean(xs))
    # Tick label is the text before the first comma of the 'Candidate gene' field.
    xticklabels.append(candidate_gene.split(',')[0])
    for j, gwas in enumerate(gwases_keys):
        # Row i of the left-merged table corresponds to row i of snps_vandervej.
        p_value = gwases_cross[gwas]['vandervej'].iloc[i]['P_BOLT_LMM']
        # Grey level rises with j so each GWAS gets its own shade.
        ax.bar(xs[j], -np.log10(p_value), label=gwases_labels[j], width=bar_width,
               edgecolor='black', color=j*np.array([0.3, 0.3, 0.3]))
    if i == 0:
        # Build the legend after the first group only, so each label appears once.
        ax.legend()
# Horizontal reference lines at p = 0.05/23 and p = 0.05/1e6.
# NOTE(review): presumably Bonferroni-style cut-offs (23 = number of SNPs tested?) - confirm.
ax.plot([0, np.max(xs)], [-np.log10(0.05/23.0), -np.log10(0.05/23.0)])
ax.plot([0, np.max(xs)], [-np.log10(0.05/1e6), -np.log10(0.05/1e6)])
# Same y-axis label as the ramirez_rec figure.
ax.set_ylabel('$-log_{10}(p-value)$')
f.set_size_inches(16, 9)
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, rotation=45, ha='right')
f.savefig('verweij.png', dpi=300)

In [None]:
# Display the snps_vandervej table (read from 'snps_vandervej.tsv').
snps_vandervej

In [None]:
# List the keys of the gwases dictionary.
gwases.keys()

In [None]:
# Grouped bar chart: one group of bars per SNP in snps_m2pam, one bar per
# GWAS, bar height = -log10(P_BOLT_LMM) of that SNP in that GWAS.
f, ax = plt.subplots()
xticklabels = []
xticks = []
# GWAS keys to plot (in bar order) and their legend labels. The labels are
# defined here so this cell does not depend on another plotting cell having run.
gwases_keys = ['hrr', 'restinghr', 'deephrr', 'deephrrtransfer']
gwases_labels = ['HRR', 'Resting HR', 'HRR from pretest', 'HRR from pretest and resting']
bar_width = 3.0
group_stride = len(gwases_keys) * bar_width + 10  # group width plus a gap of 10
for i, snp in enumerate(snps_m2pam['SNP']):
    xs = np.arange(len(gwases_keys)) * bar_width + i * group_stride
    xticks.append(np.mean(xs))
    xticklabels.append(snp)
    for j, gwas in enumerate(gwases_keys):
        # Row i of the left-merged table corresponds to row i of snps_m2pam.
        p_value = gwases_cross[gwas]['m2pam'].iloc[i]['P_BOLT_LMM']
        # Grey level rises with j so each GWAS gets its own shade.
        ax.bar(xs[j], -np.log10(p_value), label=gwases_labels[j], width=bar_width,
               edgecolor='black', color=j*np.array([0.3, 0.3, 0.3]))
    if i == 0:
        # Build the legend after the first group only, so each label appears once.
        ax.legend()
# Horizontal reference lines at p = 0.05/6 and p = 0.05/1e6.
# NOTE(review): presumably Bonferroni-style cut-offs (6 = number of SNPs tested?) - confirm.
ax.plot([0, np.max(xs)], [-np.log10(0.05/6.0), -np.log10(0.05/6.0)])
ax.plot([0, np.max(xs)], [-np.log10(0.05/1e6), -np.log10(0.05/1e6)])
# Same y-axis label as the ramirez_rec figure.
ax.set_ylabel('$-log_{10}(p-value)$')
f.set_size_inches(16, 9)
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, rotation=45, ha='right')
# Saved at 300 dpi, like the other three bar charts.
f.savefig('m2pam.png', dpi=300)

In [None]:
# Display the GWAS result table stored under the 'hrr' key.
gwases['hrr']

In [None]:
def plot_gwas(df_tmp, fig, ax):
    """Draw a Manhattan-style scatter of -log10(P_BOLT_LMM) on ``ax``.

    Rows are ordered by ('CHR', 'BP') and placed at consecutive integer x
    positions; chromosomes alternate between black and silver, and each
    chromosome is labelled at the midpoint of its span.

    Parameters
    ----------
    df_tmp : pandas.DataFrame
        Must provide 'CHR', 'BP' and 'P_BOLT_LMM' columns. It is not modified;
        the function works on a sorted copy.
    fig : object
        Not used; accepted so existing callers keep working.
    ax : matplotlib Axes (or any object with the same methods)
        Target axes. Its ticks, limits and axis labels are overwritten.

    Returns
    -------
    None
    """
    # sort_values returns a new frame, so the helper columns added below never
    # leak into the caller's DataFrame.
    df = df_tmp.sort_values(by=['CHR', 'BP'])
    df['LOGP'] = -np.log10(df['P_BOLT_LMM'])
    df['ind'] = np.arange(len(df))

    colors = ['black', 'silver']
    x_labels = []
    x_labels_pos = []

    for num, (name, group) in enumerate(df.groupby('CHR')):
        ax.scatter(group['ind'], group['LOGP'],
                   color=colors[num % len(colors)], s=5)
        x_labels.append(name)
        # Tick at the centre of this chromosome's run of x positions.
        x_labels_pos.append((group['ind'].iloc[0] + group['ind'].iloc[-1]) / 2)

    # Horizontal reference line at 7.3 (approximately -log10(5e-8)), drawn up
    # to the last plotted x position. Computed from len(df) rather than from
    # the loop variable, so an empty table no longer raises NameError.
    last_x = max(len(df) - 1, 0)
    ax.plot([0.0, last_x], [7.3, 7.3])

    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels)
    ax.set_xlim([0, len(df)])
    # Fixed y window: points with -log10(p) below 1 or above 20 are not shown.
    ax.set_ylim([1, 20])
    ax.set_xlabel('Chromosome')
    # Raw string: '\l' is not a valid escape sequence in a normal string literal.
    ax.set_ylabel(r'$-\log_{10}(p-value)$')

In [None]:
# One Manhattan plot per GWAS, saved as manhattan_<key>.png at 300 dpi.
for gwas, gwas_table in gwases.items():
    f, ax = plt.subplots()
    f.set_size_inches(22, 6)
    plot_gwas(gwas_table, f, ax)
    out_name = f'manhattan_{gwas}.png'
    f.savefig(out_name, dpi=300)

In [None]:
from assocplots.qqplot import *

In [None]:
# Load the raw BOLT output of every GWAS (no p-value filtering is applied here).
# Only the 'bolt_output' file is read; 'bolt_imputed_snps' is not loaded.
gwas_dirs = glob.glob('gwas_results/*')
bolt_columns = ['SNP', 'CHR', 'BP', 'BETA', 'ALLELE1', 'ALLELE0', 'P_BOLT_LMM']
gwases_outputs = {}
for gwas_dir in gwas_dirs:
    # Key: directory name minus its first two '_'-separated pieces, concatenated.
    dir_name = gwas_dir.split('/')[-1]
    gwas = ''.join(dir_name.split('_')[2:])
    output = pd.read_csv(f'{gwas_dir}/bolt_output', sep='\t', usecols=bolt_columns)
    gwases_outputs[gwas] = output

In [None]:
# One QQ plot per GWAS, saved as qq_<key>.png at 300 dpi.
# NOTE(review): qqplot is not handed an Axes; presumably it draws on the
# current pyplot axes, i.e. the one just created by plt.subplots() - confirm.
for gwas, bolt_output in gwases_outputs.items():
    f, ax = plt.subplots()
    f.set_size_inches(6, 6)
    p_values = bolt_output['P_BOLT_LMM']
    qqplot(
        [p_values],
        [gwas],
        color=['k'],
        fill_dens=[0.2],
        error_type='theoretical',
        distribution='beta',
        title='',
    )
    f.savefig(f'qq_{gwas}.png', dpi=300)

In [None]:
# Stand-alone QQ plot for the HRR GWAS, with a title, saved as HRR_qq.png.
# The p-values come from gwases_outputs['hrr']; the previous source,
# gwas_dic['HRR'], is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
# NOTE(review): assumes 'hrr' is a key of gwases_outputs, as it is for gwases - confirm.
f, ax = plt.subplots()
f.set_size_inches(5, 5)
# Make the new axes current so qqplot draws onto it.
plt.sca(ax)
qqplot([gwases_outputs['hrr']['P_BOLT_LMM']],
       ['HRR (UKBB)'],
       color=['k'],
       fill_dens=[0.2],
       error_type='theoretical',
       distribution='beta',
       title='HRR (UKBB)')
f.savefig('HRR_qq.png')

# plt.sca(ax[1])
# qqplot([df_deep_hrr['P_BOLT_LMM']], 
#        ['HRR from pretest'], 
#        color=['k'], 
#        fill_dens=[0.2], 
#        error_type='theoretical', 
#        distribution='beta',
#        title='HRR from pretest')

# plt.sca(ax[2])
# qqplot([df_deep_transfer['P_BOLT_LMM']], 
#        ['HRR from pretest and resting ECGs'], 
#        color=['k'], 
#        fill_dens=[0.2], 
#        error_type='theoretical', 
#        distribution='beta',
#        title='HRR from pretest and resting ECGs')