In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

In [7]:
hrr = pd.read_csv('hrr_phenotypes/ensemble_hrr.csv')
deep_hrr = pd.read_csv('hrr_phenotypes/deep_hrr.tsv', sep='\t')
transfer_hrr = pd.read_csv('hrr_phenotypes/transfer_hrr.tsv', sep='\t')
transfer_hrr = transfer_hrr.merge(deep_hrr, how='outer', on='sample_id', suffixes=('', '_1'))
transfer_hrr['ecg-bike-hrr-ramp_prediction'].loc[transfer_hrr['ecg-bike-hrr-ramp_prediction'].isna()] = \
        transfer_hrr['ecg-bike-hrr-ramp_prediction_1'].loc[transfer_hrr['ecg-bike-hrr-ramp_prediction'].isna()]
hrr10 = pd.read_csv('hrr_phenotypes/hrr10s_phenotype.tsv', sep='\t')
hrr50 = pd.read_csv('hrr_phenotypes/hrr50_phenotype.tsv', sep='\t')
hrr50['sample_id'] = hrr50['FID']
resting_hr = pd.read_csv('hrr_phenotypes/resting_hr_phenotype.tsv', sep='\t').dropna(subset=['resting_hr'])
resting_hr['sample_id'] = resting_hr['FID']
filtered_hrr10 = pd.read_csv('hrr_phenotypes/filtered_hrr10_phenotype.tsv', sep='\t')
phenotypes = {'hrr': [hrr, 'hrr',  'HRR (bpm)'],
              'deep_hrr': [deep_hrr, 'ecg-bike-hrr-ramp_prediction', 'HRR from pretest (bpm)'],
              'transfer_hrr': [transfer_hrr, 'ecg-bike-hrr-ramp_prediction', 'HRR from pretest and resting ECGs (bpm)'],
              'hrr10': [hrr10, 'hrr', 'HRR at 10s (bpm)'],
              'filtered_hrr10': [filtered_hrr10, 'hrr', '(filtered) HRR at 10s (bpm)'],
              'resting_hr': [resting_hr, 'resting_hr', 'Resting HR'],
              'hrr50': [hrr50, 'hrr50', 'Heart rate recovery at 50 s (bpm)']
             }


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
def rank_to_normal(rank, c, n):
    # Standard quantile function
    x = (rank - c) / (n - 2*c + 1)
    return ss.norm.ppf(x)

def rank_based_int(series):
    c=3.0/8
    orig_idx = series.index
    series = series.loc[np.random.permutation(series.index)]
    rank = ss.rankdata(series, method="ordinal")
    rank = pd.Series(rank, index=series.index)
    transformed = rank.apply(rank_to_normal, c=c, n=len(rank))
    return transformed

for phenotype in phenotypes:
    df = phenotypes[phenotype][0]
    col = phenotypes[phenotype][1]
    phenotypes[phenotype].append(rank_based_int(df[col]))
    df['rbint'] = phenotypes[phenotype][-1][df.index]
    df['FID'] = df['sample_id']
    df['IID'] = df['sample_id']
    df[['FID', 'IID', col, 'rbint']].to_csv(f'hrr_phenotypes/phenotype_{phenotype}.tsv', sep='\t', index=False)

In [None]:
f, ax = plt.subplots(len(phenotypes), 3)
f.set_size_inches(16, 16)
for i, phenotype in enumerate(['hrr10', 'filtered_hrr10']):
    df, col, label, rbint = phenotypes[phenotype]
    sns.distplot(df[col], ax=ax[i, 0])
    ax[i, 0].set_xlim([-5, 80])
    ax[i, 0].set_xlabel(label)
    ax[i, 1].plot(df[col], df['rbint'], 'o')
    ax[i, 0].set_xlim([-5, 80])
    ax[i, 1].set_xlabel(label)
    ax[i, 1].set_ylabel('RBINT')
    sns.distplot(df['rbint'], ax=ax[i, 2])
    ax[i, 2].set_xlabel('RBINT')
plt.tight_layout()

In [None]:
f, ax = plt.subplots(1, 2)
f.set_size_inches(6, 3)
for i, phenotype in enumerate(['hrr', 'deep_hrr']):
    df, col, label, rbint = phenotypes[phenotype]
    sns.distplot(df[col], ax=ax[i])
    ax[i].set_xlim([-5, 80])
    ax[i].set_xlabel(label)   
plt.tight_layout()
f.savefig('phenos_easy.png')

# Covariate tables

In [10]:
df_cov = pd.read_csv('bq_age_bmi_sex_center_array.tsv', sep='\t')
df_cov['age_sex'] = df_cov['age']
df_cov.loc[df_cov['sex']==0, 'age_sex'] = -1.0*df_cov.loc[df_cov['sex']==0, 'age_sex']
df_cov['FID'] = df_cov['sample_id']
df_cov['IID'] = df_cov['sample_id']
df_cov[['FID', 'IID', 'age', 'bmi', 'sex', 'center', 'geno_array', 'age_sex', ]].to_csv('covariate_tables_fixed.csv', sep='\t')
df_cov[['FID', 'IID', 'age', 'bmi', 'sex', 'center', 'geno_array', 'age_sex', ]]

Unnamed: 0,FID,IID,age,bmi,sex,center,geno_array,age_sex
0,4439180,4439180,40,30.5959,1,11018,1,40.0
1,5316572,5316572,64,36.0111,1,11001,1,64.0
2,3521022,3521022,67,28.0219,1,11009,1,67.0
3,4067729,4067729,70,29.7796,1,11010,1,70.0
4,3884750,3884750,40,25.6519,0,11010,1,-40.0
...,...,...,...,...,...,...,...,...
486293,1615327,1615327,69,30.8907,1,11011,1,69.0
486294,4787986,4787986,69,27.6638,1,11016,1,69.0
486295,4201068,4201068,69,26.8977,0,11017,1,-69.0
486296,3489641,3489641,69,22.9138,1,11006,1,69.0


In [18]:
#Cross with phenotype
df_cov2 = df_cov.merge(hrr[['sample_id', 'hrr']], left_on=['FID'], right_on=['sample_id'])[['FID', 'IID', 'age', 'bmi', 'sex', 'center', 'geno_array', 'age_sex', 'hrr']]

In [23]:
geno_set = set(df_cov['FID']) - set(df_cov2['FID'])
remove_list = list(geno_set)
remove_df = pd.DataFrame()
remove_df['FID'] = remove_list
remove_df['IID'] = remove_list
remove_df.to_csv('remove_example.tsv', sep='\t', header=None, index=False)

In [22]:
len(remove_list)

432055

In [None]:
# Exclude individuals without phenotype
geno_set = set(df_cov['FID'])
for phenotype in phenotypes:
    df, col, label, rbint = phenotypes[phenotype]
    pheno_set = set(df['FID'])
    remove_list = list(geno_set-pheno_set)
    remove_df = pd.DataFrame()
    remove_df['FID'] = remove_list
    remove_df['IID'] = remove_list
    remove_df.set_index('FID').to_csv(f'hrr_phenotypes/remove_nopheno_{phenotype}.tsv', header=None, sep='\t')

In [14]:
hrr

Unnamed: 0,sample_id,ecg-bike-hrr-raw-file_prediction_x,ecg-bike-hrr-raw-file_actual_x,ecg-bike-hrr-raw-file_prediction_y,ecg-bike-hrr-raw-file_actual_y,hrr,hrr_std
0,1000113,6.900356,0.441175,5.008314,0.441175,4.116615,0.946021
1,1000169,31.476118,33.210198,31.317660,33.210198,32.001325,0.079229
2,1000321,21.480762,19.817792,24.394562,19.817792,21.897705,1.456900
3,1000392,49.384766,55.753271,52.080590,55.753271,52.406209,1.347912
4,1000426,15.281571,14.389004,14.480129,14.389004,14.716901,0.400721
...,...,...,...,...,...,...,...
56121,6026414,21.777676,22.831254,21.839216,22.831254,22.149382,0.030770
56122,6026433,58.223866,65.979054,62.447670,65.979054,62.216863,2.111902
56123,6026543,51.116630,56.124515,50.167892,56.124515,52.469679,0.474369
56124,6026581,28.061596,26.364031,27.674189,26.364031,27.366605,0.193703
