In [None]:
import pandas as pd
import os
import dxpy

In [None]:
# Input and output file locations (placeholders -- edit before running)
PHENO = "/path/to/input/phenotype/data.csv"  # Use the output of script 2_gather_sample_data.ipynb
CNV_INPUT = "/path/to/input/cnvs.bed"  # Use the output from script Variant calling/UKB/2_CNV_annotation/6_frequency_filter.py
CNV_QC_INPUT = "/path/to/input/cnv/qc/data.csv"  # Use the QC file output from Variant calling/UKB/2_CNV_annotation/2_check_QC.py
WITHDRAWN_CONSENT = "/path/to/file/with/samples/that/withdrew/consent.csv"  # After the initial analysis, we were informed that some samples withdrew consent
DATA_CODING_19_PATH = "/path/to/coding19.tsv"  # UKB ICD10 phenotypes are encoded using Data-coding 19
OUTPUT_PATH = "/path/to/output/files"  # Destination folder passed to dxpy.upload_local_file

In [None]:
# Create files for performing PheWAS
# Load the phenotype table; its header is expected to hold the field IDs
# listed as values in field_name_dict below.
pheno = pd.read_csv(PHENO)

# Descriptive column name -> field ID used in the input header
field_name_dict = {
    'Sample': 'eid',
    'yob': 'p34',
    'sex': 'p22001',
    'white_british_genetic': 'p22006',
    'ethnic_background1': 'p21000_i0',
    'ethnic_background2': 'p21000_i1',
    'ethnic_background3': 'p21000_i2',
    'ethnic_background4': 'p21000_i3',
    'recommend_for_exclusion': 'p22010',
    'icd10': 'p41270',
    'depressed_mood': 'p20446',
    'lost_interest': 'p20441',
    'sleep_trouble1': 'p1200_i0',
    'sleep_trouble2': 'p1200_i1',
    'sleep_trouble3': 'p1200_i2',
    'sleep_trouble4': 'p1200_i3',
    'mood_lability1': 'p1920_i0',
    'mood_lability2': 'p1920_i1',
    'mood_lability3': 'p1920_i2',
    'mood_lability4': 'p1920_i3',
    'anxious_feeling': 'p20421',
    'worried_more': 'p20425',
    'drugs_for_anxiety': 'p20549',
    'anxiety_interferes_life': 'p20418',
    'drinking_interferes_life': 'p20407',
    'unable_stop_drinking': 'p20413',
    'drug_addiction': 'p20456',
    'unreal_sounds': 'p20463',
    'unreal_visions': 'p20471',
    'conspiracy': 'p20468',
}
# The ten principal components follow a regular naming pattern
field_name_dict.update({'PC' + str(pc): 'p22009_a' + str(pc) for pc in range(1, 11)})

# Invert the mapping (field ID -> descriptive name) and relabel the header.
# A column that is missing from the mapping raises KeyError on purpose, so an
# unexpected input layout is noticed immediately.
inv_map = dict(zip(field_name_dict.values(), field_name_dict.keys()))
cols = [inv_map[field_id] for field_id in pheno.columns.to_list()]
pheno.columns = cols

In [None]:
# Identify 16p12.1 deletion samples and any other samples that passed microarray QC
cnvs=pd.read_csv(CNV_INPUT, sep='\t')
cnvs['Sample']=cnvs.Sample.astype(int)

# Samples carrying a deletion annotated as the 16p12.1 pathogenic CNV
samp_16p=cnvs[(cnvs.Pathogenic_Name=='16p12.1') & (cnvs.Type=='DEL')].Sample.to_list()

# NOTE(review): the placeholder path for CNV_QC_INPUT ends in .csv but the file
# is parsed as tab-separated here -- confirm the delimiter of the QC output.
cnv_qc=pd.read_csv(CNV_QC_INPUT, sep='\t', low_memory=False)
# Keep rows passing both QC flags. .copy() gives an independent frame so the
# column assignments below do not act on a slice of the full QC table
# (which triggers SettingWithCopyWarning).
cnv_qc=cnv_qc[(cnv_qc.Pass) & (cnv_qc.X_Pass)].copy()
# Sample ID = text before the first underscore in the last path component of File.
# Element access via .str[-1] is required here: split(..., expand=True) returns
# a DataFrame whose columns are labelled 0..n-1, so indexing it with [-1]
# raises KeyError instead of selecting the last component.
cnv_qc['Sample']=cnv_qc.File.str.split('/').str[-1].str.split('_').str[0].astype(int)

# 1 for 16p12.1 deletion carriers, 0 for every other QC-passing sample
cnv_qc['16p12_del']=cnv_qc.Sample.isin(samp_16p).astype(int)

In [None]:
# Keep only samples flagged as white British (white_british_genetic == 1)
is_white_british = pheno.white_british_genetic == 1
pheno = pheno[is_white_british]

# Keep only samples that could be assessed for CNVs (present in the QC-passing
# table) and retain just the columns needed downstream
keep_cols = ['Sample', 'yob', 'sex', 'icd10', 'PC1', 'PC2', 'PC3', 'PC4']
has_cnv_calls = pheno.Sample.isin(cnv_qc.Sample.to_list())
pheno = pheno.loc[has_cnv_calls, keep_cols]

# Drop samples without any ICD10 record
pheno = pheno[pheno.icd10.notnull()]

# Drop samples that withdrew consent (headerless file, one sample ID per row)
withdrawl = pd.read_csv(WITHDRAWN_CONSENT, header=None, names=['Sample'])
withdrew = pheno.Sample.isin(withdrawl.Sample.to_list())
pheno = pheno[~withdrew]

In [5]:
# Explode by ICD10: one row per (sample, code)
# Strip the literal square brackets surrounding the list-like string ...
bracketless = pheno.icd10.str.replace('[', '', regex=False).str.replace(']', '', regex=False)
pheno['icd10'] = bracketless
# ... split it on ', ' into a Python list per sample ...
pheno['coding'] = bracketless.str.split(', ')
# ... and give every list element its own row
icd10 = pheno.explode('coding')

In [None]:
# Map coding to ICD10 codes
# Load the tab-separated coding table and attach its columns to every exploded
# row; the left join keeps rows whose coding has no match in the table.
coding = pd.read_csv(DATA_CODING_19_PATH, sep='\t')
icd10 = icd10.merge(coding, how='left', on='coding')

In [7]:
# The code is the first space-delimited token of the `meaning` text
meaning_tokens = icd10.meaning.str.split(' ')
icd10['code'] = meaning_tokens.str[0]
# Constant columns written to the PheWAS input file: one occurrence per row,
# ICD10 vocabulary
icd10['count'] = 1
icd10['vocabulary_id'] = 'ICD10'

In [None]:
# Save just the codes for PheWAS
# Written to a local scratch file, uploaded to OUTPUT_PATH, then the local copy is removed.
icd_file = 'PheWAS_ICD_input.csv'
icd10['id'] = icd10.Sample
icd_columns = ['id', 'code', 'count', 'vocabulary_id']
icd10.loc[:, icd_columns].to_csv(icd_file, index=False)
dxpy.upload_local_file(icd_file, folder=OUTPUT_PATH, parents=True)
os.remove(icd_file)

In [None]:
# Save the covariate information
# One row per sample: year of birth, sex and the first four principal components.
covariate_file = 'PheWAS_covariates.csv'
covariate_columns = ['yob', 'sex', 'PC1', 'PC2', 'PC3', 'PC4']
cov = pheno[['Sample'] + covariate_columns].copy()
cov['id'] = cov.Sample
cov.loc[:, ['id'] + covariate_columns].to_csv(covariate_file, index=False)
dxpy.upload_local_file(covariate_file, folder=OUTPUT_PATH, parents=True)
os.remove(covariate_file)

In [None]:
# Save the genotype information
# Attach the 16p12.1 deletion flag to every covariate row.
# NOTE(review): assumes cnv_qc holds a single row per Sample; duplicated QC rows
# would duplicate samples in the output -- confirm.
genotype_file = 'PheWAS_genotype.csv'
deletion_status = cnv_qc[['Sample', '16p12_del']]
geno = cov.merge(deletion_status, how='left', on='Sample')
geno.loc[:, ['id', '16p12_del']].to_csv(genotype_file, index=False)
dxpy.upload_local_file(genotype_file, folder=OUTPUT_PATH, parents=True)
os.remove(genotype_file)