In [1]:
import pandas as pd
import numpy as np
import dxpy
import os

In [None]:
# Input and output files (edit these placeholders before running the notebook)
INPUT_BURDEN="/path/to/input/16p12/sample/burden.csv" # Burden table for the 16p12.1 deletion samples: use the output of script 3_identify_samples.ipynb
PHENO="/path/to/input/phenotype/data.csv" # Phenotype table: use the output of script 2_gather_sample_data.ipynb
DATA_ENCODING_PATH="/path/to/data/encoding/files/" # UKB phenotypes use specific encodings. Add the encoding TSV files (named coding<ID>.tsv) to this directory so the phenotypes can be parsed
OUTPUT_PATH="/path/to/output/files" # Destination folder handed to dxpy.upload_local_file when the results are saved

In [None]:
# Load the burden table for the 16p12.1 deletion samples (INPUT_BURDEN).
# Its Sample column is used further down to subset the phenotype table.
df=pd.read_csv(INPUT_BURDEN)

In [None]:
# Load the phenotype table and replace its UKB field-ID headers with readable names
pheno = pd.read_csv(PHENO)

# Readable column name -> UKB field ID
field_name_dict = {
    'Sample': 'eid',
    'yob': 'p34',
    'sex': 'p22001',
    'white_british_genetic': 'p22006',
    'ethnic_background1': 'p21000_i0',
    'ethnic_background2': 'p21000_i1',
    'ethnic_background3': 'p21000_i2',
    'ethnic_background4': 'p21000_i3',
    'recommend_for_exclusion': 'p22010',
    'icd10': 'p41270',
    'depressed_mood': 'p20446',
    'lost_interest': 'p20441',
    'sleep_trouble1': 'p1200_i0',
    'sleep_trouble2': 'p1200_i1',
    'sleep_trouble3': 'p1200_i2',
    'sleep_trouble4': 'p1200_i3',
    'mood_lability1': 'p1920_i0',
    'mood_lability2': 'p1920_i1',
    'mood_lability3': 'p1920_i2',
    'mood_lability4': 'p1920_i3',
    'anxious_feeling': 'p20421',
    'worried_more': 'p20425',
    'drugs_for_anxiety': 'p20549',
    'anxiety_interferes_life': 'p20418',
    'drinking_interferes_life': 'p20407',
    'unable_stop_drinking': 'p20413',
    'drug_addiction': 'p20456',
    'unreal_sounds': 'p20463',
    'unreal_visions': 'p20471',
    'conspiracy': 'p20468',
}
# Genetic principal components PC1..PC10 live in the array field p22009
field_name_dict.update(
    ('PC{}'.format(n), 'p22009_a{}'.format(n)) for n in range(1, 11)
)

# UKB field ID -> readable column name
inv_map = dict(zip(field_name_dict.values(), field_name_dict.keys()))

# Every column of the file must have an entry in inv_map; an unknown
# field ID raises KeyError here.
cols = [inv_map[field_id] for field_id in pheno.columns]
pheno.columns = cols

In [4]:
# Keep only the phenotype rows whose Sample appears in the burden table
deletion_samples = df['Sample'].to_list()
pheno = pheno.loc[pheno['Sample'].isin(deletion_samples)]

In [5]:
# Pull the sample ID plus the questionnaire answers into their own frame.
# .copy() detaches the result from pheno so later edits do not touch it.
questionnaire_columns = [
    'Sample',
    'depressed_mood', 'lost_interest',
    'sleep_trouble1', 'sleep_trouble2', 'sleep_trouble3', 'sleep_trouble4',
    'mood_lability1', 'mood_lability2', 'mood_lability3', 'mood_lability4',
    'anxious_feeling', 'worried_more', 'drugs_for_anxiety', 'anxiety_interferes_life',
    'drinking_interferes_life', 'unable_stop_drinking', 'drug_addiction',
    'unreal_sounds', 'unreal_visions', 'conspiracy',
]
quest = pheno[questionnaire_columns].copy()

In [None]:
# Update data encoding
# Questionnaire column -> ID of the coding table used to decode its values
coding_dict = {
    'depressed_mood': '503',
    'lost_interest': '503',
    'anxious_feeling': '502',
    'worried_more': '502',
    'drugs_for_anxiety': '1405',
    'anxiety_interferes_life': '510',
    'drinking_interferes_life': '523',
    'unable_stop_drinking': '523',
    'drug_addiction': '502',
    'unreal_sounds': '502',
    'unreal_visions': '502',
    'conspiracy': '502',
}
# The four sleep_trouble / mood_lability columns each share one coding table
for instance in range(1, 5):
    coding_dict.update({
        'sleep_trouble{}'.format(instance): '100343',
        'mood_lability{}'.format(instance): '100349',
    })

# Replace missing answers with '' before decoding. Presumably '' is not a key
# in any coding table, so these cells come back as NaN from .map() below.
quest.fillna('', inplace=True)

# Decode every questionnaire column from its coded value to the
# human-readable meaning found in the matching coding<ID>.tsv file.
#
# The files are read from the directory held in DATA_ENCODING_PATH. The
# previous version used the literal text 'DATA_ENCODING_PATH/...' as the
# path, so the configured directory was never consulted.
#
# Several columns share a coding table, so each table is read once and reused.
#
# NOTE: this step is not idempotent - running it again on already-decoded
# columns sends the meaning strings through the tables a second time.
encodings = {}
for col, coding_id in coding_dict.items():
    if coding_id not in encodings:
        coding_file = os.path.join(DATA_ENCODING_PATH, 'coding' + coding_id + '.tsv')
        coding = pd.read_csv(coding_file, sep='\t')
        # 'coding' column -> 'meaning' column lookup
        encodings[coding_id] = coding.set_index('coding')['meaning'].to_dict()

    # Values without an entry in the lookup become NaN
    quest[col] = quest[col].map(encodings[coding_id])

In [7]:
# Convert questionnaire responses into binary results for broad phenotypes
# Questionnaire column -> '|'-separated answers that count as a positive response
positive_response = {
    'depressed_mood': 'Yes',
    'lost_interest': 'Yes',
    'anxious_feeling': 'Yes',
    'worried_more': 'Yes',
    'drugs_for_anxiety': 'Unprescribed medication (more than once)|Medication prescribed to you (for at least two weeks)|Drugs or alcohol (more than once)',
    'anxiety_interferes_life': 'Somewhat|A lot',
    'drinking_interferes_life': 'Less than monthly|Monthly|Weekly|Daily or almost daily',
    'unable_stop_drinking': 'Less than monthly|Monthly|Weekly|Daily or almost daily',
    'drug_addiction': 'Yes',
    'unreal_sounds': 'Yes',
    'unreal_visions': 'Yes',
    'conspiracy': 'Yes',
}
# All four sleep_trouble / mood_lability columns use the same positive answers
for instance in range(1, 5):
    positive_response.update({
        'sleep_trouble{}'.format(instance): 'Sometimes|Usually',
        'mood_lability{}'.format(instance): 'Yes',
    })

# Broad phenotype -> questionnaire columns that contribute to it
sleep_items = ['sleep_trouble' + str(n) for n in (1, 2, 3, 4)]
mood_items = ['mood_lability' + str(n) for n in (1, 2, 3, 4)]
pheno_groups = {
    'depression': ['depressed_mood', 'lost_interest'],
    'sleep': sleep_items,
    'mood': mood_items,
    'anxiety': ['anxious_feeling', 'worried_more', 'drugs_for_anxiety', 'anxiety_interferes_life'],
    'addiction': ['drinking_interferes_life', 'unable_stop_drinking', 'drug_addiction'],
    'psychosis': ['unreal_sounds', 'unreal_visions', 'conspiracy'],
}

# Derive one column per broad phenotype:
#   1   -> at least one contributing question has a positive answer
#   NaN -> every contributing question is null
#   0   -> otherwise
for group, questions in pheno_groups.items():
    quest[group] = 0
    all_unanswered = quest[questions].isnull().all(axis=1)
    quest.loc[all_unanswered, group] = np.nan
    for question in questions:
        accepted = positive_response[question].split('|')
        is_positive = quest[question].isin(accepted)
        quest.loc[is_positive, group] = 1

In [None]:
# Save questionnaire responses to file:
# write the CSV locally, upload it to OUTPUT_PATH, then delete the local copy
local_csv = 'questionnaire_phenotypes.csv'
phenotype_columns = ['Sample', 'depression', 'sleep', 'mood', 'anxiety', 'addiction', 'psychosis']
quest[phenotype_columns].to_csv(local_csv, index=False)
dxpy.upload_local_file(local_csv, folder=OUTPUT_PATH, parents=True)
os.remove(local_csv)