In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import dxpy
import os

In [None]:
# Input and output locations (placeholder paths - edit before running)

# Output for 16p12.1 deletion samples from script 3_identify_samples.ipynb
BURDEN_16P = "/path/to/input/16p12/sample/burden.csv"
# Output for control samples from script 3_identify_samples.ipynb
BURDEN_CONTROL = "/path/to/input/control/sample/burden.csv"
# Output of script 2_gather_sample_data.ipynb
PHENO = "/path/to/input/phenotype/data.csv"
# UKB ICD10 phenotypes are encoded using Data-coding 19
DATA_CODING_19_PATH = "/path/to/coding19.tsv"
# Destination folder passed to dxpy.upload_local_file for every result file
OUTPUT_PATH = "/path/to/output/files"

In [None]:
# Load the per-sample tables produced by 3_identify_samples.ipynb:
#   df  - 16p12.1 deletion samples
#   df2 - control samples
# Later cells only read the `Sample` column of each to define the two cohorts.
df, df2 = (pd.read_csv(path) for path in (BURDEN_16P, BURDEN_CONTROL))

In [None]:
# Load the phenotype table; its columns are looked up below by UKB field ID
pheno = pd.read_csv(PHENO)

# Readable column name -> UKB field ID
field_name_dict = {
    'Sample': 'eid',
    'yob': 'p34',
    'sex': 'p22001',
    'white_british_genetic': 'p22006',
    'ethnic_background1': 'p21000_i0',
    'ethnic_background2': 'p21000_i1',
    'ethnic_background3': 'p21000_i2',
    'ethnic_background4': 'p21000_i3',
    'recommend_for_exclusion': 'p22010',
    'icd10': 'p41270',
    'depressed_mood': 'p20446',
    'lost_interest': 'p20441',
    'sleep_trouble1': 'p1200_i0',
    'sleep_trouble2': 'p1200_i1',
    'sleep_trouble3': 'p1200_i2',
    'sleep_trouble4': 'p1200_i3',
    'mood_lability1': 'p1920_i0',
    'mood_lability2': 'p1920_i1',
    'mood_lability3': 'p1920_i2',
    'mood_lability4': 'p1920_i3',
    'anxious_feeling': 'p20421',
    'worried_more': 'p20425',
    'drugs_for_anxiety': 'p20549',
    'anxiety_interferes_life': 'p20418',
    'drinking_interferes_life': 'p20407',
    'unable_stop_drinking': 'p20413',
    'drug_addiction': 'p20456',
    'unreal_sounds': 'p20463',
    'unreal_visions': 'p20471',
    'conspiracy': 'p20468',
}
# PC1..PC10 map to p22009_a1..p22009_a10
field_name_dict.update({'PC' + str(n): 'p22009_a' + str(n) for n in range(1, 11)})

# Field ID -> readable name. Every column of `pheno` must be a known field ID;
# an unmapped column raises KeyError here.
inv_map = {field_id: name for name, field_id in field_name_dict.items()}
cols = [inv_map[field_id] for field_id in pheno.columns.to_list()]
pheno.columns = cols

In [4]:
# Get ICD10 codes for 16p12.1 deletion samples and controls
cohort_samples = df.Sample.to_list() + df2.Sample.to_list()

# Keep only cohort samples that have an ICD10 entry, and only the two columns needed.
# A single .loc selection followed by .copy() yields an independent frame, so the
# column assignments below do not write into a slice of the full phenotype table
# (the pattern that triggers SettingWithCopyWarning).
in_cohort_with_codes = pheno.Sample.isin(cohort_samples) & pheno.icd10.notnull()
pheno = pheno.loc[in_cohort_with_codes, ['Sample', 'icd10']].copy()

# The icd10 value is handled as one string holding a bracketed, ', '-separated list:
# drop the literal brackets, then split into a Python list of codes.
pheno['icd10'] = pheno.icd10.str.replace('[', '', regex=False).str.replace(']', '', regex=False)
pheno['coding'] = pheno.icd10.str.split(', ')

# One row per (Sample, code); the original row index is repeated for each code
icd10 = pheno.explode('coding')

In [None]:
# Load the Data-coding 19 table used to map codings to ICD10 terms
coding = pd.read_csv(DATA_CODING_19_PATH, sep='\t')

# Tag every row with its level in the ICD10 hierarchy:
#   Chapter   (e.g. "Chapter I Certain infectious and parasitic diseases")
#   Block     (e.g. "A00-A09 Intestinal infectious diseases")
#   Sub-block (e.g. "A00 Cholera")
# Rows matching none of the rules keep the placeholder label '.'
is_chapter = coding.coding.str.contains('Chapter')
is_block = coding.coding.str.contains('Block')
meaning_has_period = coding.meaning.str.contains('\\.')

coding['label'] = '.'
coding.loc[is_chapter, 'label'] = 'Chapter'
# Assigned after 'Chapter', so a row matching both patterns ends up as 'Block'
coding.loc[is_block, 'label'] = 'Block'
# Sub-block: no period in `meaning`, and the row is neither a Block nor a Chapter
coding.loc[~meaning_has_period & ~is_block & ~is_chapter, 'label'] = 'Sub-block'

In [6]:
# Represent the coding table as a directed graph with edges running parent -> child
hierarchy = nx.DiGraph()
hierarchy.add_nodes_from(coding.node_id.to_list())

# Rows whose parent_id is 0 contribute no edge
has_parent = coding.parent_id != 0
parent = list(map(tuple, coding.loc[has_parent, ['parent_id', 'node_id']].to_numpy()))
hierarchy.add_edges_from(parent)

In [7]:
# Assign each node a chapter, block, and sub-block
# For every row of `coding`, take the node itself plus all of its ancestors in
# `hierarchy`, and record the node_id of the lineage member labelled 'Chapter',
# 'Block' and 'Sub-block'. A level with no labelled member is recorded as '.'.
# If several lineage members carry the same label, the one appearing first in
# `coding` row order is used.
levels = ['Chapter', 'Block', 'Sub-block']

# (node_id, label) -> (row position, node_id) of the first coding row with that pair.
# A dictionary lookup replaces filtering the whole table once per node and level.
first_row = {}
for pos, (nid, lab) in enumerate(zip(coding.node_id.to_list(), coding.label.to_list())):
    first_row.setdefault((nid, lab), (pos, nid))

level_ids = {lab: [] for lab in levels}
for node in coding.node_id.to_list():
    # nx.ancestors returns every node with a path to `node`; the node is added back
    # so a Chapter/Block/Sub-block row is assigned to itself at its own level.
    lineage = nx.ancestors(hierarchy, node) | {node}
    for lab in levels:
        hits = [first_row[(member, lab)] for member in lineage if (member, lab) in first_row]
        # min() orders by row position; [1] is the node_id as stored in `coding`
        level_ids[lab].append(min(hits)[1] if hits else '.')

chapters = level_ids['Chapter']
blocks = level_ids['Block']
subblocks = level_ids['Sub-block']

coding['Chapter'] = chapters
coding['Block'] = blocks
coding['Sub-block'] = subblocks

In [8]:
# Annotate every sample-level ICD10 code with its node_id, meaning and hierarchy levels.
# Left join on `coding`: codes with no match in the coding table are kept, with
# missing annotation values.
annotation_cols = ['node_id', 'coding', 'meaning', 'Chapter', 'Block', 'Sub-block']
icd10 = icd10.merge(coding[annotation_cols], how='left', on='coding')

In [9]:
# Reformat sample data by Chapter
# Count codes per (Sample, Chapter) and spread chapters into columns;
# Sample/Chapter pairs with no codes are filled with 0.
chapter_counts = (
    icd10.groupby(['Sample', 'Chapter'])['coding']
    .count()
    .unstack('Chapter', fill_value=0)
)

# Collapse counts to a 0/1 presence flag in a single pass (any count >= 1 becomes 1),
# then move Sample from the index back into a column.
chapdf = chapter_counts.clip(upper=1).astype(int).reset_index()

In [10]:
# Subset and rename relevant chapter columns
# Chapter node_id -> column name used in the output tables.
# NOTE: 'Skin/subcutaeous tissue' is misspelled but kept unchanged because it is an
# output column name that other scripts may read.
chapter_names = {
    20: 'Neoplasms',
    30: 'Blood',
    40: 'Endocrine/Metabolic',
    50: 'Mental/behavioral disorders',
    60: 'Nervous system',
    70: 'Eye',
    80: 'Ear',
    90: 'Circulatory system',
    100: 'Respiratory system',
    110: 'Digestive system',
    120: 'Skin/subcutaeous tissue',
    130: 'Musc. system/connective tissue',
    140: 'Genitourinary system',
    150: 'Pregnancy/childbirth',
    170: 'Congenital malformations',
}

# reindex (rather than chapdf[[...]]) so that a chapter with no codes in this cohort
# becomes an all-zero column instead of raising KeyError. Renaming by explicit
# mapping keeps each id tied to its name regardless of column order.
chapdf = (
    chapdf.reindex(columns=['Sample'] + list(chapter_names), fill_value=0)
    .rename(columns=chapter_names)
    .rename_axis(columns=None)
)

In [None]:
# Save the chapter table for 16p12.1 deletion samples and upload it to OUTPUT_PATH
local_csv = 'ICD10_16p12_Chapter.csv'
try:
    chapdf[chapdf.Sample.isin(df.Sample.to_list())].to_csv(local_csv, index=False)
    dxpy.upload_local_file(local_csv, folder=OUTPUT_PATH, parents=True)
finally:
    # Remove the local scratch copy even when writing or uploading fails
    if os.path.exists(local_csv):
        os.remove(local_csv)

In [None]:
# Save the chapter table for control samples and upload it to OUTPUT_PATH
local_csv = 'ICD10_control_Chapter.csv'
try:
    chapdf[chapdf.Sample.isin(df2.Sample.to_list())].to_csv(local_csv, index=False)
    dxpy.upload_local_file(local_csv, folder=OUTPUT_PATH, parents=True)
finally:
    # Remove the local scratch copy even when writing or uploading fails
    if os.path.exists(local_csv):
        os.remove(local_csv)

In [13]:
# Gather broad phenotype information from ICD10 codes
# ICD10 codes we need:
# Node ID - meaning [type]
# 29110 - F32 Depressive episode [Sub-block]
# 29940 - F51 Nonorganic sleep disorders [Sub-block]
# 29420 - F41 Other anxiety disorders [Sub-block]
# 27560 - F10.2 Dependence syndrome (Alcohol) [node]
# [27670, 27780, 27890, 28000, 28110, 28220, 28440, 28550] -  F[11-16, 18-19].2 Dependence syndrome (Drugs) [node]
# 780 - F20-29 Schizophrenia, schizotypal and delusional disorders [Block]
nodes = {27560: 'addiction', 27670: 'addiction', 27780: 'addiction', 27890: 'addiction',
         28000: 'addiction', 28110: 'addiction', 28220: 'addiction', 28440: 'addiction', 28550: 'addiction'}
subblock = {29940: 'sleep', 29110: 'depression', 29420: 'anxiety'}
block = {780: 'psychosis'}

# One row per sample that has at least one ICD10 code; every phenotype starts at 0
phenotypes = ['sleep', 'addiction', 'depression', 'anxiety', 'psychosis']
icd_interp = pd.DataFrame(0, index=list(icd10['Sample'].unique()), columns=phenotypes)

# Each lookup is matched against the icd10 column holding that hierarchy level.
# `block` is read here too, so the psychosis rule has a single definition.
for level_col, lookup in (('node_id', nodes), ('Sub-block', subblock), ('Block', block)):
    for node_id, phenotype in lookup.items():
        carriers = icd10.loc[icd10[level_col] == node_id, 'Sample'].to_list()
        icd_interp.loc[carriers, phenotype] = 1

# Expose the sample ID as the first column (the index still holds it as well)
icd_interp['Sample'] = icd_interp.index.to_list()
icd_interp = icd_interp[['Sample'] + phenotypes]

In [None]:
# Save the ICD10-derived phenotype table and upload it to OUTPUT_PATH
local_csv = 'ICD_interpretation.csv'
try:
    icd_interp.to_csv(local_csv, index=False)
    dxpy.upload_local_file(local_csv, folder=OUTPUT_PATH, parents=True)
finally:
    # Remove the local scratch copy even when writing or uploading fails
    if os.path.exists(local_csv):
        os.remove(local_csv)