In [1]:
import pandas as pd
import random
import dxpy
import os

In [None]:
# Input and output files

# Output of script 1_Variant calling/UKB/2_CNV_annotation/6_frequency_filter.py
CNV_INPUT = "/path/to/input/cnvs.bed"

# QC file output of 1_Variant calling/UKB/2_CNV_annotation/2_check_QC.py
CNV_QC_INPUT = "/path/to/input/cnv/qc/data.csv"

# CNV burden table generated in script 1_parse_burden.ipynb
CNV_BURDEN = "/path/to/input/cnv/burden/data.csv"

# SNV burden table generated in script 1_parse_burden.ipynb
SNV_BURDEN = "/path/to/input/snv/burden/data.csv"

# Output of script 2_gather_sample_data.ipynb
PHENO = "/path/to/input/phenotype/data.csv"

# After the initial analysis, we were informed that some samples withdrew consent
WITHDRAWN_CONSENT = "/path/to/file/with/samples/that/withdrew/consent.csv"

# Folder that result files are written to
OUTPUT_PATH = "/path/to/output/files"

In [None]:
# Identify 16p12.1 deletion samples and relevant controls
cnvs = pd.read_csv(CNV_INPUT, sep='\t')
cnvs['Sample'] = cnvs.Sample.astype(int)

# Carriers: samples with a deletion annotated as 16p12.1
samp_16p = cnvs[(cnvs.Pathogenic_Name == '16p12.1') & (cnvs.Type == 'DEL')].Sample.to_list()
# Samples to drop from the control pool: any CNV of at least 500 kb, or any CNV
# whose Pathogenic_Name is not '.' (presumably '.' means "no annotation" - confirm)
to_rm = list(cnvs[(cnvs.Length >= 500000) | (cnvs.Pathogenic_Name != '.')].Sample.unique())

cnv_qc = pd.read_csv(CNV_QC_INPUT, sep='\t', low_memory=False)
# Keep samples passing both QC flags. Copy so that the column assignments below
# write into an independent frame instead of a slice of the unfiltered table.
cnv_qc = cnv_qc[(cnv_qc.Pass) & (cnv_qc.X_Pass)].copy()

# Sample ID = text before the first '_' in the last '/'-separated path component.
# The .str accessor is used for "last element": with expand=True the resulting
# columns are labelled 0..k-1, so indexing with [-1] looks up a label that does
# not exist (KeyError) rather than selecting the last column.
cnv_qc['Sample'] = cnv_qc.File.str.split('/').str[-1].str.split('_').str[0].astype(int)
# Carriers are kept even if they also appear in to_rm
cnv_qc = cnv_qc[(~cnv_qc.Sample.isin(to_rm)) | (cnv_qc.Sample.isin(samp_16p))].copy()


cnv_qc['Case_Control'] = 'No CNV Control'
cnv_qc.loc[cnv_qc.Sample.isin(samp_16p), 'Case_Control'] = '16p12.1 deletion'
df = cnv_qc[['Sample', 'Case_Control']].copy()

In [None]:
# Add CNV burden
cnv_burden = pd.read_csv(CNV_BURDEN)

# Left join keeps every sample already in df. All samples have CNV data, so any
# value still missing after the join is filled with 0.
df = df.merge(cnv_burden, how='left', on='Sample').fillna(0)

In [None]:
# Add SNV burden
# Left join: samples absent from the SNV table keep NaN in the SNV columns
# (no fill is applied here, unlike the CNV burden step).
snv_burden = pd.read_csv(SNV_BURDEN)
df = df.merge(snv_burden, how='left', on='Sample')

In [None]:
# Load the phenotype table. It supplies the age and sex information used to
# identify age- and sex-matched controls for the 16p12.1 deletion carriers.
pheno = pd.read_csv(PHENO)

In [6]:
# Rename columns
# field_name_dict maps readable column name -> field ID used in the phenotype header
field_name_dict = {
    'Sample': 'eid',
    'yob': 'p34',
    'sex': 'p22001',
    'white_british_genetic': 'p22006',
    'ethnic_background1': 'p21000_i0',
    'ethnic_background2': 'p21000_i1',
    'ethnic_background3': 'p21000_i2',
    'ethnic_background4': 'p21000_i3',
    'recommend_for_exclusion': 'p22010',
    'icd10': 'p41270',
    'depressed_mood': 'p20446',
    'lost_interest': 'p20441',
    'sleep_trouble1': 'p1200_i0',
    'sleep_trouble2': 'p1200_i1',
    'sleep_trouble3': 'p1200_i2',
    'sleep_trouble4': 'p1200_i3',
    'mood_lability1': 'p1920_i0',
    'mood_lability2': 'p1920_i1',
    'mood_lability3': 'p1920_i2',
    'mood_lability4': 'p1920_i3',
    'anxious_feeling': 'p20421',
    'worried_more': 'p20425',
    'drugs_for_anxiety': 'p20549',
    'anxiety_interferes_life': 'p20418',
    'drinking_interferes_life': 'p20407',
    'unable_stop_drinking': 'p20413',
    'drug_addiction': 'p20456',
    'unreal_sounds': 'p20463',
    'unreal_visions': 'p20471',
    'conspiracy': 'p20468',
}
# PC1..PC10 correspond to p22009_a1..p22009_a10
for i in range(1, 11):
    field_name_dict['PC{}'.format(i)] = 'p22009_a{}'.format(i)

# Invert to field ID -> readable name for the lookup below
inv_map = dict((field_id, name) for name, field_id in field_name_dict.items())
# Every column of pheno must be present in the map; an unknown header raises KeyError
cols = [inv_map[field_id] for field_id in pheno.columns]
pheno.columns = cols

In [None]:
# Attach year of birth and sex to the working table. The inner join drops any
# sample that has no row in the phenotype table.
age_sex = pheno.loc[:, ['Sample', 'yob', 'sex']]
df = df.merge(age_sex, how='inner', on='Sample')

In [None]:
# After the initial analysis, we were informed that additional individuals had
# withdrawn consent. Remove those samples, along with any non-positive sample IDs.
withdrawl = pd.read_csv(WITHDRAWN_CONSENT, header=None, names=['Sample'])
has_valid_id = df.Sample > 0
is_withdrawn = df.Sample.isin(withdrawl.Sample.to_list())
df = df[has_valid_id & ~is_withdrawn]

No CNV Control      407312
16p12.1 deletion       249
Name: Case_Control, dtype: int64
(407561, 16)
(407561, 16)
(407544, 16)
No CNV Control      407295
16p12.1 deletion       249
Name: Case_Control, dtype: int64


In [9]:
# Identify age and sex matched controls for comparison
# Stratum key is "<yob>.<sex>"
df['age_sex'] = df['yob'].astype(str) + '.' + df['sex'].astype(str)

# Only samples with SNV data (non-null Missense) take part in the matching
has_snv = df['Missense'].notnull()
is_case = df['Case_Control'] == '16p12.1 deletion'
is_control = df['Case_Control'] == 'No CNV Control'

# Number of carriers per stratum
case_as = df.loc[is_case & has_snv, 'age_sex'].value_counts().to_frame()
case_as.columns = ['case_num']

# Number of available controls in each carrier stratum, and how many whole
# controls per carrier that stratum could supply
control_counts = df.loc[is_control & has_snv, 'age_sex'].value_counts().to_dict()
case_as['nocnv_num'] = case_as.index.map(control_counts)
case_as['nocnv_max_vals'] = (case_as['nocnv_num'] / case_as['case_num']).astype(int)

# Choose the number of controls needed for each stratum: the smallest
# per-stratum ratio is applied everywhere so all strata use the same ratio
smallest_ratio = min(case_as['nocnv_max_vals'].to_numpy())
case_as['nocnv_controls_needed'] = case_as['case_num'] * smallest_ratio

In [10]:
# Randomly select the needed controls
# Candidate pool: every non-carrier that has SNV data (non-null Missense)
contdf = df[(df.Case_Control != '16p12.1 deletion') & (~df.Missense.isnull())].copy()
age_sex = sorted(case_as.index.unique())

selected = set()
for ags in age_sex:
    # Filter once per stratum and sample directly from that list
    poss_samps = contdf.loc[contdf.age_sex == ags, 'Sample'].to_list()

    # No CNV
    needed = int(case_as.loc[ags, 'nocnv_controls_needed'])
    # The seed is reset for every stratum on purpose: this reproduces exactly
    # the same draws as before. Moving it out of the loop would change which
    # controls are selected.
    random.seed(205)
    selected.update(random.sample(poss_samps, needed))

# Flag all chosen samples in a single pass instead of one isin/.loc per stratum
contdf['keep'] = contdf.Sample.isin(selected)
contdf = contdf[contdf.keep]

In [11]:
# Save control samples to file
# A single source -> output column map (in output order) replaces two parallel
# positional lists, so a column cannot be silently mislabelled.
control_column_names = {
    'Sample': 'Sample', 'yob': 'YOB', 'sex': 'Sex', 'age_sex': 'age_sex',
    'All_coding_SNVs': 'All coding SNVs', 'All_coding_SNVs_LF': 'All coding SNVs (LF)',
    'Missense': 'Missense', 'Missense_LF': 'Missense (LF)',
    'LOF': 'LOF', 'LOF_LF': 'LOF (LF)',
    'Splice': 'Splice', 'Splice_LF': 'Splice (LF)',
    'Genes_del': 'Genes del.', 'Genes_del_LF': 'Genes del. (LF)',
    'Genes_dup': 'Genes dup.', 'Genes_dup_LF': 'Genes dup. (LF)',
}
# Burden columns are stored as integers in the output
int_cols = ['All coding SNVs', 'All coding SNVs (LF)', 'Missense', 'Missense (LF)', 'LOF', 'LOF (LF)',
            'Splice', 'Splice (LF)', 'Genes del.', 'Genes del. (LF)', 'Genes dup.', 'Genes dup. (LF)']
# Chained select/rename/astype builds a new frame. Nothing is assigned into a
# slice of the filtered table, so no SettingWithCopyWarning is raised.
contdf = (
    contdf[list(control_column_names)]
    .rename(columns=control_column_names)
    .astype({col: int for col in int_cols})
)

In [None]:
# Write the control table to a local CSV, upload it into OUTPUT_PATH, then
# remove the local copy
control_csv = 'control_burden.csv'
contdf.to_csv(control_csv, index=False)
dxpy.upload_local_file(control_csv, folder=OUTPUT_PATH, parents=True)
os.remove(control_csv)

In [13]:
# Save deletion data
# Source column -> output header, listed in output order
deletion_column_names = {
    'Sample': 'Sample', 'yob': 'YOB', 'sex': 'Sex', 'age_sex': 'age_sex',
    'All_coding_SNVs': 'All coding SNVs', 'All_coding_SNVs_LF': 'All coding SNVs (LF)',
    'Missense': 'Missense', 'Missense_LF': 'Missense (LF)',
    'LOF': 'LOF', 'LOF_LF': 'LOF (LF)',
    'Splice': 'Splice', 'Splice_LF': 'Splice (LF)',
    'Genes_del': 'Genes del.', 'Genes_del_LF': 'Genes del. (LF)',
    'Genes_dup': 'Genes dup.', 'Genes_dup_LF': 'Genes dup. (LF)',
}
# Keep only the 16p12.1 deletion carriers and the export columns. Column dtypes
# are left as they are (no integer cast is applied here).
df = df.loc[df.Case_Control == '16p12.1 deletion', list(deletion_column_names)]
df = df.rename(columns=deletion_column_names)

In [None]:
# Write the deletion-carrier table to a local CSV, upload it into OUTPUT_PATH,
# then remove the local copy
deletion_csv = '16p_burden.csv'
df.to_csv(deletion_csv, index=False)
dxpy.upload_local_file(deletion_csv, folder=OUTPUT_PATH, parents=True)
os.remove(deletion_csv)