In [1]:
import pandas as pd

import scipy.stats as stats

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

import dxpy
import os

In [None]:
# ---- Inputs ----
# Both burden tables are produced by script 3_identify_samples.ipynb.
# Per-sample burden for 16p12.1 deletion samples:
BURDEN_16P = "/path/to/16p12.1/sample/burden/data.csv"
# Per-sample burden for control samples:
BURDEN_CONTROL = "/path/to/control/sample/burden/data.csv"

# ---- Outputs ----
# Results written to this directory:
#   (1) histograms of the burden in 16p12.1 deletion samples and controls,
#       as shown in Figure 6C
#   (2) a file with the t-test statistics, as shown in Table S5D
OUTPUT_DIR = "/path/to/output/directory"

In [None]:
# Compare burden between 16p12.1 deletion and control samples.
# Load both per-sample burden tables and tag every row with its cohort so the
# two groups can still be told apart once they are stacked into one frame.
df = pd.read_csv(BURDEN_16P)
df['Case_Control'] = '16p12.1 deletion'
contdf = pd.read_csv(BURDEN_CONTROL)
contdf['Case_Control'] = 'No CNV Control'

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# each CSV contributes its own 0-based row labels, and the resulting duplicate
# labels can make label-based reindexing (e.g. inside plotting libraries) fail
# with "cannot reindex on an axis with duplicate labels".
df = pd.concat([df, contdf], ignore_index=True)

In [3]:
# Remove splice variants from All coding SNVs by subtracting the matching
# splice count from each "All coding SNVs" column (overall and LF).
# NOTE: this overwrites the columns in place, so running the cell a second time
# without reloading `df` subtracts the splice counts again.
for snv_col, splice_col in (
    ('All coding SNVs', 'Splice'),
    ('All coding SNVs (LF)', 'Splice (LF)'),
):
    df[snv_col] = df[snv_col].sub(df[splice_col])

In [4]:
# For every burden column: draw the per-cohort histogram on its own PDF page
# and run a one-tailed t-test of 16p12.1 deletion samples against controls.
burden_cols = ['All coding SNVs', 'Missense', 'LOF', 'All coding SNVs (LF)', 'Missense (LF)', 'LOF (LF)']
group_labels = ['16p12.1 deletion', 'No CNV Control']
group_colors = ['#795C8D', '#AFAFAF']  # same order as group_labels
stat_lst = []

# The file name has to match the one the "Save PDF" cell uploads and removes
# ('Fig6C.pdf'); it was previously written as '2_Fig5B.pdf', so that cell could
# not find the file. The context manager closes the PDF even if a page fails.
with PdfPages('Fig6C.pdf') as pdf:
    for col in burden_cols:
        # Drop samples with no value for this column before plotting/testing
        subdf = df[df[col].notnull()]

        # Plot
        fig, ax = plt.subplots()
        sns.histplot(
            data=subdf,
            x=col,
            hue='Case_Control',
            hue_order=group_labels,
            palette=group_colors,
            kde=True,
            common_norm=False,  # normalise each cohort separately
            stat='proportion',
            discrete=True,
            ax=ax,
        )

        # Run t-test
        del_16p = subdf.loc[subdf['Case_Control'] == '16p12.1 deletion', col].to_numpy()
        control = subdf.loc[subdf['Case_Control'] == 'No CNV Control', col].to_numpy()

        # alternative='less': the alternative hypothesis is that the mean of the
        # first sample (deletion) is less than that of the second (control)
        res = stats.ttest_ind(del_16p, control, alternative='less')

        # Group means are computed once and reused for the table and the plot
        mean_16p = del_16p.mean()
        mean_control = control.mean()
        stat_lst.append([col, 'One-tailed T test', len(del_16p), len(control), mean_16p, mean_control, res.statistic, res.pvalue])

        # Dashed vertical lines mark each cohort's mean, drawn behind the bars
        ax.axvline(mean_16p, color='#795C8D', ls='--', zorder=0)
        ax.axvline(mean_control, color='#AFAFAF', ls='--', zorder=0)

        # Put the p-value at the current top of the axes, then add 10% headroom
        # so the label is not clipped
        lo, hi = ax.get_ylim()
        ax.text(mean_control, hi, 'p=%.3f' % res.pvalue, ha='left', va='bottom')
        ax.set_ylim(lo, hi*1.1)

        ax.set_title(col)
        pdf.savefig(fig)
        plt.close(fig)  # release the figure so figures do not pile up across the loop

# Save results in frame
stat_df = pd.DataFrame(stat_lst, columns=['Variable', 'Test', '16p12.1 del. n', 'Control n', '16p12.1 del. mean', 'Control mean', 'statistic', 'p value'])

In [None]:
# Save results as CSV: write the statistics table to a local file, upload that
# file to OUTPUT_DIR through dxpy, then delete the local copy.
stats_csv = 'control_burden_comparison.csv'
stat_df.to_csv(stats_csv, index=False)
dxpy.upload_local_file(stats_csv, folder=OUTPUT_DIR, parents=True)
os.remove(stats_csv)

In [None]:
# Save PDF: upload the local figure file to OUTPUT_DIR through dxpy, then
# delete the local copy.
figure_pdf = 'Fig6C.pdf'
dxpy.upload_local_file(figure_pdf, folder=OUTPUT_DIR, parents=True)
os.remove(figure_pdf)