In [1]:
import pandas as pd
from scipy.stats import binom_test


In [2]:
# Phenotype table: keep the first row seen for each subject, then index the
# frame by subject while also retaining 'subject' as a regular column.
pheno = (
    pd.read_csv('../data/pheno_final.tsv', sep='\t')
    .drop_duplicates('subject')
    .set_index('subject', drop=False)
)

# Family table, indexed by subject; the classification cell below reads its
# 'carrier_parent' and 'non_carrier_parent' columns.
rel = (
    pd.read_csv('../data/family_summaries.tsv', sep='\t')
    .set_index('subject', drop=False)
)

In [3]:
# ASE calls table; later cells read its 'subject', 'ensembl' and
# 'cadd_greater_than_25' columns.
ase = pd.read_csv(
    'ase_cadd_annotated.tsv',
    sep='\t',
)

# Classify each subject's ASE genes as inherited or de novo

In [4]:
# Hard-coded subject ID lists.
# NOTE(review): none of these three lists is referenced in the cells visible
# here -- presumably they are used for downstream grouping; confirm or remove.
affected_child = [
    'SG001',
    'SG011',
    'SG021',
    'SG026',
    'SG155',
    'SG041',
    'SG042',
    'SG037',
    'SG148',
    'SG022',
]
carrier_parents = [
    'SG006',
    'SG025',
    'SG069',
    'SG040',
    'SG044',
    'SG151',
]
noncarrier_parents = [
    'SG007',
    'SG024',
    'SG027',
    'SG039',
    'SG043',
    'SG152',
]

# Per-subject summary table: one row for every subject in `pheno`.
df = pd.DataFrame(index=pheno.index)
df['subject'] = df.index.to_series()
df['family'] = pheno.loc[df.subject, 'family'].to_list()

# Number of ASE rows recorded for the subject (filled in by the loop below).
df['ase_count'] = 0

# Inheritance counts. '.' is the placeholder for "not computed"; the loop below
# overwrites it with an integer only for subjects that appear in `rel`.
# The columns are created with an explicit object dtype because they hold a mix
# of the '.' string and integers: on pandas versions that infer a dedicated
# string dtype for all-string columns, writing an int into such a column raises
# a TypeError. The written TSV is unchanged by this.
for placeholder_col in ('ase_de_novo', 'ase_inh_both', 'ase_inh_cp', 'ase_inh_ncp'):
    df[placeholder_col] = pd.Series('.', index=df.index, dtype=object)

# Number of the subject's ASE rows flagged 'X' in `cadd_greater_than_25`.
# (The misspelled column name is kept as-is so the output header does not change.)
df['ase_deletrious_variant_overexpressed'] = 0

# Group the ASE table once up front instead of re-filtering the whole frame
# with boolean masks several times per subject.
# sort=False: key order is irrelevant for dictionary lookups.
# Rows whose subject is missing are dropped by groupby, which matches the
# original `ase.subject == <id>` comparison never selecting them.
genes_by_subject = (
    ase.groupby('subject', sort=False)['ensembl'].apply(list).to_dict()
)
high_cadd_by_subject = (
    ase[ase.cadd_greater_than_25 == 'X']
    .groupby('subject', sort=False)
    .size()
    .to_dict()
)

for child in pheno.index:
    # All ASE genes for this subject, in file order (duplicates are kept, so
    # every count below is a count of ASE rows, not of unique genes).
    c_ase = genes_by_subject.get(child, [])

    df.at[child, 'ase_count'] = len(c_ase)
    df.at[child, 'ase_deletrious_variant_overexpressed'] = high_cadd_by_subject.get(child, 0)

    # Inheritance can only be assessed for subjects listed in the family table;
    # everyone else keeps the '.' placeholders.
    if child not in rel.index:
        continue

    cp = rel.at[child, 'carrier_parent']
    ncp = rel.at[child, 'non_carrier_parent']
    # Sets give constant-time membership tests in the classification below.
    # A parent with no ASE rows yields an empty set.
    cp_ase = set(genes_by_subject.get(cp, []))
    ncp_ase = set(genes_by_subject.get(ncp, []))

    denovo = []
    icp = []
    incp = []
    iboth = []
    for gene in c_ase:
        in_cp = gene in cp_ase
        in_ncp = gene in ncp_ase
        if in_cp and in_ncp:
            # Seen in both parents.
            iboth.append(gene)
        elif in_cp:
            # Seen only in the carrier parent.
            icp.append(gene)
        elif in_ncp:
            # Seen only in the non-carrier parent.
            incp.append(gene)
        else:
            # Seen in neither parent's ASE rows.
            denovo.append(gene)

    df.at[child, 'ase_de_novo'] = len(denovo)
    df.at[child, 'ase_inh_both'] = len(iboth)
    df.at[child, 'ase_inh_cp'] = len(icp)
    df.at[child, 'ase_inh_ncp'] = len(incp)
    


In [5]:

# Write the per-subject summary as a tab-separated file; the index is omitted
# because 'subject' is already present as a column.
df.to_csv(
    'ase_summary.tsv',
    sep='\t',
    index=False,
)