In [12]:
import pandas as pd
from scipy.stats import binom_test
from statsmodels.stats.multitest import multipletests


In [13]:
# Phenotype table: one row per subject (the first occurrence is kept),
# indexed by subject while `subject` also stays available as a column.
pheno = (
    pd.read_csv('../data/pheno_final.tsv', sep='\t')
    .drop_duplicates('subject')
    .set_index('subject', drop=False)
)

# Gene-name mapping, indexed by its `ensembl` column (also kept as a column).
mapp = pd.read_csv('../data/gene_names_mapping_new.tsv', sep='\t').set_index('ensembl', drop=False)

# Rows of the mapping whose biotype is exactly 'protein_coding'.
pmapp = mapp.loc[mapp['biotype'] == 'protein_coding']

# Family summaries, indexed by subject (column kept as well).
rel = pd.read_csv('../data/family_summaries.tsv', sep='\t').set_index('subject', drop=False)

In [14]:
# Read every subject's `<subject>.gene_ae.txt` table, tag its rows with the
# subject ID, and stack all tables into one long frame with a fresh 0..n-1 index.
#
# The tables are collected in a list and concatenated once. Growing a frame
# with DataFrame.append inside the loop re-copies all accumulated rows on every
# iteration, and DataFrame.append is no longer available in pandas >= 2.0.
ase_tables = []

for sub in pheno.index:
    app = pd.read_csv('../../ase_no_replicates/output/{}.gene_ae.txt'.format(sub), sep='\t')
    app['subject'] = sub
    ase_tables.append(app)

# pd.concat raises on an empty list; fall back to an empty frame when there
# are no subjects to load.
ase = pd.concat(ase_tables, ignore_index=True) if ase_tables else pd.DataFrame()

In [15]:
# Add the Ensembl ID: the part of `name` before its first '.'
# (presumably strips a version suffix from the gene ID; confirm against the input files).
ase['ensembl'] = ase['name'].map(lambda versioned_id: versioned_id.partition('.')[0])

In [16]:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6119105/
# Only variants with at least 10 reads were used.
# "At least 10" is an inclusive bound, so rows with exactly 10 reads are kept
# (>=). A strict > comparison would silently drop them.

MIN_TOTAL_READS = 10

ase = ase[ase['totalCount'] >= MIN_TOTAL_READS].copy()

In [17]:
# binomial test
# One exact binomial test per row: aCount successes out of totalCount trials
# against a 50/50 expectation (p=0.5), using binom_test's default alternative.
#
# The two count columns are iterated directly rather than going through
# DataFrame.apply(axis=1), which constructs a Series for every row and does
# not produce a Series of p-values when `ase` has no rows.
# NOTE(review): scipy.stats.binom_test is deprecated in recent SciPy releases in
# favour of scipy.stats.binomtest; confirm the installed version before upgrading.

ase['p'] = [
    binom_test(a_count, total_count, p=0.5)
    for a_count, total_count in zip(ase['aCount'], ase['totalCount'])
]

In [18]:
# FDR correction for each sample
# multipletests(..., method='fdr_bh') is applied separately to each subject's
# p-values; element [1] of its result is the array of corrected p-values,
# returned in the same order as the input.
#
# groupby/transform writes those values back aligned to the original rows and
# yields a numeric column. Pre-filling the column with a '.' string placeholder
# would leave it as object dtype, and masking `ase` once per subject rescans
# the whole frame for every sample.
ase['FDR'] = ase.groupby('subject')['p'].transform(
    lambda pvals: multipletests(pvals, method='fdr_bh')[1]
)



In [20]:
# ase.to_csv('ase_all.tsv', sep='\t', index=False)

In [8]:
# Significance filter: keep the rows whose per-sample FDR is below 0.05.

fdr_passes = ase['FDR'] < 0.05
ase_sig = ase.loc[fdr_passes]

In [9]:
# Protein coding only: keep rows whose Ensembl ID appears in the
# protein-coding subset of the mapping table.

is_protein_coding = ase_sig['ensembl'].isin(pmapp['ensembl'])
ase_sig = ase_sig.loc[is_protein_coding].copy()

In [10]:
# Add gene name: look up each row's Ensembl ID in the mapping table (indexed
# by ensembl) and take the matching `Description` value, in row order.

description_by_ensembl = mapp['Description']
ase_sig['gene'] = description_by_ensembl.loc[ase_sig['ensembl'].to_list()].to_list()

In [11]:
# Write the filtered, annotated table as tab-separated text without the index column.
output_path = 'ase.tsv'
ase_sig.to_csv(output_path, sep='\t', index=False)