In [68]:
import pandas as pd

import vcf

In [69]:
# Load the phenotype table (tab-separated) and index it by subject,
# keeping `subject` available as a regular column as well.
pheno = (
    pd.read_csv('../data/pheno_final.tsv', sep='\t')
    .set_index('subject', drop=False)
)

In [70]:
# Load the ASE table (tab-separated); later cells match on its
# `subject` and `gene` columns.
ase = pd.read_csv('ase.tsv', sep='\t')

In [71]:
# Load the rare-variant table (comma-separated); later cells read its
# `Sample`, `Gene.refGene`, `Chr`, `Pos`, `Alt` and `Ref` columns.
pvs = pd.read_csv('ASE rare CADD variants.csv')

In [72]:
# mark rare variants as ASE or not ASE
#
# A variant is flagged (ASE = 1) when `ase` holds at least one record for the
# same subject and gene, otherwise it is 0. The (subject, gene) pairs are
# collected once into a set, so each variant costs a constant-time lookup
# instead of a full scan of `ase`.
#
# Pairs with a missing subject or gene are dropped first: a missing value
# never compares equal to anything, so such records can never match a variant.
ase_pairs = set(
    ase[['subject', 'gene']].dropna().itertuples(index=False, name=None)
)

is_ase = [
    pair in ase_pairs
    for pair in zip(pvs['Sample'], pvs['Gene.refGene'])
]

# Explicit int64 keeps the column dtype the same on every platform.
pvs['ASE'] = pd.Series(is_ase, index=pvs.index, dtype=bool).astype('int64')

In [73]:
# Keep only the variants flagged as ASE (flag == 1); take an independent
# copy so later column assignments do not touch the unfiltered frame.

pvs = pvs.loc[pvs['ASE'].eq(1)].copy()

In [74]:
def flatten_haps(haps):
    """Expand haplotype blocks into one row per variant.

    Each row of ``haps`` is a block whose ``variants``, ``haplotypeA`` and
    ``haplotypeB`` fields are comma-separated lists of equal length. Every
    list position becomes its own output row; the block-level values
    (``aCount``, ``bCount``, ``blockGWPhase``, ``gwStat``) are repeated on
    each of the block's rows.

    Returns a DataFrame with columns ``index`` (label of the originating
    block row), ``variant``, ``haplotypeA``, ``haplotypeB``, ``aCount``,
    ``bCount``, ``blockGWPhase``, ``gwStat``, ``chrom`` and ``pos``.
    ``chrom`` and ``pos`` are the first and second ``_``-separated fields of
    the variant identifier (``pos`` converted to int). The frame is indexed
    by ``pos``, which is also kept as a column.
    """
    per_variant = []

    for block_label, block in haps.iterrows():
        variant_field = block['variants']
        hap_a_field = block['haplotypeA']
        hap_b_field = block['haplotypeB']
        # values shared by every variant of this block
        block_stats = [block['aCount'], block['bCount'],
                       block['blockGWPhase'], block['gwStat']]

        variant_ids = variant_field.split(',')
        alleles_a = hap_a_field.split(',')
        alleles_b = hap_b_field.split(',')

        # positional indexing (not zip) so a short allele list still raises
        for n, variant_id in enumerate(variant_ids):
            per_variant.append(
                [block_label, variant_id, alleles_a[n], alleles_b[n]] + block_stats
            )

    flat = pd.DataFrame(
        per_variant,
        columns=['index', 'variant', 'haplotypeA', 'haplotypeB',
                 'aCount', 'bCount', 'blockGWPhase', 'gwStat'],
    )

    # split each identifier once, then pick out the two fields of interest
    id_parts = flat['variant'].apply(lambda name: name.split('_'))
    flat['chrom'] = id_parts.apply(lambda parts: parts[0])
    flat['pos'] = id_parts.apply(lambda parts: int(parts[1]))

    return flat.set_index('pos', drop=False)

In [75]:
# Classify each variant by the haplotype that carries its Alt allele.
#
# NOTE: the 'ASE' column is reused here; it now holds a string label:
#   '.'                        not classified yet
#   'not_in_geneae'            no haplotype block near the variant, or the
#                              variant position is absent from those blocks
#   'same_allelic_counts'      both haplotypes have identical counts
#   'on_overexpressed_haplo'   Alt allele is on the haplotype with more counts
#   'on_underexpressed_haplo'  anything else
pvs['ASE'] = '.'

WINDOW = 100000  # distance searched on each side of the variant position

# Each subject's haplotypic-counts table is read from disk once and reused
# for all of that subject's variants.
haps_by_subject = {}

for i, row in pvs.iterrows():
    sub = row['Sample']
    chrom = row['Chr'][3:]  # drops the first three characters (assumed 'chr' prefix)
    pos = row['Pos']
    allele = row['Alt']

    # load GENE AE hap file (cached per subject)
    if sub not in haps_by_subject:
        sub_haps = pd.read_csv(
            '../../ase_no_replicates/output/{}.haplotypic_counts.txt'.format(sub),
            sep='\t')
        sub_haps['contig'] = sub_haps.contig.astype(str)
        haps_by_subject[sub] = sub_haps
    haps = haps_by_subject[sub]

    # keep only blocks on the same contig that lie inside the search window
    nearby = haps[(haps.contig == chrom)
                  & (haps.start > pos - WINDOW)
                  & (haps.stop < pos + WINDOW)]
    if nearby.shape[0] == 0:
        pvs.at[i, 'ASE'] = 'not_in_geneae'
        continue

    # split by variant
    haps_by_variants = flatten_haps(nearby)

    # flatten_haps indexes its result by position, so test the index directly
    if pos not in haps_by_variants.index:
        pvs.at[i, 'ASE'] = 'not_in_geneae'
        continue

    hap_var = haps_by_variants.loc[pos]
    if isinstance(hap_var, pd.DataFrame):
        # a duplicated position yields several rows; the scalar comparisons
        # below would fail, so report the ambiguity explicitly
        raise ValueError(
            'position {} on contig {} occurs {} times in the haplotype blocks '
            'of subject {}'.format(pos, chrom, hap_var.shape[0], sub))

    aCount = hap_var['aCount']
    bCount = hap_var['bCount']
    if aCount == bCount:
        pvs.at[i, 'ASE'] = 'same_allelic_counts'
        continue

    # if/else guarantees a fresh value on every iteration, so an allele from
    # a previous variant can never leak into this comparison
    if aCount > bCount:
        overexp_allele = hap_var['haplotypeA']
    else:
        overexp_allele = hap_var['haplotypeB']

    if overexp_allele == allele:
        pvs.at[i, 'ASE'] = 'on_overexpressed_haplo'
    else:
        pvs.at[i, 'ASE'] = 'on_underexpressed_haplo'
                
        



In [76]:
# Save the variant table, including the haplotype label stored in 'ASE',
# as a tab-separated file without the index column.

pvs.to_csv('cadd.tsv', sep='\t', index=False)

In [77]:
# Number of variants per haplotype label.
pvs['ASE'].value_counts()

on_underexpressed_haplo    122
not_in_geneae               55
on_overexpressed_haplo      25
Name: ASE, dtype: int64

In [78]:
# (rows, columns) of the variant table after classification.
pvs.shape

(202, 14)

In [None]:
# Restrict to variants whose Alt allele was found on the overexpressed haplotype.

cadd = pvs.loc[pvs['ASE'].eq('on_overexpressed_haplo')]

In [None]:
# Annotate the ASE table: put an 'X' in 'cadd_greater_than_25' on the record
# matching each selected variant's subject and gene.
# Only the first matching record (in `ase` row order) is marked.

ase['cadd_greater_than_25'] = ''

for subject_id, gene_name in zip(cadd['Sample'], cadd['Gene.refGene']):
    same_subject = ase.subject == subject_id
    same_gene = ase.gene == gene_name
    first_match = ase.loc[same_subject & same_gene].index[0]
    ase.at[first_match, 'cadd_greater_than_25'] = 'X'

In [None]:
# Save the ASE table, now carrying the 'cadd_greater_than_25' column,
# as a tab-separated file without the index column.

ase.to_csv('ase_cadd_annotated.tsv', sep='\t', index=False)