## Goal: get population-specific exons adhering to the same criteria as our population-specific transcripts
* Detected in >= 2 samples from exactly one population, and in no samples from any other population

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot
import math


p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [12]:
# Load the project configuration. load_config is not defined in this notebook
# (presumably it comes from the scripts.* star imports above -- TODO confirm).
config = load_config()
# Prefix that proc_cfg prepends to each config path after removing '../../'.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a config path under `od`.

    Every occurrence of '../../' is removed from `entry`, and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string as stored in the config
        od (str): Prefix to prepend to the cleaned path

    Returns:
        (str): `od` followed by `entry` with all '../../' removed
    """
    relative = entry.replace('../../', '')
    return od + relative

In [13]:
# Resolve the exon-info and master-table paths from the 'lr' section of the
# config, re-rooting them under `od` via proc_cfg.
exon_info_file = proc_cfg(config['lr']['exon_info'], od)
mt_file = proc_cfg(config['lr']['mt'], od)
# Bare expression so the notebook displays the resolved master-table path.
mt_file

'../data/05_mastertable/29102024_PODER_mastertable.tsv'

In [15]:
def get_pop_spec_exons(exon_info_file,
                       mt_file):
    """
    Flag exons that are population-specific: detected in >= 2 samples of
    exactly one population and in no sample of any other population.

    Prints the number of population-specific exons as a side effect.

    Parameters:
        exon_info_file (str): Tab-separated table with (at least) the
            columns `eid`, `novelty`, and `transcript_id`
        mt_file (str): Tab-separated master table with an `isoform` column
            and one column per sample (assumed to hold 0/1 detection
            flags -- TODO confirm against the master table spec)

    Returns:
        df (pandas DataFrame): One row per (eid, novelty) with columns
            Chromosome, Strand, Start, End (all parsed from `eid`, left as
            strings), novelty, eid, pop_spec (bool), and pop_spec_pop
            (population with the highest sample count; only meaningful
            where pop_spec is True)
    """
    # get exon / transcript / novelty info from the table that
    # already ran
    df = pd.read_csv(exon_info_file, sep='\t')

    # limit to merged-run-mode, non-mixed samples
    meta = load_meta()
    meta = meta.loc[meta.merged_run_mode==True]
    meta = meta.loc[meta.mixed_samples == False]
    samples = meta['sample'].tolist()

    # populations are derived from the same filtered metadata that defines
    # the samples, so every population column created below exists
    pops = meta.population.unique().tolist()

    mt_df = pd.read_csv(mt_file, sep='\t')
    mt_df = mt_df[['isoform']+samples]
    mt_df = mt_df.rename({'isoform': 'transcript_id'}, axis=1)

    # merge novelty info w/ detection info
    df = df.merge(mt_df,
                  how='left',
                  on='transcript_id')

    # dedupe eids, take max. of each sample across the transcripts
    # containing the exon to see if it's det or not
    df = df.drop('transcript_id', axis=1)
    df = df.groupby(['eid', 'novelty']).max().reset_index()

    # number of samples per population in which each exon is detected
    for pop in pops:
        pop_samples = meta.loc[meta.population==pop, 'sample'].tolist()
        df[pop] = df[pop_samples].sum(axis=1)

    df = df.set_index(['eid', 'novelty'])
    df = df.drop(samples, axis=1)

    # define population specificity; restrict to the population columns so
    # that any other column carried over from the exon table is not counted
    df['n_pop'] = (df[pops]>0).sum(axis=1)
    df['pop_spec'] = (df.n_pop==1)&(df[pops]>=2).any(axis=1)
    df['pop_spec_pop'] = df[pops].idxmax(axis=1)

    print(len(df.loc[df.pop_spec==True]))

    df = df.reset_index()

    # eid is <chrom>_<strand>_<start>_<end>; split from the right so contig
    # names that themselves contain underscores stay intact
    df[['Chromosome', 'Strand', 'Start', 'End']] = df.eid.str.rsplit('_', n=3, expand=True)
    df = df[['Chromosome', 'Strand', 'Start',
             'End', 'novelty', 'eid', 'pop_spec', 'pop_spec_pop']]

    return df

Unnamed: 0,eid,novelty,ITU5,ITU4,ITU3,ITU2,ITU1,PEL6,PEL5,PEL4,...,YRI1,CEU5,CEU4,CEU3,CEU2,CEU1,MPC4,MPC3,MPC2,MPC1
0,GL000008.2_+_135133_135173,Known,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,GL000008.2_+_155429_155531,Known,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,GL000008.2_+_173515_173643,Known,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,GL000008.2_+_83859_84014,Known,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,GL000008.2_+_83926_84014,Known,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


Unnamed: 0_level_0,Unnamed: 1_level_0,ITU,PEL,HAC,AJI,LWK,YRI,CEU,MPC,n_pop,pop_spec,pop_spec_pop
eid,novelty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GL000221.1_-_79227_79247,Known,0,0,0,0,0,2,0,0,1,True,YRI
chr10_+_102396448_102396823,Known,0,0,0,2,0,0,0,0,1,True,AJI
chr10_+_102470664_102471117,Known,0,0,0,2,0,0,0,0,1,True,AJI
chr10_+_102472853_102472950,Known,0,0,0,2,0,0,0,0,1,True,AJI
chr10_+_102473569_102473741,Known,0,0,0,2,0,0,0,0,1,True,AJI
...,...,...,...,...,...,...,...,...,...,...,...,...
chrX_-_73822070_73822216,Known,0,0,0,0,0,3,0,0,1,True,YRI
chrX_-_73822070_73822233,Known,0,0,0,0,0,2,0,0,1,True,YRI
chrY_+_3102084_3102194,Known,0,0,2,0,0,0,0,0,1,True,HAC
chrY_+_57208842_57209354,Known,0,0,0,2,0,0,0,0,1,True,AJI


169847