In [6]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *
from scripts.sm_utils import *

In [7]:
# Path prefix pointing two directory levels above the notebook's working
# directory; it is handed to proc_cfg when resolving config paths.
od = '../../'

# Project configuration, loaded through the star-imported project helpers.
config = load_config()

def proc_cfg(entry, od):
    """
    Re-root a config path onto a different prefix.

    Every occurrence of the literal '../../' is removed from `entry`,
    and `od` is then prepended by plain string concatenation (no path
    separator is inserted).

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        str: `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

# Goal: get all exons that are detected in one population that aren't in the other

In [8]:
# Table of population contrasts: one '<pop1>_<pop2>' label per line of
# fst_files.txt (pandas is already imported in the first cell).
c_df = pd.read_csv('fst_files.txt', header=None, names=['contrast'])

# Split each label on the first underscore only, so the result always has
# at most two columns even if a label contains further underscores.
c_df[['pop1', 'pop2']] = c_df['contrast'].str.split('_', n=1, expand=True)
c_df.head()

Unnamed: 0,contrast,pop1,pop2
0,HAC_YRI,HAC,YRI
1,HAC_PEL,HAC,PEL
2,HAC_LWK,HAC,LWK
3,HAC_ITU,HAC,ITU
4,HAC_CEU,HAC,CEU


In [9]:
# The pair of populations compared in the cells below.
pop1, pop2 = 'HAC', 'YRI'

In [10]:
# Load the precomputed exon / transcript / novelty table; its location
# comes from the config entry lr -> exon_info, re-rooted with proc_cfg.
exon_info_path = proc_cfg(config['lr']['exon_info'], od)
df = pd.read_csv(exon_info_path, sep='\t')

In [11]:
# Sample metadata restricted to merged-run-mode, non-mixed samples.
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]
meta = meta.loc[meta.mixed_samples == False]

# Columns to keep from the master table: the second value returned by
# get_population_colors() (presumably the list of population names -- confirm
# against scripts.utils).
_, pops = get_population_colors()

# Per-transcript detection table: one row per isoform, one boolean column
# per population that is True where the master-table value is > 0.
mt_df = pd.read_csv('../../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')
mt_df = (
    mt_df[['isoform'] + pops]
    .rename(columns={'isoform': 'transcript_id'})
    .set_index('transcript_id')  # keep the id out of the >0 comparison
    .gt(0)
    .reset_index()
)

In [12]:
# Attach the per-population detection flags to every exon row via its
# transcript id; exon rows are all kept (left join).
df = pd.merge(df, mt_df, how='left', on='transcript_id')

In [13]:
# Collapse to one row per (eid, novelty): drop the transcript id, then take
# the per-group maximum of every remaining column, so a population column
# ends up True when any row of that exon had it True.
exon_groups = df.drop(columns='transcript_id').groupby(['eid', 'novelty'])
df = exon_groups.max().reset_index()

In [18]:
# Restrict the table to the pop1 / pop2 comparison and keep only exons that
# are flagged in at least one of the two populations.
pair_cols = [pop1, pop2]
df = df[['eid', 'novelty'] + pair_cols]

# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not raise pandas' SettingWithCopyWarning.
df = df.loc[df[pair_cols].sum(axis=1) >= 1].copy()

# Sanity check: no exon may be absent from both populations after filtering.
assert len(df.loc[(df[pop1] == False) & (df[pop2] == False)]) == 0, \
    'found exons detected in neither population after filtering'

In [19]:
# Preview the first five rows of the filtered exon table.
df.head(5)


Unnamed: 0,eid,novelty,HAC,YRI
5,GL000008.2_+_83926_84145,Known,False,True
8,GL000008.2_+_85566_85625,Known,True,True
9,GL000008.2_+_88635_88695,Known,False,True
10,GL000008.2_-_163784_163998,Known,True,True
11,GL000008.2_-_164584_164884,Known,False,True


In [21]:
# Parse genomic coordinates out of the exon id, which has the form
# <chromosome>_<strand>_<start>_<end>. Splitting from the right with n=3
# keeps a chromosome / scaffold name intact even if it contains underscores.
coords = df['eid'].str.rsplit('_', n=3, expand=True)
coords.columns = ['Chromosome', 'Strand', 'Start', 'End']

# Build a new frame instead of assigning columns into a filtered slice, and
# make Start / End integers rather than the strings produced by the split.
exon_df = df.assign(
    Chromosome=coords['Chromosome'],
    Strand=coords['Strand'],
    Start=coords['Start'].astype(int),
    End=coords['End'].astype(int),
)

# NOTE: `df` is rebound from a DataFrame to a PyRanges object here, so this
# cell cannot be re-run without re-running the cells above it.
df = pr.PyRanges(exon_df)
df.to_bed('test.bed')

## novelty and tid info

In [None]:
# df = get_internal_exon_info(proc_cfg(config['lr']['gtf_filt_with_genes'], od),
#                             proc_cfg(config['ref']['gtf'], od))

## population detection parsing

In [4]:
# Flag transcripts with population_sharing == 1 and sample_sharing > 1
# (presumably: seen in a single population but in more than one sample --
# confirm the column semantics), then report how many distinct isoforms
# carry the flag.
# NOTE: this rebinds mt_df to a different master-table file than the one
# loaded earlier in the notebook.
mt_df = pd.read_csv('../../data/05_mastertable/29102024_PODER_mastertable.tsv', sep='\t')
single_population = mt_df.population_sharing.eq(1)
multiple_samples = mt_df.sample_sharing.gt(1)
mt_df['pop_spec_t'] = single_population & multiple_samples
n_pop_spec_isoforms = mt_df.loc[mt_df.pop_spec_t, 'isoform'].nunique(dropna=False)
print(n_pop_spec_isoforms)

2267


In [13]:
# Isoform id plus the eight per-population columns of the master table.
population_cols = ['AJI', 'CEU', 'HAC', 'ITU', 'LWK', 'MPC', 'PEL', 'YRI']
t_df = mt_df[['isoform'] + population_cols]
t_df.head()

Unnamed: 0,isoform,AJI,CEU,HAC,ITU,LWK,MPC,PEL,YRI
0,ENST00000741425.1,1,0,0,0,0,0,0,0
1,transcript_0,1,0,0,0,0,0,0,0
2,transcript_1,1,0,0,0,0,0,1,0
3,transcript_10,1,1,0,1,0,0,0,0
4,transcript_11,2,1,0,1,0,0,2,0


Unnamed: 0.1,Unnamed: 0,Chromosome,Strand,Start,End,eid,Strand_gc,Start_gc,End_gc,eid_gc,Overlap
0,0,GL000008.2,+,83926,84014,GL000008.2_+_83926_84014_,+,83859,84014,GL000008.2_+_83859_84014_,88
1,1,GL000008.2,+,83926,84014,GL000008.2_+_83926_84014_,+,83926,84145,GL000008.2_+_83926_84145_,88
2,2,GL000008.2,+,83926,84014,GL000008.2_+_83926_84014_,+,83926,84014,GL000008.2_+_83926_84014_,88
3,3,GL000008.2,+,83926,84014,GL000008.2_+_83926_84014_,+,83952,84145,GL000008.2_+_83952_84145_,62
4,4,GL000008.2,+,85456,85477,GL000008.2_+_85456_85477_,+,85442,85477,GL000008.2_+_85442_85477_,21
