## Goal: Get novel exonic regions by doing a reverse (inverted) intersect of the novel exons against the known exons

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot
import math


# add the parent of the current working directory to the import path;
# the `scripts.*` imports that follow are presumably resolved from there
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [30]:
# load the project configuration; `load_config` is not defined in this
# notebook, so it must come from one of the star imports
# NOTE(review): confirm which module provides it
config = load_config()
# prefix that `proc_cfg` puts in front of paths taken from the config
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path string taken from the config.

    Every occurrence of '../../' is removed from `entry`, and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string as stored in the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        str: `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

In [31]:
# read the tab-separated exon table whose path is stored in the config
df = pd.read_csv(proc_cfg(config['lr']['exon_info'], od), sep='\t')

# exon ids have the form <chrom>_<strand>_<start>_<end>; split from the right
# with at most 3 splits so that contig names which themselves contain '_'
# (e.g. chr1_KI270706v1_random) stay in one piece instead of yielding extra
# columns and breaking the 4-column assignment
df[['Chromosome', 'Strand', 'Start', 'End']] = df.eid.str.rsplit('_', n=3, expand=True)

# drop the transcript column and collapse rows that are now identical
# (presumably the same exon listed once per transcript)
df = df.drop('transcript_id', axis=1)
df = df.drop_duplicates()

# split the exons on their novelty label
known_df = df.loc[df.novelty=='Known']
nov_df = df.loc[df.novelty!='Known']

# wrap both tables in PyRanges for the interval operations below
known_df = pr.PyRanges(known_df)
nov_df = pr.PyRanges(nov_df)

In [38]:
# Merge the known exons (strand-aware) before comparing, so that we don't
# end up with duplicated entries when we perform the overlap.
# The merged ranges are converted to a DataFrame and wrapped in a fresh
# PyRanges object.
known_df = pr.PyRanges(known_df.merge(strand=True).df)
known_df.head()

Unnamed: 0,Chromosome,Start,End,Strand
0,GL000008.2,83859,84145,+
1,GL000008.2,85442,85477,+
2,GL000008.2,85566,85625,+
3,GL000008.2,88635,88695,+
4,GL000008.2,135133,135173,+
5,GL000008.2,155429,155531,+
6,GL000008.2,173515,173643,+
7,GL000008.2,163784,163998,-


In [39]:
# inspect the Strand column of the merged known exons
known_df.Strand

0         +
1         +
2         +
3         +
4         +
         ..
141784    -
141785    -
141786    -
141787    -
141788    -
Name: Strand, Length: 141789, dtype: category
Categories (3, object): ['.' < '-' < '+']

In [40]:
# inspect the Strand column of the novel exons
nov_df.Strand

0        -
1        -
2        -
3        -
4        -
        ..
10366    +
10367    +
10368    +
10369    +
10370    -
Name: Strand, Length: 10371, dtype: category
Categories (3, object): ['.' < '-' < '+']

In [46]:
# Reverse overlap of the novel exons with respect to the known exons:
# `invert=True` asks intersect for the novel intervals that have no
# same-strand overlap with a known exon.
#
# Alternative kept for reference -- a left join that reports the overlap
# with each known exon:
# df = nov_df.join(known_df,
#                  how='left',
#                  strandedness='same',
#                  report_overlap=True)
# df.head()
df = nov_df.intersect(known_df, strandedness='same', invert=True)

In [47]:
# pull the plain DataFrame out of the PyRanges result, then peek at the
# exons labelled Novel 5'/3'
df = df.df
df.loc[df['novelty'].eq("Novel 5'/3'")].head()

Unnamed: 0,eid,novelty,Chromosome,Strand,Start,End
48,chr1_+_155738156_155738297,Novel 5'/3',chr1,+,155738156,155738297
277,chr1_-_28603527_28603574,Novel 5'/3',chr1,-,28603527,28603574
363,chr1_-_38864631_38864714,Novel 5'/3',chr1,-,38864631,38864714
371,chr1_-_202001010_202001308,Novel 5'/3',chr1,-,202001010,202001308
435,chr2_+_63589911_63590084,Novel 5'/3',chr2,+,63589911,63590084


In [44]:
# interval length of each row: End minus Start
df['len'] = df.End - df.Start

In [45]:
# show the rows whose eid occurs more than once, ordered by eid
(df[df.duplicated(subset='eid', keep=False)]
   .sort_values(by='eid')
   .head())

Unnamed: 0,eid,novelty,Chromosome,Strand,Start,End,Start_b,End_b,Strand_b,Overlap,len
5517,chr10_+_102157018_102157328,Novel 5'/3',chr10,+,102157018,102157328,102157018,102157074,+,56,310
5518,chr10_+_102157018_102157328,Novel 5'/3',chr10,+,102157018,102157328,102157185,102157328,+,143,310
5498,chr10_+_12215739_12217467,Novel 5'/3',chr10,+,12215739,12217467,12217360,12217467,+,107,1728
5497,chr10_+_12215739_12217467,Novel 5'/3',chr10,+,12215739,12217467,12215739,12215835,+,96,1728
5521,chr10_+_60785663_60788230,Novel 5'/3',chr10,+,60785663,60788230,60785663,60785787,+,124,2567
