In [7]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [8]:
# Project configuration; load_config is provided by one of the star-imported
# scripts modules above.
config = load_config()
# Path prefix handed to proc_cfg() together with each config entry.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path taken from the config.

    Every occurrence of '../../' is removed from `entry` and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string from the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

## Get the distribution of distances between novel SSs and annotated SSs

Hopefully the novel splice sites (SSs) are generally close to annotated ones.


In [9]:
# Sample metadata. Only samples whose name has no '_' are kept; the explicit
# .copy() makes the column assignment below act on an independent frame
# instead of a filtered selection (avoids pandas' SettingWithCopyWarning).
df = load_meta()
df = df.loc[~df['sample'].str.contains('_')].copy()

# label built from lab number, lab sample id and cell line id
df['lab_sample'] = df['lab_number_sample'].astype(str)+'_'+\
                      df['lab_sampleid'].astype(str)+'_'+\
                      df['cell_line_id'].astype(str)

# one row per unique combination of the columns used downstream
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID',
          'map_reads_assemblymap','lab_sample', 'population']].drop_duplicates().copy()

# headerless, single-column list of cell line ids
temp_df = pd.read_csv('../snakemake/transcript_discovery_personal/cell_line_ids.txt', header=None, names=['cell_line_id'])
ids_1000g = temp_df.cell_line_id.tolist()

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# where the cell line id is not in the list, use the hapmap DNA id instead
inds = df.loc[~df.cell_line_id_1000g.isin(ids_1000g)].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(ids_1000g)]
assert len(df.index) == 30, f'expected 30 samples, found {len(df.index)}'

# TODO bad sample that hasn't finished on espresso
# bad_samples = ['NA19328']
# df = df.loc[~df.cell_line_id_1000g.isin(bad_samples)]

hap = ['hap1', 'hap2']

meta_df = df.copy(deep=True)

In [10]:
# Splice junction (SJ) summary table referenced by the config
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'],od)
df = pd.read_csv(file)
df = df.drop('Unnamed: 0', axis=1)
df['detected'] = True

# 1.5 only novel sss
# explicit copy: columns are added to this subset below, and assigning into a
# filtered selection triggers pandas' SettingWithCopyWarning
temp = df.loc[(df.start_site_novelty=='novel')|(df.end_site_category=='novel')].copy()

# only keep unique novel sjs
# sj_id is parsed as <Chromosome>_<Strand>_<Start>_<End>; splitting from the
# right keeps chromosome / contig names that themselves contain '_' intact
temp[['Chromosome', 'Strand', 'Start', 'End']] = temp.sj_id.str.rsplit('_', n=3, expand=True)
temp = temp[['Chromosome', 'Strand', 'Start', 'End', 'sj_id',
             'start_site_novelty', 'end_site_category', 'cell_line_id']].drop_duplicates()

temp.Start = temp.Start.astype(int)
temp.End = temp.End.astype(int)
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3'
# each SJ becomes two rows: one for its Start coordinate, one for its End
temp = temp.melt(id_vars=['Chromosome', 'Strand', 'sj_id',
                          'start_site_novelty', 'end_site_category', 'cell_line_id'],
                 value_vars=['Start', 'End'])
temp['sj_loc'] = ''
temp.loc[temp.variable=='Start', 'sj_loc'] = 'start'
temp.loc[temp.variable=='End', 'sj_loc'] = 'end'

# novelty of the splice site this row represents
temp['novelty_category'] = ''
temp.loc[temp.variable=='Start', 'novelty_category'] = temp.loc[temp.variable=='Start', 'start_site_novelty']
temp.loc[temp.variable=='End', 'novelty_category'] = temp.loc[temp.variable=='End', 'end_site_category']

# only get the novel ones now
temp = temp.loc[temp.novelty_category=='novel']

temp = temp.rename({'value':'Start'}, axis=1)

# Interval around each splice site (offsets marked as verified by the author):
#   'start' rows -> [coord-2, coord-1]
#   'end' rows   -> [coord,   coord+1]
temp['Start'] = temp.Start-2
temp['End'] = temp.Start+1

# 'start' rows need no further adjustment; 'end' rows are shifted by +2
temp.loc[temp.sj_loc=='end', 'End'] = temp.loc[temp.sj_loc=='end', 'End']+2
temp.loc[temp.sj_loc=='end', 'Start'] = temp.loc[temp.sj_loc=='end', 'Start']+2

In [11]:
# PyRanges object built from an independent deep copy of the novel-SS table
nov_ss_df = pr.PyRanges(temp.copy(deep=True))

In [14]:
# Keep only the interval columns before exporting to BED.
# The selector is 'Start' with no leading space; ' Start' names no column.
nov_ss_df = nov_ss_df[['Chromosome', 'Strand', 'Start', 'End']]
nov_ss_df.to_bed('person_hap_nov_ss.bed')


## Get annotated (v47) SSs

In [None]:
# Annotated (v47) splice sites, derived from the introns of the reference GTF
gtf_file = proc_cfg(config['ref']['gtf'], od)

# reference GTF -> introns -> plain DataFrame
temp = pr.read_gtf(gtf_file).features.introns().df

# integer coordinates; no intron may start after it ends
temp = temp.astype({'Start': int, 'End': int})
assert not (temp.Start > temp.End).any()

# melt to 5' and 3': one row per intron boundary
temp = pd.melt(temp,
               id_vars=['Chromosome', 'Strand'],
               value_vars=['Start', 'End'])
temp = temp.assign(sj_loc=lambda d: np.where(d['variable'] == 'Start', 'start', 'end'))
temp = temp.rename(columns={'value': 'Start'})

# Interval around each boundary:
#   'start' rows -> [coord-1, coord]
#   'end' rows   -> [coord,   coord+1]
# TODO these offsets still need to be verified
temp['Start'] = temp['Start'] - np.where(temp['sj_loc'] == 'start', 1, 0)
temp['End'] = temp['Start'] + 1

temp = pr.PyRanges(temp)



In [79]:
# Write the PyRanges currently held in `temp` to a BED file.
temp.to_bed('test_v47_ss.bed')


## Try specifically subsetting on the SJs that are exclusively found in hap1/hap2?

In [15]:
# Reload the SJ summary and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV)
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'],od)
df = pd.read_csv(file).drop(columns='Unnamed: 0')

In [16]:
# dedupe w/ sqanti genome cause no one cares
df = df.drop(['sqanti_genome', 'splice_motif', 'canonical'], axis=1)
print(len(df.index))
df = df.drop_duplicates()
print(len(df.index))

24934518
8311506


In [17]:
# # try just w/ one cell line for now
# df = df.loc[df.cell_line_id=='HG03732']
# df_back = df.copy(deep=True)

In [18]:
# df = df_back.copy(deep=True)
# df.head()

In [19]:
# 1. keep SJs whose start site and/or end site is novel
has_novel_ss = df.start_site_novelty.eq('novel') | df.end_site_category.eq('novel')
df = df.loc[has_novel_ss]

In [20]:
# melt to 5' and 3'
# Work on an explicit copy: df arrives here as a filtered selection, and
# adding columns to such a selection triggers pandas' SettingWithCopyWarning.
df = df.copy()

# sj_id is parsed as <Chromosome>_<Strand>_<Start>_<End>; splitting from the
# right keeps chromosome / contig names that themselves contain '_' intact
df[['Chromosome', 'Strand', 'Start', 'End']] = df.sj_id.str.rsplit('_', n=3, expand=True)

# each SJ becomes two rows: one for its Start coordinate, one for its End
df = df.melt(id_vars=['Chromosome', 'Strand', 'map_genome',
                      'start_site_novelty', 'end_site_category', 'cell_line_id'],
                 value_vars=['Start', 'End'])
df['sj_loc'] = ''
df['detected'] = True

df.loc[df.variable=='Start', 'sj_loc'] = 'start'
df.loc[df.variable=='End', 'sj_loc'] = 'end'

# novelty of the splice site this row represents
df['novelty_category'] = ''
df.loc[df.variable=='Start', 'novelty_category'] = df.loc[df.variable=='Start', 'start_site_novelty']
df.loc[df.variable=='End', 'novelty_category'] = df.loc[df.variable=='End', 'end_site_category']

# only get the novel ones now
df = df.loc[df.novelty_category=='novel']

df = df.rename({'value':'Start'}, axis=1)

# create id for each ss: ss_<sj_loc>_<Chromosome>_<Strand>_<Start>
df['ss_id'] = 'ss_'+\
                df.sj_loc+'_'+\
                df.Chromosome+'_'+\
                df.Strand+'_'+\
                df.Start.astype(str)

df = df[['ss_id', 'cell_line_id', 'map_genome', 'detected']]

# check for sss that originated from different sjs
# (the assertion requires at least one ss_id to occur more than once)
assert len(df.loc[df.ss_id.duplicated()]) > 0

# then dedupe and pivot: one row per (ss_id, cell_line_id), one column per
# map_genome value, holding the `detected` flag
df = df.drop_duplicates()
df = df.pivot(index=['ss_id', 'cell_line_id'],
                columns=['map_genome'],
                values=['detected'])

# flatten: keep only the map_genome level of the column MultiIndex
df.columns = df.columns.get_level_values(1)
df.columns.name = None

# reset index to make it a flat DataFrame
df = df.reset_index()

# fill missing values with False (combination not detected)
df = df.fillna(False)

In [21]:
# 1. keep splice sites detected on hap1 and/or hap2 but not on hg38
absent_hg38 = df.hg38 == False
on_haplotype = (df.hap1 + df.hap2) > 0
df = df.loc[absent_hg38 & on_haplotype]

In [22]:
# Recover sj_loc / Chromosome / Strand / Start from ss_id, which has the form
# ss_<sj_loc>_<Chromosome>_<Strand>_<Start>.
# Work on an explicit copy so the new columns are not written into a filtered
# selection (avoids pandas' SettingWithCopyWarning).
df = df.copy()

# Peel 'ss' and sj_loc off the left, Strand and Start off the right; whatever
# remains in the middle is the chromosome, even if its name contains '_'.
ss_prefix = df.ss_id.str.split('_', n=2, expand=True)      # ['ss', sj_loc, rest]
ss_rest = ss_prefix[2].str.rsplit('_', n=2, expand=True)   # [Chromosome, Strand, Start]
df['sj_loc'] = ss_prefix[1]
df[['Chromosome', 'Strand', 'Start']] = ss_rest

Unnamed: 0,ss_id,cell_line_id,hap1,hap2,hg38,sj_loc,Chromosome,Strand,Start
92,ss_end_KI270726.1_+_26229,HG02261,True,False,False,end,KI270726.1,+,26229
128,ss_end_chr10_+_101365093,NA12778,True,False,False,end,chr10,+,101365093
129,ss_end_chr10_+_101374347,HG03729,True,False,False,end,chr10,+,101374347
130,ss_end_chr10_+_101374347,HG03732,False,True,False,end,chr10,+,101374347
131,ss_end_chr10_+_101374347,NA12778,True,False,False,end,chr10,+,101374347


In [23]:
# need to verify that this is working using like one motif or something make
# sure I don't have off-by-one errors
# verified
df['Start'] = df.Start.astype(int)-2
df['End'] = df.Start.astype(int)+1

# verified
df.loc[df.sj_loc=='start', 'Start'] = df.loc[df.sj_loc=='start', 'Start']-0
df.loc[df.sj_loc=='start', 'End'] = df.loc[df.sj_loc=='start', 'End']+0


df.loc[df.sj_loc=='end', 'End'] = df.loc[df.sj_loc=='end', 'End']+2
df.loc[df.sj_loc=='end', 'Start'] = df.loc[df.sj_loc=='end', 'Start']+2

nov_ss_df = df.copy(deep=True)
nov_ss_df.head()

Unnamed: 0,ss_id,cell_line_id,hap1,hap2,hg38,sj_loc,Chromosome,Strand,Start,End
92,ss_end_KI270726.1_+_26229,HG02261,True,False,False,end,KI270726.1,+,26229,26230
128,ss_end_chr10_+_101365093,NA12778,True,False,False,end,chr10,+,101365093,101365094
129,ss_end_chr10_+_101374347,HG03729,True,False,False,end,chr10,+,101374347,101374348
130,ss_end_chr10_+_101374347,HG03732,False,True,False,end,chr10,+,101374347,101374348
131,ss_end_chr10_+_101374347,NA12778,True,False,False,end,chr10,+,101374347,101374348


In [24]:
# Convert the splice-site table to a PyRanges object and write it out as BED.
nov_ss_df = pr.PyRanges(nov_ss_df)
nov_ss_df.to_bed('person_hap_nov_ss_hg38_absent.bed')

## Ok now try w/ PODER

In [355]:
# Tab-separated SJ table referenced by config['lr']['sjs']
file = proc_cfg(config['lr']['sjs'],od)
df = pd.read_csv(file, sep='\t')
# use the same column name as the personal-assembly SJ summary
df = df.rename({'start_site_category':'start_site_novelty'}, axis=1)
df['detected'] = True

# SJ identifier: <chrom>_<strand>_<start>_<end>
df['sj_id'] = df['chrom'].astype(str) + '_' +\
              df['strand'].astype(str) + '_' +\
              df['genomic_start_coord'].astype(str) + '_' +\
              df['genomic_end_coord'].astype(str)

# 1.5 only novel sss
# explicit copy: columns are added to this subset below, and assigning into a
# filtered selection triggers pandas' SettingWithCopyWarning
temp = df.loc[(df.start_site_novelty=='novel')|(df.end_site_category=='novel')].copy()

# only keep unique novel sjs
# split from the right so chromosome / contig names containing '_' stay intact
temp[['Chromosome', 'Strand', 'Start', 'End']] = temp.sj_id.str.rsplit('_', n=3, expand=True)
print(len(temp.index))
temp = temp[['Chromosome', 'Strand', 'Start', 'End', 'sj_id',
             'start_site_novelty', 'end_site_category']].drop_duplicates()
print(len(temp.index))

temp.Start = temp.Start.astype(int)
temp.End = temp.End.astype(int)
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3'
# each SJ becomes two rows: one for its Start coordinate, one for its End
temp = temp.melt(id_vars=['Chromosome', 'Strand', 'sj_id',
                          'start_site_novelty', 'end_site_category'],
                 value_vars=['Start', 'End'])
temp['sj_loc'] = ''
temp.loc[temp.variable=='Start', 'sj_loc'] = 'start'
temp.loc[temp.variable=='End', 'sj_loc'] = 'end'

# novelty of the splice site this row represents
temp['novelty_category'] = ''
temp.loc[temp.variable=='Start', 'novelty_category'] = temp.loc[temp.variable=='Start', 'start_site_novelty']
temp.loc[temp.variable=='End', 'novelty_category'] = temp.loc[temp.variable=='End', 'end_site_category']

# only get the novel ones now
temp = temp.loc[temp.novelty_category=='novel']

temp = temp.rename({'value':'Start'}, axis=1)

# Interval around each splice site (offsets marked as verified by the author):
#   'start' rows -> [coord-2, coord-1]
#   'end' rows   -> [coord,   coord+1]
temp['Start'] = temp.Start-2
temp['End'] = temp.Start+1

# 'start' rows need no further adjustment; 'end' rows are shifted by +2
temp.loc[temp.sj_loc=='end', 'End'] = temp.loc[temp.sj_loc=='end', 'End']+2
temp.loc[temp.sj_loc=='end', 'Start'] = temp.loc[temp.sj_loc=='end', 'Start']+2

nov_ss_df = temp.copy(deep=True)
nov_ss_df = pr.PyRanges(nov_ss_df)

28544
23244


In [373]:
# Write the PyRanges currently held in `nov_ss_df` to a BED file.
nov_ss_df.to_bed('poder_nov_ss.bed')