In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Append the parent of the current working directory to sys.path;
# presumably this is what makes the `scripts` package imported below
# resolvable when the notebook is run from its own folder.
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Project configuration; load_config is not defined in this notebook, so it
# presumably comes from one of the `scripts.*` star-imports.
config = load_config()
# Path prefix that proc_cfg prepends to every config entry.
od = '../'

def proc_cfg(entry, od):
    """Re-root a config path under ``od``.

    Every occurrence of ``'../../'`` is removed from ``entry`` and the
    result is appended to the prefix ``od``.

    Parameters
    ----------
    entry : str
        Path string as stored in the config.
    od : str
        Prefix to put in front of the cleaned path.

    Returns
    -------
    str
        ``od`` followed by ``entry`` with all ``'../../'`` removed.
    """
    return od + entry.replace('../../', '')

## We want to get the distribution of distances between novel splice sites (SSs) and annotated SSs.

Hopefully the novel SSs are generally close to annotated ones.


In [28]:
# Build the per-sample metadata table (`meta_df`) restricted to samples that
# have a 1000 Genomes cell line ID.
df = load_meta()

# Drop samples whose name contains '_'. The .copy() makes the filtered frame
# independent, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
df = df.loc[~df['sample'].str.contains('_')].copy()
df['lab_sample'] = (df['lab_number_sample'].astype(str) + '_'
                    + df['lab_sampleid'].astype(str) + '_'
                    + df['cell_line_id'].astype(str))
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID',
         'map_reads_assemblymap', 'lab_sample', 'population']].drop_duplicates()

temp_df = pd.read_csv('../snakemake/transcript_discovery_personal/cell_line_ids.txt', header=None, names=['cell_line_id'])
# IDs listed in cell_line_ids.txt, computed once and reused below
ids_1000g = temp_df.cell_line_id.tolist()

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# where cell_line_id is not in the list, fall back to hapmap_DNA_ID
inds = df.loc[~df.cell_line_id_1000g.isin(ids_1000g)].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(ids_1000g)]
assert len(df.index) == 30, f'expected 30 samples in 1000g, got {len(df.index)}'

# TODO bad sample that hasn't finished on espresso
# bad_samples = ['NA19328']
# df = df.loc[~df.cell_line_id_1000g.isin(bad_samples)]

hap = ['hap1', 'hap2']

meta_df = df.copy(deep=True)

In [70]:
# Build one row per novel splice site (SS) from the SQANTI splice-junction
# summary, with Start/End describing a small window at each site.
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'],od)
df = pd.read_csv(file).drop('Unnamed: 0', axis=1)
df['detected'] = True

# 1.5 only SJs with at least one novel ss
# (.copy() so the column assignments below act on an independent frame
# instead of a slice of df, which would raise SettingWithCopyWarning)
temp = df.loc[(df.start_site_novelty=='novel')|(df.end_site_category=='novel')].copy()

# only keep unique novel sjs
# sj_id looks like <chrom>_<strand>_<start>_<end>; split from the right so
# that contig names which themselves contain '_' are kept in one piece
temp[['Chromosome', 'Strand', 'Start', 'End']] = temp.sj_id.str.rsplit('_', n=3, expand=True)
temp = (temp[['Chromosome', 'Strand', 'Start', 'End', 'sj_id',
              'start_site_novelty', 'end_site_category', 'cell_line_id']]
        .drop_duplicates()
        .astype({'Start': int, 'End': int}))
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3': one row per SJ boundary, coordinate in `value`
temp = temp.melt(id_vars=['Chromosome', 'Strand', 'sj_id',
                          'start_site_novelty', 'end_site_category', 'cell_line_id'],
                 value_vars=['Start', 'End'])
temp['sj_loc'] = temp.variable.map({'Start': 'start', 'End': 'end'})

# novelty of the boundary this row represents
temp['novelty_category'] = np.where(temp.variable=='Start',
                                    temp.start_site_novelty,
                                    temp.end_site_category)

# only get the novel ones now
temp = temp.loc[temp.novelty_category=='novel'].copy()

temp = temp.rename({'value':'Start'}, axis=1)

# Offsets were verified by the original author against a splice motif:
#   'start' boundary at v -> Start = v-2, End = v-1
#   'end'   boundary at v -> Start = v,   End = v+1
# (same numbers as the previous "-2 / +1, then +2 for end rows" arithmetic)
is_start = temp.sj_loc=='start'
temp['Start'] = np.where(is_start, temp.Start-2, temp.Start)
temp['End'] = temp.Start+1


In [73]:
# Keep the novel splice-site table under its own name; `temp` is reassigned
# when the annotated splice sites are built further down.
nov_ss_df = temp.copy(deep=True)
nov_ss_df.head()

Unnamed: 0,Chromosome,Strand,sj_id,start_site_novelty,end_site_category,cell_line_id,variable,Start,sj_loc,novelty_category,End
0,GL000195.1,-,GL000195.1_-_49120_86517,novel,known,NA12273,Start,49118,start,novel,49119
1,chr17,-,chr17_-_1821508_1827860,novel,known,NA12273,Start,1821506,start,novel,1821507
2,chr17,-,chr17_-_1821512_1827860,novel,known,NA12273,Start,1821510,start,novel,1821511
3,chr17,-,chr17_-_1823205_1827860,novel,known,NA12273,Start,1823203,start,novel,1823204
4,chr17,-,chr17_-_1825231_1827860,novel,known,NA12273,Start,1825229,start,novel,1825230


In [62]:
# temp = pr.PyRanges(temp)
# temp.to_bed('test_novel_ss.bed')

In [None]:
# get annotated (v47) sss and write them to a BED file, using the same
# Start/End window convention as the novel splice sites
gtf_file = proc_cfg(config['ref']['gtf'], od)

# NOTE(review): this cell used to write to `output.bed`, but no `output`
# object is defined anywhere in the notebook (it looks like a leftover from a
# Snakemake rule), so the cell ended in a NameError. Confirm the destination.
annot_ss_bed = 'annot_ss.bed'

# read in ref. gtf and get SJs from there
temp = pr.read_gtf(gtf_file)

# get the introns
# by='transcript': pyranges defaults to gene-level introns (regions covered by
# no exon of the gene), whose boundaries are not the annotated splice
# junctions; per-transcript introns are.
temp = temp.features.introns(by='transcript')
temp = temp.df.astype({'Start': int, 'End': int})
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3': one row per intron boundary, coordinate in `value`;
# the same site is shared by many transcripts, so keep each one once
temp = (temp.melt(id_vars=['Chromosome', 'Strand'],
                  value_vars=['Start', 'End'])
        .drop_duplicates()
        .reset_index(drop=True))
temp['sj_loc'] = temp.variable.map({'Start': 'start', 'End': 'end'})

temp = temp.rename({'value':'Start'}, axis=1)

# need to verify (original author's note: these offsets were copied from the
# novel-ss cell and have not been checked for pyranges intron coordinates)
#   'start' boundary at v -> Start = v-2, End = v-1
#   'end'   boundary at v -> Start = v,   End = v+1
is_start = temp.sj_loc=='start'
temp['Start'] = np.where(is_start, temp.Start-2, temp.Start)
temp['End'] = temp.Start+1

temp = pr.PyRanges(temp)

temp.to_bed(annot_ss_bed)
