## Goal: look for exonic SNPs close to the splice junctions (SJs; ±10 bp)

In [165]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [166]:
# Load the project configuration.
# NOTE(review): load_config is not defined in this notebook; presumably it is
# provided by one of the scripts.* star imports — confirm.
config = load_config()
# Path prefix that is passed to proc_cfg and prepended to config entries.
od = '../'

def proc_cfg(entry, od):
    """Re-root a config path under ``od``.

    Every occurrence of ``'../../'`` is removed from ``entry`` and the
    result is appended to ``od``.

    Parameters:
        entry: path string taken from the config.
        od: prefix string to put in front of the cleaned path.

    Returns:
        The concatenation ``od + cleaned_entry``.
    """
    cleaned = entry.replace('../../', '')
    return od + cleaned

In [167]:
# Build the per-cell-line metadata table (meta_df), restricted to the cell
# lines listed in cell_line_ids.txt.
df = load_meta()

# Drop samples whose name contains an underscore. The .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the original (avoids SettingWithCopyWarning).
df = df.loc[~df['sample'].str.contains('_')].copy()
df['lab_sample'] = (df['lab_number_sample'].astype(str) + '_'
                    + df['lab_sampleid'].astype(str) + '_'
                    + df['cell_line_id'].astype(str))
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID',
          'map_reads_assemblymap','lab_sample', 'population']].drop_duplicates()

temp_df = pd.read_csv('../snakemake/transcript_discovery_personal/cell_line_ids.txt', header=None, names=['cell_line_id'])
# ids from cell_line_ids.txt, built once and reused for both membership tests
ids_1000g = temp_df.cell_line_id.tolist()

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# where cell_line_id is not in the id list, fall back to hapmap_DNA_ID
inds = df.loc[~df.cell_line_id_1000g.isin(ids_1000g)].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(ids_1000g)]
assert len(df.index) == 30

# TODO bad sample that hasn't finished on espresso
# bad_samples = ['NA19328']
# df = df.loc[~df.cell_line_id_1000g.isin(bad_samples)]

hap = ['hap1', 'hap2']

# keep an independent copy; df is reassigned by later cells
meta_df = df.copy(deep=True)

## We want to know what % of the 1.5 NOVEL SJs that we (1) discover only in the personal haplotypes either (2) have noncanonical splice motifs in hg38 or (3) have SNPs within ±10 bp of the splice junction

In [169]:
# Read the SJ summary table from the configured path, discard the unnamed
# first column, and mark every row as detected.
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'], od)
df = pd.read_csv(file).drop(columns='Unnamed: 0')
df['detected'] = True

In [170]:
# 1.5 only novel sjs
# keep only the rows whose junction_novelty is 'novel'
temp = df.loc[df.junction_novelty=='novel']

In [171]:
# get only the detection from mapping
# (long format: one row per sj_id / cell_line_id / map_genome, detected=True)
temp = temp[['sj_id', 'cell_line_id', 'map_genome', 'detected']]
temp = temp.drop_duplicates()
# go wide: one row per (sj_id, cell_line_id), one column per map_genome value
temp = temp.pivot(index=['sj_id', 'cell_line_id'], 
                columns=['map_genome'],
                values=['detected'])

# flatten
# values=['detected'] is a list, so the pivot produces two-level columns;
# level 1 carries the map_genome values
temp.columns = temp.columns.get_level_values(1)
temp.columns.name = None

# reset index to make it a flat DataFrame
temp = temp.reset_index()

# fill missing values with False
# (genome / cell line combinations absent from the input come out as NaN)
temp = temp.fillna(False)

In [172]:
# 1. keep SJs seen in at least one personal haplotype (hap1 / hap2)
# and not seen when mapping to hg38
absent_from_hg38 = temp.hg38 == False
seen_in_a_haplotype = (temp.hap1 + temp.hap2) > 0
temp = temp.loc[absent_from_hg38 & seen_in_a_haplotype]

In [173]:
# 2. attach the splice motif / canonical annotation from the hg38 sqanti
# rows to each remaining SJ (left join on sj_id)
motif_cols = ['sj_id', 'splice_motif', 'canonical', 'sqanti_genome']
temp2 = df[motif_cols].drop_duplicates()
temp2 = temp2[temp2.sqanti_genome == 'hg38']
# each sj_id must appear at most once, otherwise the merge would add rows
assert not temp2.sj_id.duplicated().any()
temp = pd.merge(temp, temp2, on='sj_id', how='left')

In [None]:
# 3. process snp content in each sj

## Dev for getting SJs

In [109]:
# temp_back = temp.copy(deep=True)

In [174]:
# Restore temp from the backup copy.
# NOTE(review): the only assignment to temp_back in this notebook is the
# commented-out line in the cell above, so on a fresh top-to-bottom run this
# raises NameError unless temp_back is defined elsewhere — confirm.
temp = temp_back.copy(deep=True)

In [157]:
# # check w/ just 1 genome + cell line at first so I can 
# # dl GTF and check it
# temp = temp.loc[(temp.cell_line_id=='HG03732')&\
#                 (temp.hap1==True)]

map_genome,sj_id,junction_novelty,start_site_novelty,end_site_novelty,cell_line_id,hap1,hap2,hg38
0,GL000008.2_+_155532_156720,known,known,known,HG03732,True,True,True
3,GL000008.2_+_83546_83926,known,known,known,HG03732,True,True,True
7,GL000008.2_+_83546_85566,known,known,known,HG03732,True,True,True
11,GL000008.2_+_84015_85566,known,known,known,HG03732,True,True,True
18,GL000008.2_+_85626_129984,known,known,known,HG03732,True,True,True


In [176]:
# Build one interval per splice-junction (SJ) end: a window next to the
# junction start and a window next to the junction end, keyed by sj_id.
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'],od)
df = pd.read_csv(file)
df.drop('Unnamed: 0', axis=1, inplace=True)

# remove sqanti genome and drop dupes
# the sqanti genome / sqanti metrics SHOULD be irrelevant here
df = df.drop(['sqanti_genome', 'canonical', 'splice_motif'], axis=1)
print(len(df.index))
df = df.drop_duplicates()
print(len(df.index))

# then make sure that there are no dupe. sj+sj nov+sample+map genome
temp = df.loc[df[['sj_id', 'junction_novelty',
                  'cell_line_id', 'map_genome', 'start_site_novelty',
                  'end_site_category']].duplicated(keep=False)]
assert len(temp.index) == 0
del temp

df.rename({'end_site_category': 'end_site_novelty'}, axis=1, inplace=True)

# transform to be t/f for each ic per genome
# (the per-genome flags are not used again in this cell; only sj_id and
# cell_line_id survive the column selection below)
temp = pd.crosstab(index=[df.sj_id, df.junction_novelty,
                                 df.start_site_novelty,
                                 df.end_site_novelty, df.cell_line_id],
                          columns=df.map_genome,
                          values=df.map_genome,
                          aggfunc=lambda x: True).fillna(False).reset_index()

# sj_id is <chromosome>_<strand>_<start>_<end>. Split from the right with at
# most 3 splits so that a chromosome / contig name that itself contains '_'
# still yields exactly four fields.
temp[['Chromosome', 'Strand', 'Start', 'End']] = temp.sj_id.str.rsplit('_', n=3, expand=True)
temp = temp[['Chromosome', 'Strand', 'Start', 'End',
             'cell_line_id', 'sj_id']].drop_duplicates()

temp.Start = temp.Start.astype(int)
temp.End = temp.End.astype(int)
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3'
# var_name is given explicitly: without it melt takes the name from the
# columns index, which is 'map_genome' only because the crosstab set it.
# After the melt this column holds 'Start' / 'End', not genome names.
temp = temp.melt(id_vars=['Chromosome', 'Strand', 'cell_line_id', 'sj_id'],
                 value_vars=['Start', 'End'],
                 var_name='map_genome')
temp['sj_loc'] = ''
temp.loc[temp.map_genome=='Start', 'sj_loc'] = 'start'
temp.loc[temp.map_genome=='End', 'sj_loc'] = 'end'

# 'value' is the junction coordinate (pos) of this SJ end
temp.rename({'value':'Start'}, axis=1, inplace=True)
# need to verify that this is working using like one motif or something make
# sure I don't have off-by-one errors
# verified
# End is derived from the already shifted Start: Start = pos-2, End = pos-1
temp['Start'] = temp.Start-2
temp['End'] = temp.Start+1

# widen each window away from the junction
# verified https://trello.com/c/qzMAZpAm
# net result: 'start' rows -> Start = pos-11, End = pos-1  (End-Start = 10)
#             'end' rows   -> Start = pos,    End = pos+13 (End-Start = 13)
# NOTE(review): the 'end' window is 3 wider than the 'start' window; confirm
# this is intended given the +-10 bp goal.
temp.loc[temp.sj_loc=='start', 'Start'] = temp.loc[temp.sj_loc=='start', 'Start']-9
temp.loc[temp.sj_loc=='end', 'End'] = temp.loc[temp.sj_loc=='end', 'End']+14
temp.loc[temp.sj_loc=='end', 'Start'] = temp.loc[temp.sj_loc=='end', 'Start']+2

# NOTE(review): rows were de-duplicated while cell_line_id was still present,
# so after this drop the same window is repeated once per cell line.
temp.drop(['cell_line_id', 'map_genome'], axis=1, inplace=True)

24934518
8311506


In [179]:
# spot-check the windows computed for a single junction
temp.loc[temp.sj_id=='GL000008.2_+_155532_156720']

Unnamed: 0,Chromosome,Strand,cell_line_id,sj_id,map_genome,Start,sj_loc,End
0,GL000008.2,+,HG03732,GL000008.2_+_155532_156720,Start,155521,start,155531
2786681,GL000008.2,+,HG03732,GL000008.2_+_155532_156720,End,156720,end,156733


In [181]:
# preview the first rows of the window table
temp.head()


Unnamed: 0,Chromosome,Strand,sj_id,Start,sj_loc,End
0,GL000008.2,+,GL000008.2_+_155532_156720,155521,start,155531
1,GL000008.2,+,GL000008.2_+_83546_83926,83535,start,83545
2,GL000008.2,+,GL000008.2_+_83546_83926,83535,start,83545
3,GL000008.2,+,GL000008.2_+_83546_83926,83535,start,83545
4,GL000008.2,+,GL000008.2_+_83546_83926,83535,start,83545


Unnamed: 0,Chromosome,Strand,cell_line_id,map_genome,Start,sj_loc,End
0,GL000008.2,+,HG03732,Start,155521,start,155531
1,GL000008.2,+,HG03732,Start,83535,start,83545
2,GL000008.2,+,HG03732,Start,83535,start,83545
3,GL000008.2,+,HG03732,Start,84004,start,84014
4,GL000008.2,+,HG03732,Start,85615,start,85625
5,GL000008.2,+,HG03732,Start,85615,start,85625
6,GL000008.2,+,HG03732,Start,85615,start,85625
7,GL000008.2,+,HG03732,Start,85615,start,85625


In [164]:
# temp.to_bed('test.bed')
# temp.to_bed('test_2.bed')
