## GOAL: get 2 bp BED regions for splice sites (SSs)

In [24]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [25]:
# Project configuration. NOTE(review): load_config is not defined in this
# notebook; presumably it comes from one of the star imports above -- confirm.
config = load_config()
# Prefix that proc_cfg() prepends to paths taken from the config.
od = '../'

def proc_cfg(entry, od):
    """Re-root a config path relative to `od`.

    Every occurrence of '../../' is removed from `entry`, and the result
    is appended to the prefix `od`.

    Parameters
    ----------
    entry : str
        Path string as stored in the config.
    od : str
        Prefix to put in front of the cleaned path.

    Returns
    -------
    str
        `od` followed by `entry` with all '../../' substrings stripped.
    """
    relative = entry.replace('../../', '')
    return od + relative

In [26]:
# Load the sample metadata and reduce it to one row per sample that can be
# matched to an ID listed in cell_line_ids.txt.
df = load_meta()
# .copy() so the column assignment below writes to an independent frame
# instead of a boolean-filtered slice (which can raise pandas'
# SettingWithCopyWarning).
df = df.loc[~df['sample'].str.contains('_')].copy()
df['lab_sample'] = df['lab_number_sample'].astype(str)+'_'+\
                      df['lab_sampleid'].astype(str)+'_'+\
                      df['cell_line_id'].astype(str)
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID',
          'map_reads_assemblymap','lab_sample', 'population']].drop_duplicates()

temp_df = pd.read_csv('../snakemake/transcript_discovery_personal/cell_line_ids.txt', header=None, names=['cell_line_id'])
# build the list of accepted IDs once; it is used for both membership tests
ids_1000g = temp_df.cell_line_id.tolist()

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# where the cell line id is not in the list, fall back to the hapmap DNA id
inds = df.loc[~df.cell_line_id_1000g.isin(ids_1000g)].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(ids_1000g)]
assert len(df.index) == 30

# TODO bad sample that hasn't finished on espresso
# bad_samples = ['NA19328']
# df = df.loc[~df.cell_line_id_1000g.isin(bad_samples)]

hap = ['hap1', 'hap2']

# keep an independent copy of the metadata; `df` is reassigned in later cells
meta_df = df.copy(deep=True)

In [33]:
# Build a BED file with one 2 bp window at each end of every splice junction
# (SJ) found in the SJ summary table.
file = proc_cfg(config['lr']['td_personal']['sqanti']['sj_summary'],od)

df = pd.read_csv(file)

# df = pd.read_csv(input.sj_summary)
df.drop('Unnamed: 0', axis=1, inplace=True)

# remove sqanti genome and drop dupes
# the sqanti genome / sqanti metrics SHOULD be irrelevant here
df = df.drop(['sqanti_genome', 'canonical', 'splice_motif'], axis=1)
print(len(df.index))
df = df.drop_duplicates()
print(len(df.index))

# then make sure that there are no dupe. sj+sj nov+sample+map genome
temp = df.loc[df[['sj_id', 'junction_novelty',
                  'cell_line_id', 'map_genome', 'start_site_novelty',
                  'end_site_category']].duplicated(keep=False)]
assert len(temp.index) == 0
del temp

df.rename({'end_site_category': 'end_site_novelty'}, axis=1, inplace=True)

# transform to be t/f for each ic per genome
# NOTE: only the sj_id column of this table is used below.
temp = pd.crosstab(index=[df.sj_id, df.junction_novelty,
                                 df.start_site_novelty,
                                 df.end_site_novelty, df.cell_line_id],
                          columns=df.map_genome,
                          values=df.map_genome,
                          aggfunc=lambda x: True).fillna(False).reset_index()


# sj_id is <chromosome>_<strand>_<start>_<end>. Split from the right with at
# most 3 splits so that chromosome / contig names which themselves contain
# '_' stay in one piece instead of producing extra columns.
temp[['Chromosome', 'Strand', 'Start', 'End']] = temp.sj_id.str.rsplit('_', n=3, expand=True)
temp = temp[['Chromosome', 'Strand', 'Start', 'End', 'sj_id']].drop_duplicates()

temp['Start'] = temp['Start'].astype(int)
temp['End'] = temp['End'].astype(int)
assert len(temp.loc[temp.Start>temp.End])==0

# melt to 5' and 3': one row per junction coordinate
# var_name is given explicitly so the code does not depend on the column-index
# name that the crosstab above happens to leave behind.
temp = temp.melt(id_vars=['Chromosome', 'Strand', 'sj_id'],
                 value_vars=['Start', 'End'],
                 var_name='coord')
temp['sj_loc'] = temp['coord'].map({'Start': 'start', 'End': 'end'})

temp.rename({'value':'Start'}, axis=1, inplace=True)

# Adjust for splice junction locations (coordinates checked against the
# original, verified implementation):
#   start coordinate v -> window [v-1, v+1)
#   end coordinate   v -> window [v-2, v)
# i.e. Start is shifted back by 1 (start) or 2 (end) and End is always
# Start + 2, which gives a 2 bp region in both cases.
offset = temp['sj_loc'].map({'start': 1, 'end': 2})
temp['Start'] = temp['Start'] - offset
temp['End'] = temp['Start'] + 2

temp.drop(['coord'], axis=1, inplace=True)

# sanity check: every region must be exactly 2 bp long
temp['len'] = temp.End-temp.Start
assert len(temp.loc[temp.len!=2]) == 0

temp = pr.PyRanges(temp)
# NOTE(review): `output` is not defined in the visible notebook cells;
# presumably it is the Snakemake `output` object -- confirm before running
# this cell interactively.
temp.to_bed(output.bed)

24934518
8311506


In [34]:
# Preview the last rows of the splice-site PyRanges (end-of-junction windows).
temp.tail()

Unnamed: 0,Chromosome,Strand,sj_id,Start,sj_loc,End
0,chrY,-,chrY_-_3187916_3218694,3218692,end,3218694
1,chrY,-,chrY_-_3188385_3219255,3219253,end,3219255
2,chrY,-,chrY_-_3220533_3220633,3220631,end,3220633
3,chrY,-,chrY_-_334518_338603,338601,end,338603
4,chrY,-,chrY_-_338711_338777,338775,end,338777
5,chrY,-,chrY_-_338897_340764,340762,end,340764
6,chrY,-,chrY_-_340941_341306,341304,end,341306
7,chrY,-,chrY_-_7067886_7076386,7076384,end,7076386


In [35]:
# Preview the first rows of the splice-site PyRanges (start-of-junction windows).
temp.head()

Unnamed: 0,Chromosome,Strand,sj_id,Start,sj_loc,End
0,GL000008.2,+,GL000008.2_+_155532_156720,155531,start,155533
1,GL000008.2,+,GL000008.2_+_83546_83926,83545,start,83547
2,GL000008.2,+,GL000008.2_+_83546_85566,83545,start,83547
3,GL000008.2,+,GL000008.2_+_84015_85456,84014,start,84016
4,GL000008.2,+,GL000008.2_+_84015_85566,84014,start,84016
5,GL000008.2,+,GL000008.2_+_84146_85566,84145,start,84147
6,GL000008.2,+,GL000008.2_+_85478_85566,85477,start,85479
7,GL000008.2,+,GL000008.2_+_85626_129984,85625,start,85627


In [36]:
# Write the regions to test.bed in the current working directory.
temp.to_bed('test.bed')