In [None]:


# Options:
# Get Refseq ID <-> is_C/D_box_snoRNA link
# Go through sams/refseq/
# Or, using only the consensus:
# Load the deletion bedgraphs.
# Get (Refseq ID or genomic interval) <-> is_C/D_box_snoRNA link
# Count in regions using bedgraph.

# Sequences, with a single ID from Refseq, are in:
# /opt/indexes/refseq_index/refseq_rna_reformatted.fasta
# The counts file should have the same Refseq IDs as the fasta.
# These can be extracted first, since it would be a slow step, and saved.
# The GTF can then be used to look up genomic coordinates, which can also be saved.

# Finally, the bedgraphs are read and mapped to snoRNAs from their genomic coordinates,
# and the sequences are used to partition. That data is sent here for plotting.



In [None]:
import re, pandas, HTSeq, glob, collections, importlib
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

import sameRiver
from sameRiver import snoUtils
from sameRiver.signal_normalizer import signal_normalizer
from sameRiver.bedgraphs import bedgraphs
from sameRiver.area import *
importlib.reload(sameRiver.area)

"""Text below is copied from: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4053766/
Insights into snoRNA biogenesis and processing from PAR-CLIP of snoRNA core proteins and small RNA sequencing

Curated C/D and H/ACA box snoRNA annotation
SnoRNA sequences were initially obtained from snoRNA-LBME-db. BLAT searches at the UCSC Genome Browser 
(http://genome.ucsc.edu/cgi-bin/hgBlat) were then conducted to map sequences to the human genome assembly hg19.
Based on the data available at snoRNA-LBME-db and inspection of evolutionary conservation profiles at the 
UCSC Genome Browser, we next annotated C, C’, D’ and D box motifs, and the terminal closing stem. For some 
snoRNAs, terminal end complementarity could be extended beyond the initial sequence obtained from snoRNA-LBME-db. 
In these cases sequence coordinates were adjusted accordingly.

Each individual snoRNA is represented by 4 lines, which correspond to the description of the snoRNA, 
the sequence, annotation of C (C), D (D), C'(c) and D'(d) boxes, and the location of the terminal stem region in
parenthesis notation. The description line contains the snoRNA name, its coordinates (chromosome, start, stop and 
strand) in the hg19 version of the human genome assembly from the University of California Santa Cruz, and the 
modifications sites and target molecules of the guide regions located upstream of the D’ box (d) or D box (D).

"""

In [None]:



            
class snoRNA(complex_area):
    """A C/D box snoRNA built from one four-line annotation record.

    The record supplies a name line, the sequence, a box-annotation string
    and a structure string.  In the box string, runs of 'C', 'D', 'c' and 'd'
    mark the C, D, C' and D' boxes respectively; the first run of each letter
    is located and stored as a (start, end) pair of 0-based, end-exclusive
    offsets into that string.

    Attributes set here:
        name, seq, box, structure: the four constructor arguments.
        c_box, c_prime_box, d_box, d_prime_box: (start, end) or None.
        boxes: dict with keys 'C', 'Cp', 'D', 'Dp' holding the same pairs.
        region_lens: dict of region name -> length in characters.
    """

    def __init__(self, name, seq, box, structure):
        """Store the record fields and immediately parse the box annotation."""
        # NOTE(review): complex_area.__init__ is not called; these three
        # attributes are presumably what the base class expects - confirm.
        self.iv = None
        self.sub_areas = {}
        self.regions = {}

        self.name, self.seq, self.box = name, seq, box
        self.structure = structure
        self.c_box, self.c_prime_box, self.d_box, self.d_prime_box = [None] * 4
        self.boxes = {}
        self.parse_seq()

    def check(self, re_result):
        """Return True when `re_result` is a usable single-group regex match."""
        if re_result is None:
            return False
        if len(re_result.groups()) > 1:
            print("??", re_result)
            return False
        return True

    def _find_box(self, symbol):
        """Return (start, end) of the first run of `symbol` in self.box, or None."""
        hit = re.search('({0}+)'.format(symbol), self.box)
        if self.check(hit):
            return (hit.start(), hit.end())
        return None

    def parse_seq(self):
        """Locate the four boxes in self.box, then derive the flanking regions."""
        self.c_prime_box = self._find_box('c')
        self.c_box = self._find_box('C')
        self.d_prime_box = self._find_box('d')
        self.d_box = self._find_box('D')
        self.boxes = {'C': self.c_box,
                      'Cp': self.c_prime_box,
                      'D': self.d_box,
                      'Dp': self.d_prime_box}
        self.define_CD_box_snoRNA_regions()

    def name_from_split(self, sep='|'):
        """Reduce self.name to the text before the first `sep`, minus leading '>'."""
        self.name = self.name.split(sep)[0]
        self.name = self.name.lstrip('>')

    def define_CD_box_snoRNA_regions(self):
        """Partition the snoRNA into boxes and the stretches between them.

        The canonical layout is
        preC, C, loop1, Dp, loop2, Cp, loop3, D, postD.
        When a box is missing, the stretch after the preceding box is named
        for the next box that does exist (e.g. 'C to D') or '<box> to end'.

        Side effects: sets self.region_lens and passes the region dict
        (name -> (start, end), with None for absent boxes) to
        self.set_regions().

        Returns:
            dict mapping region name to its length.
        """
        c_box, dp_box = self.boxes['C'], self.boxes['Dp']
        cp_box, d_box = self.boxes['Cp'], self.boxes['D']
        seq_len = len(self.seq)

        lens = {}
        regions = dict(self.boxes)  # Copy; box entries may be None.

        def add_gap(label, start, end):
            # Record a non-box stretch in both the length and interval tables.
            lens[label] = end - start
            regions[label] = (start, end)

        if c_box is not None:
            add_gap('preC', 0, c_box[0])
            lens['C'] = c_box[1] - c_box[0]
            if dp_box is not None:
                add_gap('loop1', c_box[1], dp_box[0])
            elif cp_box is not None:
                add_gap('C to Cp', c_box[1], cp_box[0])
            elif d_box is not None:
                add_gap('C to D', c_box[1], d_box[0])
            else:
                add_gap('C to end', c_box[1], seq_len)

        if dp_box is not None:
            lens['Dp'] = dp_box[1] - dp_box[0]
            # The stretch after D' ends at C' when C' exists; this test must
            # be on C' (not D') or the fallbacks below can never be reached.
            if cp_box is not None:
                add_gap('loop2', dp_box[1], cp_box[0])
            elif d_box is not None:
                add_gap('Dp to D', dp_box[1], d_box[0])
            else:
                add_gap('Dp to end', dp_box[1], seq_len)

        if cp_box is not None:
            lens['Cp'] = cp_box[1] - cp_box[0]
            if d_box is not None:
                add_gap('loop3', cp_box[1], d_box[0])
            else:
                # Stored as a (start, end) pair like every other region.
                add_gap('Cp to end', cp_box[1], seq_len)

        if d_box is not None:
            lens['D'] = d_box[1] - d_box[0]
            add_gap('postD', d_box[1], seq_len)

        self.region_lens = lens
        self.set_regions(regions)
        return lens
    

In [None]:


def read_snoRNA_structures(
        ann='/Users/dfporter/pma/miseq/Proteins/FBL/reformat_boxes.txt'):
    """Read a snoRNA annotation file into a complex_area_set.

    Each record starts with a line beginning with '>' and is followed by the
    lines passed, in order, to snoRNA(name, seq, box, structure).

    Args:
        ann: path to the annotation file.

    Returns:
        A complex_area_set holding one snoRNA per record, including the final
        record in the file.
    """
    snos = complex_area_set()

    def flush(record):
        # Turn the accumulated lines of one record into a snoRNA and store it.
        if not record:
            return
        # Blank lines trailing a complete record (e.g. at end of file) are
        # not part of it; a short record is left untouched.
        while len(record) > 4 and not record[-1].strip():
            record.pop()
        sno = snoRNA(*record)
        sno.name_from_split()
        snos.add(sno)

    record = []
    with open(ann) as handle:
        for li in handle:
            li = li.rstrip('\n')
            if li.startswith('>'):
                flush(record)
                record = [li]
            else:
                record.append(li)
    # The last record has no following header to trigger its flush.
    flush(record)
    return snos
# Build the snoRNA set from the default annotation file.
snos = read_snoRNA_structures()
# Fetch genomic coordinates via snoUtils and attach them to the set.
# NOTE(review): the structure of `coords` is not visible here - confirm
# against snoUtils.genomic_coordinates_of_snoRNAs.
coords = snoUtils.genomic_coordinates_of_snoRNAs()
snos.read_coordinates(coords)
print(coords)

# Report how many snoRNAs are held in the set.
print('len', len(snos.complex_areas))

In [None]:
def get_refseq_ids_of_targets(in_file='/Users/dfporter/pma/miseq/Proteins/counts.xls'):
    """Return the 'Gene' values of the 200 highest-count rows as a set.

    The spreadsheet at `in_file` is ranked by the 'FBL_170723_AGCTAG_AGT'
    column, largest first; the first 200 'Gene' entries are collected.
    """
    counts = pandas.read_excel(in_file)
    ranked = counts.sort_values(by=['FBL_170723_AGCTAG_AGT'], ascending=False)
    top_genes = ranked['Gene'].tolist()[:200]
    return set(top_genes)


# Top-ranked gene identifiers from the counts spreadsheet.
ids = get_refseq_ids_of_targets()
# Identifiers that snoUtils reports as snoRNAs.
# NOTE(review): `&` assumes this is a set (or supports set intersection) - confirm.
snos_refseq = snoUtils.refseq_ids_of_snoRNAs()
# Keep only the top-ranked identifiers that are snoRNAs.
ids = snos_refseq & ids
#fa = fasta()
#seqs = dict([(name, x) for name, x in fa.items() if name in ids])
#print("Seqs: {0}".format(len(seqs)))


In [None]:


def add_snoRNA_binding_from_a_bedgraph(
        snos, fname, norm_ga, cutoff=1, norm_len=50):
    """Load one bedgraph file and add its coverage to every snoRNA in `snos`.

    The file name is printed, the file is wrapped in a `bedgraphs` object and
    handed to `snos.add_coverage` together with `norm_ga`, `norm_len` and
    `cutoff`.  The same `norm_ga` object is returned.
    """
    print(fname)
    coverage = bedgraphs(fname)
    snos.add_coverage(coverage, norm_ga, norm_len=norm_len, cutoff=cutoff)
    return norm_ga


def add_snoRNA_binding_from_bedgraphs(snos, bedgraph_list, cutoff=1, norm_len=20):
    """Accumulate coverage from several bedgraph files into one normalizer.

    A signal_normalizer is created from the average region lengths of `snos`,
    given an extra 'Whole area' region of length `norm_len`, fed every file in
    `bedgraph_list`, normalized by region length and returned.
    """
    # The normalizer is sized from the average length of each region.
    normalizer = signal_normalizer(snos.average_region_lengths())
    normalizer.add_region('Whole area', norm_len)

    shared_options = dict(cutoff=cutoff, norm_len=norm_len)
    for fname in bedgraph_list:
        print(fname)
        add_snoRNA_binding_from_a_bedgraph(snos, fname, normalizer, **shared_options)

    normalizer.normalize_regions_by_length()
    return normalizer

# Input directory for the deletion tracks.  The second assignment overrides
# the first, so only the /Volumes/Seagate path is actually used.
in_dir = '/Users/dfporter/pma/miseq/Runs/170830/sams/consensus/'
in_dir = '/Volumes/Seagate/hiseq/170924/sams/consensus/'
# Collect the '+' deletion .wig files whose names start with the AGCTAG prefix.
bedgraph_list = glob.glob(in_dir + '/{prefix}*_deletions_+.wig'.format(prefix='AGCTAG'))
print(bedgraph_list)
print(snos)
# Add coverage from every matched file; default cutoff and norm_len apply.
normalizer = add_snoRNA_binding_from_bedgraphs(snos, bedgraph_list)

#print(normalizer.ga)

In [None]:

#sns.hls_palette(8, l=.3, s=.8)

In [None]:
def plot_arr(norm_ga):
    """Print `norm_ga`, then show it as a line plot against its indices.

    The current pyplot figure is cleared both before drawing and after
    showing.
    """
    print(norm_ga)
    positions = range(len(norm_ga))
    heights = np.array(norm_ga)
    plt.clf()
    plt.plot(positions, heights)
    plt.show()
    plt.clf()

# Signal accumulated over the 'Whole area' region of the normalizer.
norm_ga = normalizer.ga['Whole area']
print(norm_ga)
print('-' * 10)
# Region names in the order they are concatenated and plotted.
all_regions = ['preC', 'C', 'loop1', 'Dp', 'loop2', 'Cp', 'loop3', 'D', 'postD']
# Concatenated signal for all regions; later cells read `combined`.
combined = normalizer.put_regions_together(all_regions)

#plot_arr(combined)
plt.clf()
# Running x offset so each region is drawn directly after the previous one.
start = 0
pal = sns.cubehelix_palette(len(all_regions), start=18, rot=-.5)
for n, name in enumerate(all_regions):
    region_len = normalizer.lengths[name]
    # Print the declared length next to the actual signal length for comparison.
    print(region_len, len(normalizer.ga[name]))
    print(normalizer.ga[name])
    # One line segment per region, each in its own palette colour.
    plt.plot(range(start, start+region_len), normalizer.ga[name], c=pal[n])
    start += region_len
plt.show()

In [None]:
# Fraction of the combined signal occupied by each region, plus the running
# (cumulative) fraction marking where each region ends on a 0..1 scale.
all_regions = ['preC', 'C', 'loop1', 'Dp', 'loop2', 'Cp', 'loop3', 'D', 'postD']
fraction = {}
running_fraction = []
for region in all_regions:
    fraction[region] = normalizer.lengths[region]/len(combined)
    if running_fraction:
        # Every region after the first ends where the previous ended plus its own share.
        running_fraction.append(fraction[region] + running_fraction[-1])
    else:
        # First region ('preC') starts the running total.
        running_fraction.append(fraction[region])
print(fraction)
print(sum(fraction.values()))
print(running_fraction)

# (start, end) bounds of each drawn segment, taken from consecutive
# cumulative fractions.
pre_c_a, pre_c_b = np.array([0] + running_fraction[:1])  # preC, not plotted
seg1_a, seg1_b = np.array(running_fraction[0:2])  # line up, C box
seg2_a, seg2_b = np.array(running_fraction[1:3])  # left circle, loop
seg3_a, seg3_b = np.array(running_fraction[2:4])  # line up on top, D prime box
D_to_C_a, D_to_C_b = np.array(running_fraction[3:5])  # Dprime to Cprime box loop
seg4_a, seg4_b = np.array(running_fraction[4:6])  # line down on top, C prime box
seg5_a, seg5_b = np.array(running_fraction[5:7])  # right circle, loop
seg6_a, seg6_b = np.array(running_fraction[6:8])  # line down bottom right, D box
post_d_a, post_d_b = np.array(running_fraction[7:9])  # postD, not plotted

print(normalizer.lengths)
print(combined)
print(len(combined))
print(sum(normalizer.lengths.values()))

In [None]:
# Start from a cleared pyplot figure.
plt.clf()

# (start, end) bounds on t that gate the three branches of c_segmented_curve.
c_seg1_a, c_seg1_b = np.array([10, 10.01])  # line up
c_seg2_a, c_seg2_b = np.array([0.0, 0.99])  # circle
c_seg3_a, c_seg3_b = np.array([10.99, 11.])  # line down 

def c_segmented_curve(t):
    """Map a position `t` to (x, y, deg) on a three-part hairpin outline.

    The outline is a line up, a circular arc and a line down, gated by the
    module-level bounds c_seg1_*, c_seg2_* and c_seg3_*.  Progress through a
    segment is measured against the same bounds that select it.  A `t`
    outside every segment yields (0, 0, 0).

    Returns:
        (x, y, deg): drawing coordinates and the angle value, in degrees,
        assigned to that segment.
    """
    x = 0
    y = 0
    deg = 0
    hairpin_width = 0.5
    x0 = 0.1
    y0 = 0.1

    if c_seg1_a <= t < c_seg1_b:
        # Straight line up: y runs from y0 to y0 + 1.
        x = x0
        y = ((t - c_seg1_a) / (c_seg1_b - c_seg1_a)) + y0
        deg = 90

    if c_seg2_a <= t < c_seg2_b:
        radius = 1
        center = [0 + x0, radius + 1 + y0]
        fraction_through = (t - c_seg2_a) / (c_seg2_b - c_seg2_a)
        # Only the middle 70% of the half-turn is used, leaving a gap at each end.
        compression = 0.15 + 0.7 * fraction_through
        v = 1 - compression
        x = center[0] + radius * np.cos(v*np.pi + np.pi/2)
        y = center[1] + radius * np.sin(v*np.pi + np.pi/2)
        deg = 180 + 90 + np.rad2deg(v*np.pi + np.pi/2)

    if c_seg3_a <= t <= c_seg3_b:
        # Straight line down on the other side of the hairpin.
        x = hairpin_width + x0
        y = 1 - ((t - c_seg3_a) / (c_seg3_b - c_seg3_a)) + y0
        deg = 270

    return x, y, deg



def segmented_curve(t):
    """Map a fractional position `t` to (x, y, deg) on the snoRNA cartoon.

    The outline is traversed in this order, each segment being selected by
    module-level (start, end) bounds on `t`:
    preC tail, C box (line up), loop (left arc), D' box (line up),
    D'-to-C' loop (small circle), C' box (line down), loop (right arc),
    D box (line down), postD tail.

    The branches are evaluated in sequence, so when `t` satisfies more than
    one condition the later segment wins.  A `t` outside every segment yields
    (0, 0, 0).

    Returns:
        (x, y, deg): drawing coordinates and the angle value, in degrees,
        assigned to that segment.
    """
    x = 0
    y = 0
    deg = 0
    hairpin_width = 0.1
    x0 = 0.1
    y0 = 0.1

    def through(lo, hi):
        # Fraction of the way from lo to hi at which t sits.
        return (t - lo) / (hi - lo)

    if pre_c_a <= t < pre_c_b:
        # Diagonal tail leading in to the base of the C box.
        deg = 60
        frac_through = through(pre_c_a, pre_c_b)
        x = x0 - 1.5 + (frac_through)
        y = y0 - 1.5 + frac_through

    if seg1_a <= t < seg1_b:
        # C box: vertical line from y0 to y0 + 1.
        x = x0
        y = (through(seg1_a, seg1_b)) + y0
        deg = 90

    if seg2_a <= t < seg2_b:
        # Left loop: an arc of a unit circle centred above the C box.
        radius = 1
        center = [0 + x0, radius + 1 + y0]
        fraction_through = through(seg2_a, seg2_b)
        # Use 85% of the half-turn, offset by 10%, leaving gaps at both ends.
        compression = 0.10 + 0.85 * fraction_through
        v = 1 - (compression)
        x = center[0] + radius * np.cos(v*np.pi + np.pi/2)
        y = center[1] + radius * np.sin(v*np.pi + np.pi/2)
        deg = 180 + 90 + np.rad2deg(v*np.pi + np.pi/2)

    if seg3_a <= t < seg3_b:
        # D' box: vertical line from y0 + 3 to y0 + 4.
        x = x0
        y = (through(seg3_a, seg3_b)) + 3 + y0
        deg = 90

    if D_to_C_a <= t < D_to_C_b:
        # D'-to-C' loop: a smaller circle sitting above the two upper boxes.
        radius = 0.5
        center = [1 + x0, radius + 4 + y0]
        fraction_through = through(D_to_C_a, D_to_C_b)
        compression = 0.10 + 0.85 * fraction_through
        v = 1 - (compression)
        # The position sweeps at twice the angular rate of the other arcs.
        x = center[0] + radius * np.cos(2*v*np.pi + np.pi/2)
        y = center[1] + radius * np.sin(2*v*np.pi + np.pi/2)
        # NOTE(review): deg uses v*pi while x/y use 2*v*pi - confirm which is
        # intended before relying on deg for this segment.
        deg = 180 + 90 + np.rad2deg(v*np.pi + np.pi/2)

    if seg4_a <= t < seg4_b:
        # C' box: vertical line down from y0 + 4 to y0 + 3.
        x = hairpin_width + x0
        y = 4 - (through(seg4_a, seg4_b)) + y0
        deg = 270

    if seg5_a <= t < seg5_b:
        # Right loop: mirror of the left arc, on the other side of the stem.
        radius = 1
        center = [hairpin_width + x0, radius + 1 + y0]
        fraction_through = through(seg5_a, seg5_b)
        compression = 0.1 + 0.85 * fraction_through
        v = 1 - compression
        x = center[0] + radius * np.cos(v*np.pi - np.pi/2)
        y = center[1] + radius * np.sin(v*np.pi - np.pi/2)
        deg = 270 + np.rad2deg(v*np.pi - np.pi/2)

    if seg6_a <= t <= seg6_b:
        # D box: vertical line down from y0 + 1 to y0 (end bound inclusive).
        x = hairpin_width + x0
        y = 1 - (through(seg6_a, seg6_b)) + y0
        deg = 270

    if post_d_a <= t < post_d_b:
        # Diagonal tail leading away from the base of the D box.
        deg = 270
        frac_through = through(post_d_a, post_d_b)
        x = hairpin_width + x0 - frac_through
        y = -0.5 + y0 - frac_through

    return x, y, deg

# One x position per element of the combined signal.
xvals = range(len(combined))
# Signal values scaled down by 1000; to_color reads `yvals` for its maximum.
yvals = np.array(combined)/1000
# Figure and equal-aspect axes that the cartoon is drawn onto below.
fig1 = plt.figure()
ax1 = fig1.add_subplot(111, aspect='equal')
position = np.arange(len(xvals))

# 100-step diverging palette used to colour each point by signal.
pal = sns.color_palette("RdBu_r", 100)
# Preview the palette, then clear the current pyplot figure.
sns.palplot(pal)
plt.show()
plt.clf()
#pal = sns.cubehelix_palette(100)
def to_color(val, pal, max_val=None):
    """Pick the palette entry corresponding to `val` on a 0..max scale.

    Args:
        val: the value to colour.
        pal: indexable sequence of colours, ordered low to high.
        max_val: value mapped to the last palette entry.  When omitted, the
            maximum of the module-level `yvals` is used.

    Returns:
        pal[i], where i grows linearly with val / max_val and is clamped to
        the valid index range, so 0 maps to the first colour and max_val to
        the last.  When the maximum is not positive the first colour is
        returned.
    """
    if max_val is None:
        max_val = max(yvals)
    n_colors = len(pal)
    if max_val <= 0:
        # No dynamic range to scale against.
        return pal[0]
    index = int(n_colors * val / max_val)
    # Clamp: val == max_val would land one past the end, and values below
    # zero must not wrap round to the top of the palette.
    return pal[min(max(index, 0), n_colors - 1)]


print(normalizer.lengths)
# Draw one small circle per signal position, placed along the cartoon
# outline and coloured by the signal at that position.
for pos, bar_length in zip(position, yvals):
    t = pos/len(position)  # fractional position along the outline
    x, y, deg = segmented_curve(t)
    ax1.add_patch(patches.Circle(
        (x, y),
        linestyle='None',
        color=to_color(bar_length, pal),
        radius=.05,
    ))
ax1.set_xlim(-4, 3)
ax1.set_ylim(-3, 8)

fig1.set_figwidth(10)
fig1.set_figheight(10)
# Save through the figure handle: after the palette preview and plt.show()
# the "current" pyplot figure is no longer fig1, so plt.savefig would write
# a different (empty) figure.
fig1.savefig('/Users/dfporter/pma/easyCLIP paper/snoRNA_bargraph.pdf')
plt.close(fig1)
plt.clf(); plt.close()
