Skip to content

Analysis comparing the use of alternative trans-splicing and alternative-polyadenylation sites across developmental stages in Leishmania major

Notifications You must be signed in to change notification settings

elsayed-lab/lmajor_alternate_acceptor_site_usage

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

4 Commits
 
 
 
 

Repository files navigation

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta http-equiv="Content-Style-Type" content="text/css" />
  <meta name="generator" content="pandoc" />
  <title>L. major UTR Analysis</title>
  <style type="text/css">code{white-space: pre;}</style>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <!-- jQuery -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/jqueryui/1.10.3/jquery-ui.min.js"></script>
  
  <!-- bootstrap -->
  <!--<link href="https://netdna.bootstrapcdn.com/bootstrap/3.0.0/css/bootstrap.min.css" rel="stylesheet"  id="style">-->
  <script src="https://netdna.bootstrapcdn.com/bootstrap/3.0.0/js/bootstrap.min.js"></script>
  
  <!-- highlight.js -->
  <!--<link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/7.3/styles/default.min.css" rel="stylesheet" id="code-style">-->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/7.3/highlight.min.js"></script>
  <script>
  hljs.LANGUAGES.r=function(a){var b="([a-zA-Z]|\\.[a-zA-Z.])[a-zA-Z0-9._]*";return{c:[a.HCM,{b:b,l:b,k:{keyword:"function if in break next repeat else for return switch while try tryCatch|10 stop warning require library attach detach source setMethod setGeneric setGroupGeneric setClass ...|10",literal:"NULL NA TRUE FALSE T F Inf NaN NA_integer_|10 NA_real_|10 NA_character_|10 NA_complex_|10"},r:0},{cN:"number",b:"0[xX][0-9a-fA-F]+[Li]?\\b",r:0},{cN:"number",b:"\\d+(?:[eE][+\\-]?\\d*)?L\\b",r:0},{cN:"number",b:"\\d+\\.(?!\\d)(?:i\\b)?",r:0},{cN:"number",b:"\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",r:0},{b:"`",e:"`",r:0},{cN:"string",b:'"',e:'"',c:[a.BE],r:0},{cN:"string",b:"'",e:"'",c:[a.BE],r:0}]}}(hljs); </script>
  <!--<script type="text/javascript", src="https://yandex.st/highlightjs/7.3/languages/r.min.js"></script>-->
  
  <!-- Manific Popup -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/magnific-popup.js/0.8.9/jquery.magnific-popup.min.js"></script>
  
  <script type="text/javascript"
    src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
  </script>
  <script defer="defer">
  // Function to generate the dynamic table of contents
  jQuery.fn.generate_TOC = function () {
    var base = $(this[0]);
  
    var selectors = ['h1', 'h2', 'h3', 'h4'];
  
    var last_ptr = [{}, {}, {}, {}];
  
    var anchors = {};
  
    generate_anchor = function (text) {
      var test = text.replace(/\W/g, '_');
  
      while(test in anchors){
        //if no suffix, add one
        if(test.match(/_\d+$/) === null){
          test = test + "_2";
        }
        //else generate unique id for duplicates by adding one to the suffix
        else {
          test = test.replace(/_(\d+)$/, function(match, number){ var num=+number+1; return("_" + num) });
        }
      }
      anchors[test]=1;
      return(test);
    }
  
    $(selectors.join(',')).filter(function(index) { return $(this).parent().attr("id") != 'header'; }).each(function () {
  
      var heading = $(this);
      var idx = selectors.indexOf(heading.prop('tagName').toLowerCase());
      var itr = 0;
  
      while (itr <= idx) {
        if (jQuery.isEmptyObject(last_ptr[itr])) {
          last_ptr[itr] = $('<ul>').addClass('nav');
          if (itr === 0) {
            base.append(last_ptr[itr])
          } else {
            if(last_ptr[itr-1].children('li').length === 0){
              last_ptr[itr-1].append(last_ptr[itr]);
            }
            else {
              last_ptr[itr - 1].children('li').last().append(last_ptr[itr]);
            }
          }
        }
        itr++;
      }
      var anchor = generate_anchor(heading.text());
      heading.attr('id', anchor);
      var a = $('<a>')
      .text(heading.text())
      .attr('href', '#' + anchor);
  
    var li = $('<li>')
      .append(a);
  
    last_ptr[idx].append(li);
    for (i = idx + 1; i < last_ptr.length; i++) {
      last_ptr[i] = {};
    }
    });
  }
  /* run scripts when document is ready */
  $(function() {
    "use strict";
  
    var $window = $(window);
    var $body = $(document.body);
  
    /* size of thumbnails */
  
    var hidden_types = ['source']
    var output_types = ['output', 'message', 'warning', 'error']
  
    /* style tables */
    $('table').addClass('table table-striped table-bordered table-hover table-condensed');
  
    $('pre code').each(function(i, e) {
      hljs.highlightBlock(e);
    });
  
    /* Magnific Popup */
    $(".thumbnail").each(function(){
      $(this).magnificPopup({
        disableOn: 768,
        closeOnContentClick: true,
  
        type: 'image',
        items: {
          src: $(this).find('img').attr('src'),
        }
      });
    });
  
    function toggle_block(obj, show) {
      var span = obj.find('span');
      if(show === true){
        span.removeClass('glyphicon-chevron-up').addClass('glyphicon-chevron-down');
        obj.next('pre').slideDown();
      }
      else {
        span.removeClass('glyphicon-chevron-down').addClass('glyphicon-chevron-up');
        obj.next('pre').slideUp();
      }
    }
  
    function toggle_thumbnails(imgs, show){
      if(show === true){
        imgs.parents().show()
        imgs.slideDown();
      }
      else {
        imgs.slideUp(400, function(){ $(this).parent().hide(); });
      }
    }
  
    function global_toggle(obj){
      var type = obj.attr('type');
      var show = !obj.parent('li').hasClass('active');
      if(show === true){
        obj.parent('li').addClass('active');
      }
      else{
        obj.parent('li').removeClass('active');
      }
      if(type == 'figure'){
        toggle_thumbnails($('.thumbnail img'), show);
      }
      else {
        $('.toggle.' + type).each(function() { toggle_block($(this), show); });
      }
    }
  
    /* onclick toggle next code block */
    $('.toggle').click(function() {
      var span = $(this).find('span');
      toggle_block($(this), !span.hasClass('glyphicon-chevron-down'));
      return false
    })
  
    // global toggles
    $('.toggle-global').click(function(){
      var type = $(this).attr('type');
      if(type === 'all-source'){
          $('li a.source').each(function() {
            global_toggle($(this));
          });
        }
      else if(type === 'all-output'){
        $.each(output_types, function(i, val){
          console.log(val);
          global_toggle($('li a.' + val));
        });
      }
      else {
        console.log($(this));
        global_toggle($(this));
      }
      return false;
    });
    /* table of contents */
    if($(['h1', 'h2', 'h3', 'h4'].join(',')).length > 0){
      $('body > #wrap > .container > .row').append('<div class="col-md-2"><div id="toc" class="well sidebar sidenav affix hidden-print"/></div>');
      $('#toc').generate_TOC();
    }
  
    $.each(hidden_types, function(i, type) {
      $('li[type=' + type + ']').each(function(){ global_toggle($(this)); });
    });
  
    /* remove paragraphs with no content */
    $('p:empty').remove();
  
    $body.scrollspy({
      target: '.sidebar',
    });
  
    /* theme switch */
    $('.theme-switch').click(function(){
      var css = $('link[title=' + $(this).attr('title') + ']');
      $('#theme[rel=stylesheet]').attr('href', css.attr('href'));
      $('.theme-switch').closest('li').removeClass('active');
      $(this).closest('li').addClass('active');
      return false;
    });
    /* code style switch */ //TODO use same function for both of these?
    $('.highlight-switch').click(function(){
      var css = $('link[title="' + $(this).attr('title') + '"]');
      $('#highlight[rel=stylesheet]').attr('href', css.attr('href'));
      $('.highlight-switch').closest('li').removeClass('active');
      $(this).closest('li').addClass('active');
      return false;
    });
  
    //TODO refresh on show/hide
    $window.on('load', function () {
      $body.scrollspy('refresh');
    })
  
  });
  
  </script>
  <style>
  /* Knitr_bootstrap styles */
  #header {
    display: none !important;
    visibility: hidden !important;
  }
  #wrap .container-fluid {
    padding: 0;
    overflow: hidden;
  }
  .toggle{
    text-transform: capitalize;
  }
  
  .toggle-global{
    text-transform: capitalize;
  }
  
  /* Sticky footer styles */
  * {
    margin:0;
  }
  html,
  body {
      height: 100%;
      padding:0 !important;
      /* The html and body elements cannot have any padding or margin. */
      /*overflow-x: hidden;*/
  }
  
  /* Wrapper for page content to push down footer */
  #wrap {
      min-height: 100%;
      height: auto !important;
      height: 100%;
      /* Negative indent footer by it's height */
      margin: 0 auto -120px;
  }
  
  /* Set the fixed height of the footer here */
  #push,
  #footer {
      height: 120px;
  }
  
  #footer {
    text-align: center;
  }
  
  /* Top level subheader elements.  These are the first nested items underneath a header element. */
  .header li {
    font-size: 20px;
  }
  
  /* Makes the font smaller for all subheader elements. */
  .sub-header li {
      font-size: 12px;
  }
  
  button.thumbnails {
    margin-left:0px;
  }
  
  /*
   * Side navigation
   *
   * Scrollspy and affixed enhanced navigation to highlight sections and secondary
   * sections of docs content.
   */
  
  /* By default it's not affixed in mobile views, so undo that */
  .sidebar.affix {
    position: static;
  }
  
  /* First level of nav */
  .sidenav {
    margin-top: 30px;
    margin-bottom: 30px;
    padding-top:    10px;
    padding-bottom: 10px;
    border-radius: 5px;
  }
  
  /* All levels of nav */
  .sidebar .nav > li > a {
    display: block;
    padding: 5px 20px;
  }
  .sidebar .nav > li > a:hover,
  .sidebar .nav > li > a:focus {
    text-decoration: none;
    border-right: 1px solid;
  }
  .sidebar .nav > .active > a,
  .sidebar .nav > .active:hover > a,
  .sidebar .nav > .active:focus > a {
    font-weight: bold;
    background-color: transparent;
    border-right: 1px solid;
  }
  
  /* Nav: second level (shown on .active) */
  .sidebar .nav .nav {
    display: none; /* Hide by default, but at >768px, show it */
    margin-bottom: 8px;
  }
  .sidebar .nav .nav > li > a {
    padding-top:    3px;
    padding-bottom: 3px;
    padding-left: 30px;
    font-size: 90%;
  }
  
  .sidebar .nav .nav .nav > li > a {
    padding-left: 40px;
  }
  .sidebar .nav .nav .nav .nav > li > a {
    padding-left: 50px;
  }
  
  /* Show and affix the side nav when space allows it */
  @media screen and (min-width: 992px) {
    .sidebar .nav > .active > ul {
      display: block;
    }
    /* Widen the fixed sidebar */
    .sidebar.affix,
    .sidebar.affix-bottom {
      width: 213px;
    }
    .sidebar.affix-top,
    .sidebar.affix {
      position: fixed; /* Undo the static from mobile first approach */
      top: 30px;
    }
    .sidebar.affix-bottom {
      position: absolute; /* Undo the static from mobile first approach */
    }
    .sidebar.affix-bottom .sidenav,
    .sidebar.affix .sidenav {
      margin-top: 0;
      margin-bottom: 0;
    }
  }
  @media screen and (min-width: 1200px) {
    /* Widen the fixed sidebar again */
    .sidebar.affix-bottom,
    .sidebar.affix {
      width: 263px;
    }
  }
  
  #toc {
    padding: 10px 0px;
    margin:0;
    border:0;
  }
  
  
  .panel pre {
    margin: 0;
    padding: 0;
    border: 0;
  }
  button + pre {
    margin: 0;
    padding: 0;
  }
  pre code {
    border-radius: 0;
  }
  /* Magnific Popup CSS */
  .mfp-bg {
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: 1042;
    overflow: hidden;
    position: fixed;
    background: #0b0b0b;
    opacity: 0.8;
    filter: alpha(opacity=80); }
  
  .mfp-wrap {
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: 1043;
    position: fixed;
    outline: none !important;
    -webkit-backface-visibility: hidden; }
  
  .mfp-container {
    text-align: center;
    position: absolute;
    width: 100%;
    height: 100%;
    left: 0;
    top: 0;
    padding: 0 8px;
    -webkit-box-sizing: border-box;
    -moz-box-sizing: border-box;
    box-sizing: border-box; }
  
  .mfp-container:before {
    content: '';
    display: inline-block;
    height: 100%;
    vertical-align: middle; }
  
  .mfp-align-top .mfp-container:before {
    display: none; }
  
  .mfp-content {
    position: relative;
    display: inline-block;
    vertical-align: middle;
    margin: 0 auto;
    text-align: left;
    z-index: 1045; }
  
  .mfp-inline-holder .mfp-content,
  .mfp-ajax-holder .mfp-content {
    width: 100%;
    cursor: auto; }
  
  .mfp-ajax-cur {
    cursor: progress; }
  
  .mfp-zoom-out-cur,
  .mfp-zoom-out-cur .mfp-image-holder .mfp-close {
    cursor: -moz-zoom-out;
    cursor: -webkit-zoom-out;
    cursor: zoom-out; }
  
  .mfp-zoom {
    cursor: pointer;
    cursor: -webkit-zoom-in;
    cursor: -moz-zoom-in;
    cursor: zoom-in; }
  
  .mfp-auto-cursor .mfp-content {
    cursor: auto; }
  
  .mfp-close,
  .mfp-arrow,
  .mfp-preloader,
  .mfp-counter {
    -webkit-user-select: none;
    -moz-user-select: none;
    user-select: none; }
  
  .mfp-loading.mfp-figure {
    display: none; }
  
  .mfp-hide {
    display: none !important; }
  
  .mfp-preloader {
    color: #cccccc;
    position: absolute;
    top: 50%;
    width: auto;
    text-align: center;
    margin-top: -0.8em;
    left: 8px;
    right: 8px;
    z-index: 1044; }
  
  .mfp-preloader a {
    color: #cccccc; }
  
  .mfp-preloader a:hover {
    color: white; }
  
  .mfp-s-ready .mfp-preloader {
    display: none; }
  
  .mfp-s-error .mfp-content {
    display: none; }
  
  button.mfp-close,
  button.mfp-arrow {
    overflow: visible;
    cursor: pointer;
    background: transparent;
    border: 0;
    -webkit-appearance: none;
    display: block;
    padding: 0;
    z-index: 1046;
    -webkit-box-shadow: none;
    box-shadow: none; }
  
  button::-moz-focus-inner {
    padding: 0;
    border: 0; }
  
  .mfp-close {
    width: 44px;
    height: 44px;
    line-height: 44px;
    position: absolute;
    right: 0;
    top: 0;
    text-decoration: none;
    text-align: center;
    opacity: 0.65;
    padding: 0 0 18px 10px;
    color: white;
    font-style: normal;
    font-size: 28px;
    font-family: Arial, Baskerville, monospace; }
    .mfp-close:hover, .mfp-close:focus {
      opacity: 1; }
    .mfp-close:active {
      top: 1px; }
  
  .mfp-close-btn-in .mfp-close {
    color: #333333; }
  
  .mfp-image-holder .mfp-close,
  .mfp-iframe-holder .mfp-close {
    color: white;
    right: -6px;
    text-align: right;
    padding-right: 6px;
    width: 100%; }
  
  .mfp-counter {
    position: absolute;
    top: 0;
    right: 0;
    color: #cccccc;
    font-size: 12px;
    line-height: 18px; }
  
  .mfp-arrow {
    position: absolute;
    opacity: 0.65;
    margin: 0;
    top: 50%;
    margin-top: -55px;
    padding: 0;
    width: 90px;
    height: 110px;
    -webkit-tap-highlight-color: rgba(0, 0, 0, 0); }
  
  .mfp-arrow:active {
    margin-top: -54px; }
  
  .mfp-arrow:hover,
  .mfp-arrow:focus {
    opacity: 1; }
  
  .mfp-arrow:before, .mfp-arrow:after,
  .mfp-arrow .mfp-b,
  .mfp-arrow .mfp-a {
    content: '';
    display: block;
    width: 0;
    height: 0;
    position: absolute;
    left: 0;
    top: 0;
    margin-top: 35px;
    margin-left: 35px;
    border: medium inset transparent; }
  .mfp-arrow:after,
  .mfp-arrow .mfp-a {
    border-top-width: 13px;
    border-bottom-width: 13px;
    top: 8px; }
  .mfp-arrow:before,
  .mfp-arrow .mfp-b {
    border-top-width: 21px;
    border-bottom-width: 21px; }
  
  .mfp-arrow-left {
    left: 0; }
    .mfp-arrow-left:after,
    .mfp-arrow-left .mfp-a {
      border-right: 17px solid white;
      margin-left: 31px; }
    .mfp-arrow-left:before,
    .mfp-arrow-left .mfp-b {
      margin-left: 25px;
      border-right: 27px solid #3f3f3f; }
  
  .mfp-arrow-right {
    right: 0; }
    .mfp-arrow-right:after,
    .mfp-arrow-right .mfp-a {
      border-left: 17px solid white;
      margin-left: 39px; }
    .mfp-arrow-right:before,
    .mfp-arrow-right .mfp-b {
      border-left: 27px solid #3f3f3f; }
  
  .mfp-iframe-holder {
    padding-top: 40px;
    padding-bottom: 40px; }
  
  .mfp-iframe-holder .mfp-content {
    line-height: 0;
    width: 100%;
    max-width: 900px; }
  
  .mfp-iframe-scaler {
    width: 100%;
    height: 0;
    overflow: hidden;
    padding-top: 56.25%; }
  
  .mfp-iframe-scaler iframe {
    position: absolute;
    display: block;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    box-shadow: 0 0 8px rgba(0, 0, 0, 0.6);
    background: black; }
  
  .mfp-iframe-holder .mfp-close {
    top: -40px; }
  
  /* Main image in popup */
  img.mfp-img {
    width: auto;
    max-width: 100%;
    height: auto;
    display: block;
    line-height: 0;
    -webkit-box-sizing: border-box;
    -moz-box-sizing: border-box;
    box-sizing: border-box;
    padding: 40px 0 40px;
    margin: 0 auto; }
  
  /* The shadow behind the image */
  .mfp-figure:after {
    content: '';
    position: absolute;
    left: 0;
    top: 40px;
    bottom: 40px;
    display: block;
    right: 0;
    width: auto;
    height: auto;
    z-index: -1;
    box-shadow: 0 0 8px rgba(0, 0, 0, 0.6);
    background: #444444; }
  
  .mfp-figure {
    line-height: 0; }
  
  .mfp-bottom-bar {
    margin-top: -36px;
    position: absolute;
    top: 100%;
    left: 0;
    width: 100%;
    cursor: auto; }
  
  .mfp-title {
    text-align: left;
    line-height: 18px;
    color: #f3f3f3;
    word-wrap: break-word;
    padding-right: 36px; }
  
  .mfp-figure small {
    color: #bdbdbd;
    display: block;
    font-size: 12px;
    line-height: 14px; }
  
  .mfp-image-holder .mfp-content {
    max-width: 100%; }
  
  .mfp-gallery .mfp-image-holder .mfp-figure {
    cursor: pointer; }
  
  @media screen and (max-width: 800px) and (orientation: landscape), screen and (max-height: 300px) {
    /**
     * Remove all paddings around the image on small screen
     */
    .mfp-img-mobile .mfp-image-holder {
      padding-left: 0;
      padding-right: 0; }
  
    .mfp-img-mobile img.mfp-img {
      padding: 0; }
  
    /* The shadow behind the image */
    .mfp-img-mobile .mfp-figure:after {
      top: 0;
      bottom: 0; }
  
    .mfp-img-mobile .mfp-bottom-bar {
      background: rgba(0, 0, 0, 0.6);
      bottom: 0;
      margin: 0;
      top: auto;
      padding: 3px 5px;
      position: fixed;
      -webkit-box-sizing: border-box;
      -moz-box-sizing: border-box;
      box-sizing: border-box; }
  
    .mfp-img-mobile .mfp-bottom-bar:empty {
      padding: 0; }
  
    .mfp-img-mobile .mfp-counter {
      right: 5px;
      top: 3px; }
  
    .mfp-img-mobile .mfp-close {
      top: 0;
      right: 0;
      width: 35px;
      height: 35px;
      line-height: 35px;
      background: rgba(0, 0, 0, 0.6);
      position: fixed;
      text-align: center;
      padding: 0; }
  
    .mfp-img-mobile .mfp-figure small {
      display: inline;
      margin-left: 5px; } }
  @media all and (max-width: 900px) {
    .mfp-arrow {
      -webkit-transform: scale(0.75);
      transform: scale(0.75); }
  
    .mfp-arrow-left {
      -webkit-transform-origin: 0;
      transform-origin: 0; }
  
    .mfp-arrow-right {
      -webkit-transform-origin: 100%;
      transform-origin: 100%; }
  
    .mfp-container {
      padding-left: 6px;
      padding-right: 6px; } }
  .mfp-ie7 .mfp-img {
    padding: 0; }
  .mfp-ie7 .mfp-bottom-bar {
    width: 600px;
    left: 50%;
    margin-left: -300px;
    margin-top: 5px;
    padding-bottom: 5px; }
  .mfp-ie7 .mfp-container {
    padding: 0; }
  .mfp-ie7 .mfp-content {
    padding-top: 44px; }
  .mfp-ie7 .mfp-close {
    top: 0;
    right: 0;
    padding-top: 0; }
  
  //Magnific overrides
  .mfp-image img{
    background: white;
  }
  .mfp-figure:after {
    background: white;
  }
  
  /*
   * Off Canvas navbar toggle right
   * --------------------------------------------------
   */
  
  @media screen and (max-width: 768px) {
    .row-offcanvas .collapsing {
    -webkit-transition: none 0;
      -moz-transition: none 0;
      transition: none 0;
    }
   .row-offcanvas .navbar {
    position: absolute;
    z-index: 2;
      right:0;
      height:100%;
      width:55px;
      border:0;
      background-color:transparent;
    }
    .row-offcanvas .navbar-toggle {
      margin-right: 5px;
      margin-left: 5px;
    }
    .row-offcanvas {
      position: relative;
    }
    .row-offcanvas-right.active .navbar {
    position: absolute;
    z-index: 2;
      right: -28.4%;
      width:40%;
      background-color:#eee;
      border:0 solid #ddd;
      border-left-width:1px;
    }
    .row-offcanvas-right.active {
      left: -30%;
    }
    .row-offcanvas-right.active .navbar-collapse {
      position: relative;
      width: 100%;
    }
    .row-offcanvas .content {
    /*width:calc(100% - 60px);*/
    }
  }
  </style>
</head>
<body>
<div id="header">
<h1 class="title">L. major UTR Analysis</h1>
</div>
<div id="wrap">
<div class="container">
<div class="row row-offcanvas row-offcanvas-right">
<div class="contents col-xs-12 col-md-10">
<h1>L. major UTR Length and Alternative Trans-splicing / Poly-adenylation Analysis</h1>
<h2>Overview</h2>
<p>The goal of this analysis is to parse the output from our <a href="https://github.com/khughitt/utr_analysis">UTR analysis pipeline</a>, perform some basic smoothing of the spliced leader and polya acceptor site counts and determine the most likely primary UTR boundaries for each gene where information is available. 5'- and 3'-UTR boundaries, along with some other useful information such as UTR GC- and CT-richness will be outputted in a format that is convenient for downstream analysis.</p>
<p>Finally, we will look for evidence of either alternative trans-splicing or poly-adenylation and attempt to visualize the prevelance and magnitude of these events across the different developmental stages.</p>
<h2>TODO</h2>
<ul>
<li>Include Trey's uORFs</li>
<li>Removal RNAs (SLRNA, etc)</li>
</ul>
<div class="row">
<a href='mailto:khughitt@umd.edu'>Keith Hughitt</a> (<time>2015-02-02</time>)
</div>
<h2>Settings</h2>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Number of individual UTR plots to display for each developmental stage and
# UTR side. The purpose of these plots are to provide a sense of the data
# distribution and to visualize the effect of primary # site selection with and
# without smoothing.
max_plots = 10</code></pre>
</div>
<p><a href="README.rmd">view source</a></p>
<h2>Methods</h2>
<h3>Load data</h3>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">library(GenomicRanges)
library(Biostrings)
library(rtracklayer)
library(ggplot2)
library(ggvis)
library(reshape2)
library(dplyr)

# Genome sequence and annotations from TriTrypDB (8.0)
gff = import.gff(file.path('/cbcb/lab/nelsayed/ref_data/lmajor_friedlin',
                           '/annotation/TriTrypDB-8.0_LmajorFriedlin.gff'),
                 version='3')

chromosomes = gff[gff$type == 'chromosome']
genes       = gff[gff$type == 'gene']

# Load unannotated ORFs detected from ribosome profiling data
uorfs = import.gff(gzfile('input/lmajor_orfs_manual_2014-09-08_clean.gff.gz'), version='3')
uorfs$description = 'Unannotated ORF'

# Drop GFF columns not shared betwene TriTrypDB GFF and uORF GFF
keep_cols = intersect(colnames(mcols(genes)), colnames(mcols(uorfs)))
genes = genes[,keep_cols]
uorfs = uorfs[,keep_cols]

genes = append(genes, uorfs)

input_fasta = file.path(Sys.getenv("REF"), 
                        "lmajor_friedlin/genome/TriTrypDB-8.0_LmajorFriedlin_Genome.fasta")
fasta = readDNAStringSet(input_fasta)

# Fix names (L. major chromosome identifiers)
names(fasta) = substring(names(fasta), 0, 7)

# Number of bases flanking CDS to scan for motifs when actual UTR length is not
# known; numbers are based on median 5' and 3' UTR lengths for L. major
default_5utr_width = 250
default_3utr_width = 575

# Taken from L. major UTR analysis output from 2014/11/14
procyclic_polya  = import.gff(gzfile('input/lmajor_procyclic_ncrnas_removed_polya.gff.gz'), version='3')
metacyclic_polya = import.gff(gzfile('input/lmajor_metacyclic_ncrnas_removed_polya.gff.gz'), version='3')
amastigote_polya = import.gff(gzfile('input/lmajor_amastigote_ncrnas_removed_polya.gff.gz'), version='3')
procyclic_sl  = import.gff(gzfile('input/lmajor_procyclic_ncrnas_removed_sl.gff.gz'), version='3')
metacyclic_sl = import.gff(gzfile('input/lmajor_metacyclic_ncrnas_removed_sl.gff.gz'), version='3')
amastigote_sl = import.gff(gzfile('input/lmajor_amastigote_ncrnas_removed_sl.gff.gz'), version='3')

# Filter noncoding RNAs
id_filter_string = 'rRNA|snRNA|snoRNA|SLRNA|TRNA|SRP'
noncoding_ids = genes$ID[grepl(id_filter_string, genes$ID)]

genes            = genes[!genes$ID %in% noncoding_ids]
procyclic_polya  = procyclic_polya[!procyclic_polya$Name %in% noncoding_ids,]
metacyclic_polya = metacyclic_polya[!metacyclic_polya$Name %in% noncoding_ids,]
amastigote_polya = amastigote_polya[!amastigote_polya$Name %in% noncoding_ids,]
procyclic_sl     = procyclic_sl[!procyclic_sl$Name %in% noncoding_ids,]
metacyclic_sl    = metacyclic_sl[!metacyclic_sl$Name %in% noncoding_ids,]
amastigote_sl    = amastigote_sl[!amastigote_sl$Name %in% noncoding_ids,]

# Create output and build directories if needed
for (x in c('build', 'output')) {
    if (!file.exists(x)) {
        dir.create(x)
    }
}</code></pre>
</div>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">#
# find_peak
#
find_peak = function(gene, acceptor_sites, gene_strand, feature_type,
                     smoothed=FALSE, include_plot=FALSE, secondary=FALSE) {
    # Determine orientation of feature relative to CDS
    if ((feature_type == 'sl'    && gene_strand == '+') ||
        (feature_type == 'polya' && gene_strand == '-')) {
        feature_side = 'left'
    } else {
        feature_side = 'right'
    }

    # Create a vector from the furthest SL site to the CDS boundary
    if (feature_side == 'left') {
        cds_boundary = start(gene)
        raw_scores = rep(0, cds_boundary - min(start(acceptor_sites)))
        rel_start =  cds_boundary - start(acceptor_sites) + 1
    } else {
        cds_boundary = end(gene)
        raw_scores = rep(0, max(start(acceptor_sites)) - cds_boundary)
        rel_start = start(acceptor_sites) - cds_boundary + 1
    }

    # add scores at indices where acceptor sites were detected
    raw_scores[rel_start] = score(acceptor_sites)
    x = 1:length(raw_scores)

    # create smooted version of scores (optional)
    if (smoothed) {
        input_scores = ksmooth(x, raw_scores, kernel="normal", bandwidth=6,
                               n.points=length(x))$y
    } else {
        input_scores = raw_scores
    }

    # find acceptor site peak
    if (secondary && (length(acceptor_sites) >= 2)) {
        # secondary site
        j = 2
    } else {
        # primary site
        j = 1
    }
    sorted_scores = sort(input_scores, decreasing=TRUE)
    raw_idx       = which(input_scores == sorted_scores[j])

    if (length(raw_idx) > 1) {
        if (feature_side == 'left') {
            raw_idx = head(raw_idx, 1)
        } else {
            raw_idx = tail(raw_idx, 1)
        }
    }

    # find raw score (number of reads) for the desired primary or secondary
    # site peak
    raw_score = raw_scores[raw_idx]

    # If smoothing was used, the actual location of the smoothed peak may not
    # have any support. In this case we will find the nearest site to the peak
    # with a non-zero score in the raw scores vector
    while(raw_score == 0) {
        # find next highest peak
        j = j + 1 
        raw_idx = which(input_scores == sorted_scores[j])

        # if more than one peak, choose the furthest one from CDS
        if (length(raw_idx) > 1) {
            if (feature_side == 'left') {
                raw_idx = head(raw_idx, 1)
            } else {
                raw_idx = tail(raw_idx, 1)
            }
        }
        raw_score = raw_scores[raw_idx]
    }

    # plot raw and smoothed peaks
    if (include_plot) {
        if (smoothed) {
            df = melt(data.frame(x, raw=raw_scores, smoothed=input_scores),
                      id=c("x"), variable.name='type')
            print(qplot(x, value, data=df, color=type, geom='line') 
                  + ggtitle(gene$ID))
        } else {
            df = melt(data.frame(x, raw=raw_scores), id=c("x"),
                    variable.name='type')
            print(qplot(x, value, data=df, geom='line')
                  + ggtitle(gene$ID))
        }
    }

    # find index of the site with the highest score (or second highest score,
    # in the case of secondary site selection)
    idx = which(score(acceptor_sites) == raw_score)

    # if there is a tie, use the furthest site from CDS
    if (length(idx) > 1) {
        if (feature_side == 'left') {
            idx = head(idx, 1)
        } else {
            idx = tail(idx, 1)
        }
    }
    return(idx)
}

#
# find_primary_site
#
# Determines the primary site among a list of acceptor sites and their
# associated score (number of reads mapped).
#
# Returns a list containing the site and score for the primary site.
#
find_primary_site = function(acceptor_sites, feature, gene, gene_strand, smoothed=FALSE) {
    if (length(acceptor_sites) == 0) {
        return(list(location=NA, num_reads=NA))
    } else if (length(acceptor_sites) == 1) {
        # if only one site found, use it
        return(list(location=start(acceptor_sites)[1],
                    num_reads=score(acceptor_sites)[1]))
    } else {
        # two or more sites

        # find highest smoothed peak which has non-zero coverage in the raw
        # data as well
        max_index = find_peak(gene, acceptor_sites, gene_strand, feature,
                              smoothed=smoothed)
        max_index_smoothed = find_peak(gene, acceptor_sites, gene_strand,
                                       feature, smoothed=TRUE)

        # if the smoothing would result in a different primary UTR choice,
        # plot the raw and smoothed versions of the data
        if (abs(max_index - max_index_smoothed) > 15) {
            num_diff = num_diff + 1
            if (plot_num <= max_plots) {
                sprintf("PLOTTING %d/%d", plot_num, max_plots)
                #tmp = find_peak(gene, acceptor_sites, gene_strand, feature,
                #                smoothed=TRUE, include_plot=TRUE)
                plot_num = plot_num + 1
            }
        }

        return(list(location=start(acceptor_sites)[max_index], 
                    num_reads=score(acceptor_sites)[max_index]))
    }
}

#
# find_secondary_site
#
# Determines the secondary site among a list of acceptor sites and their
# associated score (number of reads mapped).
#
# Returns a list containing the site and score for the secondary site.
#
find_secondary_site = function(acceptor_sites, feature, gene, gene_strand,
                               smoothed=FALSE) {
    if (length(acceptor_sites) == 0) {
        return(list(location=NA, num_reads=NA))
    } else if (length(acceptor_sites) == 1) {
        # if only one site found, use it
        return(list(location=start(acceptor_sites)[1],
                    num_reads=score(acceptor_sites)[1]))
    } else {
        # two or more sites

        # find highest smoothed peak which has non-zero coverage in the raw
        # data as well
        secondary_site_index = find_peak(gene, acceptor_sites, gene_strand,
                                         feature, smoothed=smoothed,
                                         secondary=TRUE)
        secondary_site_index_smoothed = find_peak(gene, acceptor_sites,
                                                  gene_strand, feature,
                                                  smoothed=TRUE,
                                                  secondary=TRUE)

        # if the smoothing would result in a different secondary UTR choice,
        # plot the raw and smoothed versions of the data
        #if (abs(secondary_site_index - secondary_site_index_smoothed) > 15) {
        #    num_diff = num_diff + 1
        #    if (plot_num <= max_plots) {
        #        tmp = find_peak(gene, acceptor_sites, gene_strand, feature,
        #                        smoothed=TRUE, include_plot=TRUE,
        #                        secondary=TRUE)
        #        plot_num = plot_num + 1
        #    }
        #}

        return(list(location=start(acceptor_sites)[secondary_site_index], 
                    num_reads=score(acceptor_sites)[secondary_site_index]))
    }
}

#
# get_utr_sequences
#
# Returns a vector of Biostrings instances containing the UTR sequence for each
# input gene.
#
get_utr_sequences = function(genes, fasta, utr_lengths, default_width,
                             utr5=TRUE) {
    # retrieve utr length if known
    widths = data.frame(
        id=genes$ID,
        width=NA)

    widths$width = utr_lengths[match(genes$ID, utr_lengths$name),]$length
    widths$width[is.na(widths$width)] = default_width

    # get positive and negative strand genes
    if (utr5) { 
        utr = flank(genes, widths$width)
    } else {
        utr = flank(genes, widths$width, start=FALSE)
    }
    pos_strand = utr[as.character(strand(utr)) == "+"]
    neg_strand = utr[as.character(strand(utr)) == "-"]

    # for genes that were assigned the default UTR size, make sure that the                                                                                                               
    # assigned boundaries fall within the chromosome                                                                                                                                      
    start(pos_strand) = pmax(1, start(pos_strand))                                                                                                                                        
    start(neg_strand) = pmax(1, start(neg_strand))                                                                                                                                        

    end(pos_strand) = pmin(end(pos_strand), width(fasta[seqnames(pos_strand)]))                                                                                                           
    end(neg_strand) = pmin(end(neg_strand), width(fasta[seqnames(neg_strand)]))  

    seqs = fasta[pos_strand]
    seqs = append(seqs, reverseComplement(fasta[neg_strand]))
    names(seqs) = c(pos_strand$ID, neg_strand$ID)

    return(seqs)
}

#
# get_num_reads
#
# Returns the number of reads mapped to a given position for the specified
# stage, or 0 if none are found
#
get_num_reads = function(utr_reads, site) {
    if (is.na(site)) {
        return(NA)
    }
    num_reads = score(utr_reads[start(utr_reads) == site])
    return(ifelse(length(num_reads) == 0, 0, num_reads))
}</code></pre>
</div>
<h3>Compute UTR coordinates and features</h3>
<h4>5'UTR</h4>
<h5>5'UTR Length</h5>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Output columns
procyclic_lengths             = c()
procyclic_num_reads           = c()
procyclic_num_reads_primary   = c()
procyclic_num_reads_secondary = c()

metacyclic_lengths             = c()
metacyclic_num_reads           = c()
metacyclic_num_reads_primary   = c()
metacyclic_num_reads_secondary = c()

amastigote_lengths             = c()
amastigote_num_reads           = c()
amastigote_num_reads_primary   = c()
amastigote_num_reads_secondary = c()

combined_lengths             = c()
combined_num_reads           = c()
combined_num_reads_primary   = c()
combined_num_reads_secondary = c()

# Vectors to keep track of stage-specific primary site coverage across
# different stages (e.g. amastigote primary site reads in procyclic samples)
# This will be useful later on when looking for evidence of alternative trans-
# splicing and polyadenylation.
#
# For example, "pro_meta_primary_sl_reads" will contain the number of
# metacyclic reads mapped to the metacyclic primary site.
#
# Note 2015/01/15 -- for now, we will use the simpler "primary to secondary"
# ratios *within* each condition to give a sense of the degree of "dominance"
# for a given primary site.
#
pro_meta_primary_sl_reads = c()
pro_amast_primary_sl_reads = c()
meta_pro_primary_sl_reads = c()
meta_amast_primary_sl_reads = c()
amast_pro_primary_sl_reads = c()
amast_meta_primary_sl_reads = c()

# Start GFF output for combined set of acceptor sites
gff_lines = c("##gff-version\t3",
              "##feature-ontology\tsofa.obo",
              "##attribute-ontology\tgff3_attributes.obo")

# Add chromosome entries
for (i in 1:length(chromosomes)) {
    ch = chromosomes[i]
    gff_lines = append(gff_lines, paste("##sequence-region", ch$Name, 1,
                                        ch$size, sep='\t'))
}

# GFF chromosome entries
#for (i in 1:length(chromosomes)) {
#    ch = chromosomes[i]
#    # ID=LmjF.01;Name=LmjF.01;description=LmjF.01;size=268988;web_id=LmjF.01;
#    # molecule_type=dsDNA;organism_name=Leishmania
#    #  major;translation_table=1;topology=linear;localization=nuclear;
#    # Dbxref=ApiDB:LmjF.01,taxon:347515

#    descr_template = paste0(
#        "ID=%s;Name=%s;description=%s;size=%s;web_id=%s;",
#        "molecule_type=%s;organism_name=%s;translation_table=%s",
#        "topology=%s;localization=%s;Dbxref=%s")
        
#    descr = sprintf(descr_template,
#                    ch$ID, ch$Name, ch$description, ch$size, ch$web_id,
#                    ch$molecule_type, ch$organism_name, ch$translation_table,
#                    ch$topology, ch$localization, 
    
#                    paste(ch$Dbxref[[1]][1], ch$Dbxref[[1]][2], sep=','))
#    gff_lines = append(gff_lines, paste(ch$Name, "TriTrypDB", "chromosome", 1,
#                                        ch$size, ".", "+", ".", descr,
#                                        sep='\t'))
#}

# keep track of the number of plots created
num_diff = 0
plot_num = 0

# Add gene entries
i = 1
for (gene_id in genes$ID) {
    message(sprintf("Processing SL sites for gene %d/%d", i, length(genes)))
    gene = genes[genes$ID == gene_id]

    gene_strand = as.character(strand(gene)) 
    # get all of the SL acceptor sites as a GRanges object
    metacyclic_utr5 = metacyclic_sl[metacyclic_sl$Name == gene_id]
    procyclic_utr5  = procyclic_sl[procyclic_sl$Name == gene_id]
    amastigote_utr5  = amastigote_sl[amastigote_sl$Name == gene_id]
    combined_utr5   = metacyclic_sl[metacyclic_sl$Name == gene_id]

    # total number of reads found which contain an acceptor site
    metacyclic_num_reads = append(metacyclic_num_reads, 
                                  sum(metacyclic_utr5$score))
    procyclic_num_reads  = append(procyclic_num_reads,
                                  sum(procyclic_utr5$score))
    amastigote_num_reads  = append(amastigote_num_reads,
                                  sum(amastigote_utr5$score))
    combined_num_reads   = append(combined_num_reads,
                                  sum(procyclic_utr5$score) +
                                  sum(metacyclic_utr5$score) +
                                  sum(amastigote_utr5$score))
    
    # Combined output

    # Add procyclic reads
    if (length(procyclic_utr5) > 0) {
        for (j in 1:length(procyclic_utr5)) {
            # if new site, add a new entry
            entry = procyclic_utr5[j]
            if(!start(entry) %in% start(combined_utr5)) {
                combined_utr5 = c(combined_utr5, entry)
            }
            # otherwise add procyclic score to existing metacyclic score
            else {
                score(combined_utr5[start(combined_utr5) == start(entry)]) = (
                    score(combined_utr5[start(combined_utr5) == start(entry)]) +
                    score(entry))
            }
        }
    }
    # Add amastigote reads
    if (length(amastigote_utr5) > 0) {
        for (j in 1:length(amastigote_utr5)) {
            # if new site, add a new entry
            entry = amastigote_utr5[j]
            if(!start(entry) %in% start(combined_utr5)) {
                combined_utr5 = c(combined_utr5, entry)
            }
            # otherwise add amastigote score to existing metacyclic score
            else {
                score(combined_utr5[start(combined_utr5) == start(entry)]) = (
                    score(combined_utr5[start(combined_utr5) == start(entry)]) +
                    score(entry))
            }
        }
    }
    i = i + 1

    # Determine length and scores for procyclic, metacyclic, amastigote, 
    # and combined outputs
    pro_primary_site      = find_primary_site(procyclic_utr5, 'sl', gene, gene_strand)
    meta_primary_site     = find_primary_site(metacyclic_utr5, 'sl', gene, gene_strand)
    amast_primary_site    = find_primary_site(amastigote_utr5, 'sl', gene, gene_strand)
    combined_primary_site = find_primary_site(combined_utr5, 'sl', gene, gene_strand)

    # secondary site
    pro_secondary_site      = find_secondary_site(procyclic_utr5, 'sl', gene, gene_strand)
    meta_secondary_site     = find_secondary_site(metacyclic_utr5, 'sl', gene, gene_strand)
    amast_secondary_site    = find_secondary_site(amastigote_utr5, 'sl', gene, gene_strand)
    combined_secondary_site = find_secondary_site(combined_utr5, 'sl', gene, gene_strand)

    # compute 5'utr length and coordinates
    if (gene_strand == '+') {
        # procylic + strand
        procyclic_utr5_length = start(gene) - pro_primary_site$location

        # metacyclic + strand
        metacyclic_utr5_length = start(gene) - meta_primary_site$location

        # amastigote + strand
        amastigote_utr5_length = start(gene) - amast_primary_site$location

        # combined + strand
        combined_utr5_start  = combined_primary_site$location + 1
        combined_utr5_end    = start(gene) - 1
        combined_utr5_length = start(gene) - combined_primary_site$location
    } else {
        # procyclic - strand
        procyclic_utr5_length = pro_primary_site$location - end(gene)

        # metacyclic - strand
        metacyclic_utr5_length = meta_primary_site$location - end(gene)

        # procyclic - strand
        amastigote_utr5_length = amast_primary_site$location - end(gene)
        
        # combined - strand
        combined_utr5_start  = end(gene) + 1
        combined_utr5_end    = combined_primary_site$location - 1
        combined_utr5_length = combined_primary_site$location - end(gene)
    }

    # Add primary site read count and UTR length
    procyclic_lengths  = append(procyclic_lengths,  procyclic_utr5_length)
    metacyclic_lengths = append(metacyclic_lengths, metacyclic_utr5_length)
    amastigote_lengths = append(amastigote_lengths, amastigote_utr5_length)
    combined_lengths   = append(combined_lengths,   combined_utr5_length)

    procyclic_num_reads_primary  = append(procyclic_num_reads_primary,
                                          pro_primary_site$num_reads)
    metacyclic_num_reads_primary = append(metacyclic_num_reads_primary,
                                          meta_primary_site$num_reads)
    amastigote_num_reads_primary = append(amastigote_num_reads_primary,
                                          amast_primary_site$num_reads)
    combined_num_reads_primary   = append(combined_num_reads_primary,
                                          combined_primary_site$num_reads)

    procyclic_num_reads_secondary  = append(procyclic_num_reads_secondary,
                                            pro_secondary_site$num_reads)
    metacyclic_num_reads_secondary = append(metacyclic_num_reads_secondary,
                                            meta_secondary_site$num_reads)
    amastigote_num_reads_secondary = append(amastigote_num_reads_secondary,
                                            amast_secondary_site$num_reads)
    combined_num_reads_secondary   = append(combined_num_reads_secondary,
                                            combined_secondary_site$num_reads)

    # Update counts for cross-stage primary sites
    pro_meta_primary_sl_reads = append(pro_meta_primary_sl_reads,
                                 get_num_reads(procyclic_utr5, meta_primary_site$location))
    pro_amast_primary_sl_reads = append(pro_amast_primary_sl_reads,
                                 get_num_reads(procyclic_utr5, amast_primary_site$location))
    meta_pro_primary_sl_reads = append(meta_pro_primary_sl_reads,
                                 get_num_reads(metacyclic_utr5, pro_primary_site$location))
    meta_amast_primary_sl_reads = append(meta_amast_primary_sl_reads,
                                 get_num_reads(metacyclic_utr5, amast_primary_site$location))
    amast_pro_primary_sl_reads = append(amast_pro_primary_sl_reads,
                                 get_num_reads(amastigote_utr5, pro_primary_site$location))
    amast_meta_primary_sl_reads = append(amast_meta_primary_sl_reads,
                                 get_num_reads(amastigote_utr5, meta_primary_site$location))

    # Add GFF entry
    descr = sprintf("ID=%s_5utr;Name=%s;description=%s", gene$ID, gene$ID,
                    gene$description)

    gff_entry = paste(
        seqnames(gene),
        "El-Sayed",
        "five_prime_UTR",
        combined_utr5_start,
        combined_utr5_end,
        combined_primary_site$num_reads,
        strand(gene),
        '.',
        descr, sep='\t')

    gff_lines = append(gff_lines, gff_entry)
}

# metacyclic
metacyclic_utr5_df = data.frame(
    name=genes$ID,
    length=metacyclic_lengths,
    num_reads=metacyclic_num_reads,
    num_reads_primary=metacyclic_num_reads_primary,
    num_reads_secondary=metacyclic_num_reads_secondary
)

# procyclic
procyclic_utr5_df = data.frame(
    name=genes$ID,
    length=procyclic_lengths,
    num_reads=procyclic_num_reads,
    num_reads_primary=procyclic_num_reads_primary,
    num_reads_secondary=procyclic_num_reads_secondary
)

# amastigote
amastigote_utr5_df = data.frame(
    name=genes$ID,
    length=amastigote_lengths,
    num_reads=amastigote_num_reads,
    num_reads_primary=amastigote_num_reads_primary,
    num_reads_secondary=amastigote_num_reads_secondary
)

# combined
combined_utr5_df = data.frame(
    name=genes$ID,
    length=combined_lengths,
    num_reads=combined_num_reads,
    num_reads_primary=combined_num_reads_primary,
    num_reads_secondary=combined_num_reads_secondary
)

# Create cross-stage site usage dataframes
# For example, "pro_other_sl_sites"  lists the number of procyclic reads 
# which mapped to the primary sites detected for other stages (amast and meta).
pro_other_sl_sites = tbl_df(data.frame(
   gene=genes$ID,
   metacyclic=pro_meta_primary_sl_reads,
   amastigote=pro_amast_primary_sl_reads
))

meta_other_sl_sites = tbl_df(data.frame(
   gene=genes$ID,
   procyclic=meta_pro_primary_sl_reads,
   amastigote=meta_amast_primary_sl_reads
))

amast_other_sl_sites = tbl_df(data.frame(
   gene=genes$ID,
   procyclic=amast_pro_primary_sl_reads,
   metacyclic=amast_meta_primary_sl_reads
))</code></pre>
</div>
<h5>5'UTR Composition</h5>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Metacyclic
metacyclic_utr5_sequences = get_utr_sequences(genes, fasta, metacyclic_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(metacyclic_utr5_sequences)[,1:4] 
metacyclic_utr5_features_df = data.frame(
    name=names(metacyclic_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
metacyclic_utr5_df = merge(metacyclic_utr5_df, metacyclic_utr5_features_df,
                           by='name')

# Procyclic
procyclic_utr5_sequences = get_utr_sequences(genes, fasta, procyclic_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(procyclic_utr5_sequences)[,1:4] 
procyclic_utr5_features_df = data.frame(
    name=names(procyclic_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
procyclic_utr5_df = merge(procyclic_utr5_df, procyclic_utr5_features_df,
                          by='name')

# amastigote
amastigote_utr5_sequences = get_utr_sequences(genes, fasta, amastigote_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(amastigote_utr5_sequences)[,1:4] 
amastigote_utr5_features_df = data.frame(
    name=names(amastigote_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
amastigote_utr5_df = merge(amastigote_utr5_df, amastigote_utr5_features_df,
                           by='name')

# Combined
combined_utr5_sequences = get_utr_sequences(genes, fasta, combined_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(combined_utr5_sequences)[,1:4] 
combined_utr5_features_df = data.frame(
    name=names(combined_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
combined_utr5_df = merge(combined_utr5_df, combined_utr5_features_df,
                         by='name')</code></pre>
</div>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Write results
write.csv(metacyclic_utr5_df, file='output/lmajor_metacyclic_5utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(procyclic_utr5_df, file='output/lmajor_procyclic_5utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(amastigote_utr5_df, file='output/lmajor_amastigote_5utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(combined_utr5_df, file='output/lmajor_combined_5utr_lengths.csv',
          quote=FALSE, row.names=FALSE)

# Write result to GFF
fp = file("output/lmajor_5utr.gff")
writeLines(gff_lines, fp)
close(fp)</code></pre>
</div>
<h4>3'UTR</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Output columns
procyclic_lengths           = c()
procyclic_num_reads         = c()
procyclic_num_reads_primary = c()
procyclic_num_reads_secondary = c()

metacyclic_lengths           = c()
metacyclic_num_reads         = c()
metacyclic_num_reads_primary = c()
metacyclic_num_reads_secondary = c()

amastigote_lengths           = c()
amastigote_num_reads         = c()
amastigote_num_reads_primary = c()
amastigote_num_reads_secondary = c()

combined_lengths           = c()
combined_num_reads         = c()
combined_num_reads_primary = c()
combined_num_reads_secondary = c()

# Vectors to keep track of stage-specific primary site coverage across
# different stages (e.g. amastigote primary site reads in procyclic samples)
pro_meta_primary_polya_reads = c()
pro_amast_primary_polya_reads = c()
meta_pro_primary_polya_reads = c()
meta_amast_primary_polya_reads = c()
amast_pro_primary_polya_reads = c()
amast_meta_primary_polya_reads = c()

# Start GFF output for combined set of acceptor sites
gff_lines = c("##gff-version\t3",
             "##feature-ontology\tsofa.obo",
             "##attribute-ontology\tgff3_attributes.obo")

# Add chromosome entries
for (i in 1:length(chromosomes)) {
    ch = chromosomes[i]
    gff_lines = append(gff_lines, paste("##sequence-region", ch$Name, 1,
                                        ch$size, sep='\t'))
}

# GFF chromosome entries
#for (i in 1:length(chromosomes)) {
#    ch = chromosomes[i]
#    # ID=LmjF.01;Name=LmjF.01;description=LmjF.01;size=268988;web_id=LmjF.01;
#    # molecule_type=dsDNA;organism_name=Leishmania
#    #  major;translation_table=1;topology=linear;localization=nuclear;
#    # Dbxref=ApiDB:LmjF.01,taxon:347515

#    descr_template = paste0(
#        "ID=%s;Name=%s;description=%s;size=%s;web_id=%s;",
#        "molecule_type=%s;organism_name=%s;translation_table=%s",
#        "topology=%s;localization=%s;Dbxref=%s")
        
#    descr = sprintf(descr_template,
#                    ch$ID, ch$Name, ch$description, ch$size, ch$web_id,
#                    ch$molecule_type, ch$organism_name, ch$translation_table,
#                    ch$topology, ch$localization, 
    
#                    paste(ch$Dbxref[[1]][1], ch$Dbxref[[1]][2], sep=','))
#    gff_lines = append(gff_lines, paste(ch$Name, "TriTrypDB", "chromosome", 1,
#                                        ch$size, ".", "+", ".", descr,
#                                        sep='\t'))
#}

# keep track of the number of plots created
num_diff = 0
plot_num = 0

# Add gene entries
i = 1
for (gene_id in genes$ID) {
    message(sprintf("Processing Poly(A) sites for gene %d/%d", i, length(genes)))
    gene = genes[genes$ID == gene_id]

    gene_strand = as.character(strand(gene)) 

    # get all of the Poly(A) acceptor sites as a GRanges object
    metacyclic_utr3 = metacyclic_polya[metacyclic_polya$Name == gene_id]
    procyclic_utr3  = procyclic_polya[procyclic_polya$Name == gene_id]
    amastigote_utr3 = amastigote_polya[amastigote_polya$Name == gene_id]
    combined_utr3   = metacyclic_polya[metacyclic_polya$Name == gene_id]

    # total number of reads found which contain an acceptor site
    metacyclic_num_reads = append(metacyclic_num_reads, 
                                  sum(metacyclic_utr3$score))
    procyclic_num_reads  = append(procyclic_num_reads,
                                  sum(procyclic_utr3$score))
    amastigote_num_reads  = append(amastigote_num_reads,
                                  sum(amastigote_utr3$score))
    combined_num_reads   = append(combined_num_reads,
                                  sum(procyclic_utr3$score) +
                                  sum(metacyclic_utr3$score) +
                                  sum(amastigote_utr3$score))
    
    # Combined output
    if (length(procyclic_utr3) > 0) {
        for (j in 1:length(procyclic_utr3)) {
            # if new site, add a new entry
            entry = procyclic_utr3[j]
            if(!start(entry) %in% start(combined_utr3)) {
                combined_utr3 = c(combined_utr3, entry)
            }
            # otherwise add procyclic score to existing metacyclic score
            else {
                score(combined_utr3[start(combined_utr3) == start(entry)]) = (
                    score(combined_utr3[start(combined_utr3) == start(entry)]) +
                    score(entry))
            }
        }
    }
    if (length(amastigote_utr3) > 0) {
        for (j in 1:length(amastigote_utr3)) {
            # if new site, add a new entry
            entry = amastigote_utr3[j]
            if(!start(entry) %in% start(combined_utr3)) {
                combined_utr3 = c(combined_utr3, entry)
            }
            # otherwise add amastigote score to existing metacyclic score
            else {
                score(combined_utr3[start(combined_utr3) == start(entry)]) = (
                    score(combined_utr3[start(combined_utr3) == start(entry)]) +
                    score(entry))
            }
        }
    }
    i = i + 1

    # Determine length and scores for procyclic, metacyclic, and combined
    # outputs
    pro_primary_site      = find_primary_site(procyclic_utr3, 'polya', gene, gene_strand)
    meta_primary_site     = find_primary_site(metacyclic_utr3, 'polya', gene, gene_strand)
    amast_primary_site    = find_primary_site(amastigote_utr3, 'polya', gene, gene_strand)
    combined_primary_site = find_primary_site(combined_utr3, 'polya', gene, gene_strand)

    # secondary sites
    pro_secondary_site      = find_secondary_site(procyclic_utr3, 'polya', gene, gene_strand)
    meta_secondary_site     = find_secondary_site(metacyclic_utr3, 'polya', gene, gene_strand)
    amast_secondary_site    = find_secondary_site(amastigote_utr3, 'polya', gene, gene_strand)
    combined_secondary_site = find_secondary_site(combined_utr3, 'polya', gene, gene_strand)

    # compute 3'utr length and coordinates
    if (gene_strand == '+') {
        # procylic + strand
        procyclic_utr3_length = pro_primary_site$location - end(gene)

        # metacyclic + strand
        metacyclic_utr3_length = meta_primary_site$location - end(gene)
        
        # amastigote + strand
        amastigote_utr3_length = amast_primary_site$location - end(gene)

        # combined + strand
        combined_utr3_start  = end(gene) + 1
        combined_utr3_end    = combined_primary_site$location - 1
        combined_utr3_length = combined_primary_site$location - end(gene)
    } else {
        # procyclic - strand
        procyclic_utr3_length = start(gene) - pro_primary_site$location

        # metacyclic - strand
        metacyclic_utr3_length = start(gene) - meta_primary_site$location

        # amastigote - strand
        amastigote_utr3_length = start(gene) - amast_primary_site$location
        
        # combined - strand
        combined_utr3_start  = combined_primary_site$location + 1
        combined_utr3_end    = start(gene) - 1
        combined_utr3_length = start(gene) - combined_primary_site$location
    }

    # Add primary site read count and UTR length
    procyclic_lengths  = append(procyclic_lengths,  procyclic_utr3_length)
    metacyclic_lengths = append(metacyclic_lengths, metacyclic_utr3_length)
    amastigote_lengths = append(amastigote_lengths, amastigote_utr3_length)
    combined_lengths   = append(combined_lengths,   combined_utr3_length)

    procyclic_num_reads_primary  = append(procyclic_num_reads_primary,
                                          pro_primary_site$num_reads)
    metacyclic_num_reads_primary = append(metacyclic_num_reads_primary,
                                          meta_primary_site$num_reads)
    amastigote_num_reads_primary = append(amastigote_num_reads_primary,
                                          amast_primary_site$num_reads)
    combined_num_reads_primary   = append(combined_num_reads_primary,
                                          combined_primary_site$num_reads)

    procyclic_num_reads_secondary  = append(procyclic_num_reads_secondary,
                                          pro_secondary_site$num_reads)
    metacyclic_num_reads_secondary = append(metacyclic_num_reads_secondary,
                                          meta_secondary_site$num_reads)
    amastigote_num_reads_secondary = append(amastigote_num_reads_secondary,
                                          amast_secondary_site$num_reads)
    combined_num_reads_secondary   = append(combined_num_reads_secondary,
                                          combined_secondary_site$num_reads)

    # Update counts for cross-stage primary sites
    pro_meta_primary_polya_reads = append(pro_meta_primary_polya_reads,
                                 get_num_reads(procyclic_utr5, meta_primary_site$location))
    pro_amast_primary_polya_reads = append(pro_amast_primary_polya_reads,
                                 get_num_reads(procyclic_utr5, amast_primary_site$location))
    meta_pro_primary_polya_reads = append(meta_pro_primary_polya_reads,
                                 get_num_reads(metacyclic_utr5, pro_primary_site$location))
    meta_amast_primary_polya_reads = append(meta_amast_primary_polya_reads,
                                 get_num_reads(metacyclic_utr5, amast_primary_site$location))
    amast_pro_primary_polya_reads = append(amast_pro_primary_polya_reads,
                                 get_num_reads(amastigote_utr5, pro_primary_site$location))
    amast_meta_primary_polya_reads = append(amast_meta_primary_polya_reads,
                                 get_num_reads(amastigote_utr5, meta_primary_site$location))

    # Add GFF entry
    descr = sprintf("ID=%s_3utr;Name=%s;description=%s", gene$ID, gene$ID,
                    gene$description)

    gff_entry = paste(
        seqnames(gene),
        "El-Sayed",
        "three_prime_UTR",
        combined_utr3_start,
        combined_utr3_end,
        combined_primary_site$num_reads,
        strand(gene),
        '.',
        descr, sep='\t')

    gff_lines = append(gff_lines, gff_entry)
}

# metacyclic
metacyclic_utr3_df = data.frame(
    name=genes$ID,
    length=metacyclic_lengths,
    num_reads=metacyclic_num_reads,
    num_reads_primary=metacyclic_num_reads_primary,
    num_reads_secondary=metacyclic_num_reads_secondary
)

# procyclic
procyclic_utr3_df = data.frame(
    name=genes$ID,
    length=procyclic_lengths,
    num_reads=procyclic_num_reads,
    num_reads_primary=procyclic_num_reads_primary,
    num_reads_secondary=procyclic_num_reads_secondary
)

# amastigote
amastigote_utr3_df = data.frame(
    name=genes$ID,
    length=amastigote_lengths,
    num_reads=amastigote_num_reads,
    num_reads_primary=amastigote_num_reads_primary,
    num_reads_secondary=amastigote_num_reads_secondary
)

# combined
combined_utr3_df = data.frame(
    name=genes$ID,
    length=combined_lengths,
    num_reads=combined_num_reads,
    num_reads_primary=combined_num_reads_primary,
    num_reads_secondary=combined_num_reads_secondary
)

# Create cross-stage site usage dataframes
pro_other_polya_sites = tbl_df(data.frame(
   gene=genes$ID,
   metacyclic=pro_meta_primary_polya_reads,
   amastigote=pro_amast_primary_polya_reads
))

meta_other_polya_sites = tbl_df(data.frame(
   gene=genes$ID,
   procyclic=meta_pro_primary_polya_reads,
   amastigote=meta_amast_primary_polya_reads
))

amast_other_polya_sites = tbl_df(data.frame(
   gene=genes$ID,
   procyclic=amast_pro_primary_polya_reads,
   metacyclic=amast_meta_primary_polya_reads
))</code></pre>
</div>
<h5>3'UTR Composition</h5>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Metacyclic
metacyclic_utr3_sequences = get_utr_sequences(genes, fasta, metacyclic_utr3_df, 
                                              default_3utr_width, utr5=FALSE)
freqs = alphabetFrequency(metacyclic_utr3_sequences)[,1:4] 
metacyclic_utr3_features_df = data.frame(
    name=names(metacyclic_utr3_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
metacyclic_utr3_df = merge(metacyclic_utr3_df, metacyclic_utr3_features_df,
                           by='name')

# Procyclic
procyclic_utr3_sequences = get_utr_sequences(genes, fasta, procyclic_utr3_df, 
                                              default_3utr_width, utr5=FALSE)
freqs = alphabetFrequency(procyclic_utr3_sequences)[,1:4] 
procyclic_utr3_features_df = data.frame(
    name=names(procyclic_utr3_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
procyclic_utr3_df = merge(procyclic_utr3_df, procyclic_utr3_features_df,
                          by='name')

# amastigote
amastigote_utr3_sequences = get_utr_sequences(genes, fasta, amastigote_utr3_df, 
                                              default_3utr_width, utr5=FALSE)
freqs = alphabetFrequency(amastigote_utr3_sequences)[,1:4] 
amastigote_utr3_features_df = data.frame(
    name=names(amastigote_utr3_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
amastigote_utr3_df = merge(amastigote_utr3_df, amastigote_utr3_features_df,
                          by='name')
# Combined
combined_utr3_sequences = get_utr_sequences(genes, fasta, combined_utr3_df, 
                                              default_3utr_width, utr5=FALSE)
freqs = alphabetFrequency(combined_utr3_sequences)[,1:4] 
combined_utr3_features_df = data.frame(
    name=names(combined_utr3_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
combined_utr3_df = merge(combined_utr3_df, combined_utr3_features_df,
                         by='name')</code></pre>
</div>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># Write results
write.csv(metacyclic_utr3_df, file='output/lmajor_metacyclic_3utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(procyclic_utr3_df, file='output/lmajor_procyclic_3utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(amastigote_utr3_df, file='output/lmajor_amastigote_3utr_lengths.csv',
          quote=FALSE, row.names=FALSE)
write.csv(combined_utr3_df, file='output/lmajor_combined_3utr_lengths.csv',
          quote=FALSE, row.names=FALSE)

# Write result to GFF
fp = file("output/lmajor_3utr.gff")
writeLines(gff_lines, fp)
close(fp)</code></pre>
</div>
<h2>Results</h2>
<h3>Length statistics</h3>
<h4>5'UTR length statistics</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">lengths = combined_utr5_df$length[!is.na(combined_utr5_df$length)]

coverage = sum(!is.na(combined_utr5_df$length)) / nrow(combined_utr5_df)
print(sprintf("%% 5'UTRs detected: %0.2f", coverage))</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "% 5'UTRs detected: 0.95"
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">quantile(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">##   0%  25%  50%  75% 100% 
##    1  114  234  501 6018
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">mean(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] 447.6343
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">median(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] 234
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">mode(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "numeric"
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">hist(lengths)</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
</div>
<h4>3'UTR length statistics</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">lengths = combined_utr3_df$length[!is.na(combined_utr3_df$length)]

coverage = sum(!is.na(combined_utr3_df$length)) / nrow(combined_utr3_df)
print(sprintf("%% 3'UTRs detected: %0.2f", coverage))</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "% 3'UTRs detected: 0.94"
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">quantile(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">##   0%  25%  50%  75% 100% 
##    1  268  529 1078 6722
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">mean(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] 843.9933
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">median(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] 529
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">mode(lengths)</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "numeric"
</code></pre>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">hist(lengths)</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
</div>
<h3>Alternative trans-splicing</h3>
<h4>Metacyclic vs. Procyclic</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># rescale to range 0-1
# if trimmed=TRUE, outliers will be trimmed first to reduce their influence on
# the resulting colorscale
rescale = function (x, trimmed=TRUE, trim_quantile=0.99) {
    if (trimmed) {
        lower_bound = quantile(x, 1 - trim_quantile)
        upper_bound = quantile(x, trim_quantile)
   
        x = pmax(pmin(x, upper_bound), lower_bound)

        return(pmax(0, ((x-min(x)) / (max(x) - min(x)))))
    }
    # no trimming
    return(pmax(0, ((x-min(x)) / (max(x) - min(x)))))
}

# metacyclic vs. procyclic
# For the purposes of comparing usage ratios, we will treat non-covered sites
# as having one read mapped
meta_pro_5utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr5_df$name,
    metacyclic_len=metacyclic_utr5_df$length,
    procyclic_len=procyclic_utr5_df$length,
    average_primary_site_num_reads=(metacyclic_utr5_df$num_reads_primary +
                                    procyclic_utr5_df$num_reads_primary) / 2,
    average_ptos_ratio=((metacyclic_utr5_df$num_reads_primary /
                         metacyclic_utr5_df$num_reads_secondary) + 
                        (procyclic_utr5_df$num_reads_primary /
                         procyclic_utr5_df$num_reads_secondary)) / 2,
    count_average=(metacyclic_utr5_df$num_reads_primary +
                   procyclic_utr5_df$num_reads_primary) / 2,
    meta_pro_ratio_meta_samples=(metacyclic_utr5_df$num_reads_primary /
                           pmax(meta_other_sl_sites$procyclic, 1)),
    pro_meta_ratio_pro_samples=(procyclic_utr5_df$num_reads_primary / 
                          pmax(pro_other_sl_sites$metacyclic, 1))
))

# version with reads mapped for stages of interest
meta_pro_5utr_comparison_complete = meta_pro_5utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(procyclic_len)) %>%
    mutate(
        log_average_ratio=log2(0.5 * (meta_pro_ratio_meta_samples + pro_meta_ratio_pro_samples)),
        length_diff=abs(metacyclic_len - procyclic_len), 
        log_average_reads=log2(average_primary_site_num_reads),
        log_average_ptos=log2(average_ptos_ratio),
        log_length_diff=log2(length_diff)
    )

ggplot(meta_pro_5utr_comparison_complete, 
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=average_primary_site_num_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666', lwd=0.5) +
    geom_abline(slope=1, intercept=-300, color='#666666', lwd=0.5) +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 
    #scale_size(trans='log')

# sites with switching only
ggplot(meta_pro_5utr_comparison_complete %>% filter(metacyclic_len != procyclic_len),
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666', lwd=0.5) +
    geom_abline(slope=1, intercept=-300, color='#666666', lwd=0.5) +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># print top 25 genes with strongest site switching behavior
meta_pro_5utr_diff = meta_pro_5utr_comparison_complete %>% 
    filter(count_average & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_pro_5utr_diff  %>% arrange(-score), 25))

print(sprintf("Number of alternatively trans-spliced genes (meta vs. pro): %d",
              nrow(meta_pro_5utr_diff)))</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "Number of alternatively trans-spliced genes (meta vs. pro): 3035"
</code></pre>
</div>
<h4>Metacyclic vs. Amastigote</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># metacyclic vs. amastigote
meta_amast_5utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr5_df$name,
    metacyclic_len=metacyclic_utr5_df$length,
    amastigote_len=amastigote_utr5_df$length,
    average_primary_site_num_reads=abs(metacyclic_utr5_df$num_reads_primary -
                   amastigote_utr5_df$num_reads_primary),
    average_ptos_ratio=((metacyclic_utr5_df$num_reads_primary /
                         metacyclic_utr5_df$num_reads_secondary) + 
                        (amastigote_utr5_df$num_reads_primary /
                         amastigote_utr5_df$num_reads_secondary)) / 2,
    meta_amast_ratio_meta_samples=(metacyclic_utr5_df$num_reads_primary /
                           pmax(meta_other_sl_sites$amastigote, 1)),
    amast_meta_ratio_amast_samples=(amastigote_utr5_df$num_reads_primary / 
                          pmax(amast_other_sl_sites$metacyclic, 1))
)) %>% mutate(
    length_diff=abs(metacyclic_len - amastigote_len), 
    log_average_reads=log1p(average_primary_site_num_reads),
    log_average_ptos=log1p(average_ptos_ratio),
    log_length_diff=log1p(length_diff)
)

# version with reads mapped for stages of interest
meta_amast_5utr_comparison_complete = meta_amast_5utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(0.5 * (meta_amast_ratio_meta_samples +
                                          amast_meta_ratio_amast_samples)))

#meta_amast_5utr_comparison_complete %>% 
#    ggvis(~metacyclic_len, ~amastigote_len, fill=~log_average_reads) %>% 
#    layer_points() %>% 
#    add_tooltip(function(df) df$name)
ggplot(meta_amast_5utr_comparison_complete, 
       aes(metacyclic_len, amastigote_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") + 
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# print top 25 genes with strongest site switching behavior
meta_amast_5utr_diff = meta_amast_5utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_amast_5utr_diff  %>% arrange(-score), 25))
print(sprintf("Number of alternatively trans-spliced genes (meta vs. amast): %d",
              nrow(meta_amast_5utr_diff)))</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "Number of alternatively trans-spliced genes (meta vs. amast): 105"
</code></pre>
</div>
<h4>Procyclic vs. Amastigote</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># procyclic vs. amastigote
pro_amast_5utr_comparison = tbl_df(data.frame(
    name=procyclic_utr5_df$name,
    procyclic_len=procyclic_utr5_df$length,
    amastigote_len=amastigote_utr5_df$length,
    average_primary_site_num_reads=abs(procyclic_utr5_df$num_reads_primary -
                   amastigote_utr5_df$num_reads_primary),
    average_ptos_ratio=((procyclic_utr5_df$num_reads_primary /
                         procyclic_utr5_df$num_reads_secondary) + 
                        (amastigote_utr5_df$num_reads_primary /
                         amastigote_utr5_df$num_reads_secondary)) / 2,
    pro_amast_ratio_pro_samples=(procyclic_utr5_df$num_reads_primary /
                          pmax(pro_other_sl_sites$amastigote, 1)),
    amast_pro_ratio_amast_samples=(amastigote_utr5_df$num_reads_primary / 
                            pmax(amast_other_sl_sites$procyclic, 1))
)) %>% mutate(
    length_diff=abs(procyclic_len - amastigote_len), 
    log_average_reads=log1p(average_primary_site_num_reads),
    log_average_ptos=log1p(average_ptos_ratio),
    log_length_diff=log1p(length_diff)
)

# version with reads mapped for stages of interest
pro_amast_5utr_comparison_complete = pro_amast_5utr_comparison %>%
    filter(!is.na(procyclic_len) & !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(0.5 * (pro_amast_ratio_pro_samples +
                                          amast_pro_ratio_amast_samples)))

ggplot(pro_amast_5utr_comparison_complete, 
       aes(procyclic_len, amastigote_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") + 
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="" /> </a>
</div>
</div>
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r">    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# print top 25 genes with strongest site switching behavior
pro_amast_5utr_diff = pro_amast_5utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(pro_amast_5utr_diff  %>% arrange(-score), 25))
print(sprintf("Number of alternatively trans-spliced genes (pro vs. amast): %d",
              nrow(pro_amast_5utr_diff)))</code></pre>
<button class="output R toggle btn btn-xs btn-success">
<span class="glyphicon glyphicon-chevron-down"></span> R output
</button>
<pre style=""><code class="output r">## [1] "Number of alternatively trans-spliced genes (pro vs. amast): 90"
</code></pre>
</div>
<h3>Alternative poly-adenylation</h3>
<h4>Metacyclic vs. Procyclic</h4>
<div class="row">
<button class="source R toggle btn btn-xs btn-primary">
<span class="glyphicon glyphicon-chevron-down"></span> R source
</button>
<pre style=""><code class="source r"># metacyclic vs. procyclic
# For the purposes of comparing usage ratios, we will treat non-covered sites
# as having one read mapped
meta_pro_3utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr3_df$name,
    metacyclic_len=metacyclic_utr3_df$length,
    procyclic_len=procyclic_utr3_df$length,
    count_average=(metacyclic_utr3_df$num_reads_primary +
                   procyclic_utr3_df$num_reads_primary) / 2,
    average_ptos_ratio=((metacyclic_utr3_df$num_reads_primary /
                         metacyclic_utr3_df$num_reads_secondary) + 
                        (procyclic_utr3_df$num_reads_primary /
                         procyclic_utr3_df$num_reads_secondary)) / 2,
    average_primary_site_num_reads=abs(metacyclic_utr3_df$num_reads_primary -
                   procyclic_utr3_df$num_reads_primary),
    meta_pro_ratio_meta_samples=(metacyclic_utr3_df$num_reads_primary /
                           pmax(meta_other_polya_sites$procyclic, 1)),
    pro_meta_ratio_pro_samples=(procyclic_utr3_df$num_reads_primary / 
                          pmax(pro_other_polya_sites$metacyclic, 1))
))

# version with reads mapped for stages of interest
meta_pro_3utr_comparison_complete = meta_pro_3utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(procyclic_len)) %>%
    mutate(
        log_average_ratio=log2(0.5 * (meta_pro_ratio_meta_samples + pro_meta_ratio_pro_samples)),
        length_diff=abs(metacyclic_len - procyclic_len), 
        log_average_reads=log2(average_primary_site_num_reads),
        log_average_ptos=log2(average_ptos_ratio),
        log_length_diff=log2(length_diff)
    )

ggplot(meta_pro_3utr_comparison_complete, 
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=average_primary_site_num_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))</code></pre>
<div class="row">
<div class="col-md-offset-3 col-md-6">
<a href="#" class="thumbnail"><img src="

About

Analysis comparing the use of alternative trans-splicing and alternative-polyadenylation sites across developmental stages in Leishmania major

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages