# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Burrows-Wheeler-Transformation-problem" data-toc-modified-id="Burrows-Wheeler-Transformation-problem-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Burrows-Wheeler Transformation problem</a></div><div class="lev2 toc-item"><a href="#Check-the-functions-work-as-implemented:" data-toc-modified-id="Check-the-functions-work-as-implemented:-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Check the functions work as implemented:</a></div><div class="lev2 toc-item"><a href="#Invert-the-BT-transformed-string-given-in-the-homework:" data-toc-modified-id="Invert-the-BT-transformed-string-given-in-the-homework:-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Invert the BT-transformed string given in the homework:</a></div><div class="lev1 toc-item"><a href="#Counting-Human-Genome-Stuff" data-toc-modified-id="Counting-Human-Genome-Stuff-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Counting Human Genome Stuff</a></div>

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO

# Graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})

# Magic function to make matplotlib inline;
%matplotlib inline

# This enables SVG graphics inline. 
# There is a bug, so uncomment if it works.
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")

mpl.rcParams['xtick.labelsize'] = 16 
mpl.rcParams['ytick.labelsize'] = 16 
mpl.rcParams['legend.fontsize'] = 14

# Burrows-Wheeler Transformation problem

In [2]:
def BT(S):
    """Compute the Burrows-Wheeler transform of the string S.

    All rotations of S are generated and sorted with Python's default
    string ordering; the transform is the last character of each sorted
    rotation, concatenated in order. S is used exactly as given: no
    end-of-string marker is appended.

    Returns the transformed string (empty when S is empty).
    """
    length = len(S)
    # The rotation starting at offset k is the suffix S[k:] followed by
    # the prefix S[:k].
    rotations = sorted(S[k:] + S[:k] for k in range(length))
    # Read off the last column of the sorted rotation table.
    return ''.join(rotation[-1] for rotation in rotations)

In [3]:
def inverse_BT(S, eos='$'):
    """Invert a Burrows-Wheeler transformed string.

    The sorted rotation table is rebuilt one column at a time: starting
    from the sorted characters of S, each pass prepends S[j] to row j and
    re-sorts the rows. After len(S) columns every row is a full rotation
    of the original text.

    Parameters
    ----------
    S : str
        The Burrows-Wheeler transformed string.
    eos : str, optional
        The single character that ends the original text. Defaults to
        '$'.

    Returns
    -------
    str or None
        The first sorted rotation whose last character equals `eos`, or
        None when no rotation ends with it (including when S is empty).
    """
    n = len(S)
    if n == 0:
        # Nothing to rebuild; an empty string cannot contain the marker.
        return None

    # First column of the table: the characters of S in sorted order.
    rows = sorted(S)
    # Each pass adds one column on the left; n - 1 passes complete the rows.
    for _ in range(n - 1):
        rows = sorted(S[j] + rows[j] for j in range(n))

    for row in rows:
        if row[-1] == eos:
            return row
    return None

## Check the functions work as implemented:

In [4]:
# Transform a test string whose first and last characters are '^' and '|'.
BT("^BANANANA|")

'BNNN^AAA|A'

In [5]:
# Round trip: inverting the transform of a '$'-terminated string should
# give back the input.
inverse_BT(BT('BANANA$'))

'BANANA$'

## Invert the BT-transformed string given in the homework:

In [6]:
# This transformed string contains '$', the character inverse_BT looks for
# at the end of the reconstructed text.
inverse_BT('ACTTCCCGGAAAAA$TTAA')

'GATTACACACAGATTACA$'

In [10]:
# NOTE(review): this string contains no '$', the character inverse_BT
# matches at the end of a row, so the call returns None and displays nothing.
# Presumably '^' is the intended marker here -- confirm.
inverse_BT('AABBAB^ABBAABBAA')

# Counting Human Genome Stuff

In [7]:
# Adapted from the Hwk1 code: count the transcripts and nucleotides in the
# human cDNA FASTA file and remember the ID of the longest transcript.
txs = 0
letters = 0
largestT = ''
largestN = 0

with open("../input/human_cdna.fa", 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        seq_len = len(record.seq)
        txs += 1
        letters += seq_len
        # Strict comparison: on a tie the first transcript seen is kept.
        if seq_len > largestN:
            largestN, largestT = seq_len, record.id

print("The number of nucleotides in human cDNA sequences is {0}".format(letters))
print("There are {0} transcripts in the human transcriptome".format(txs))
print('The largest transcript in human cDNA is {0}'.format(largestT))

The number of nucleotides in human cDNA sequences is 294306140
There are 180869 transcripts in the human transcriptome
The largest transcript in human cDNA is ENST00000589042.5
