In [1]:
import numpy as np
import pandas as pd
from nlabpy.parse.seq import parse_fasta

## Download and extract `hg19` assembly

In [2]:
ls -lah ../ref

total 16K
drwxrwsr-x 3 ilya lab 4.0K Feb  2 15:46 [0m[01;34m.[0m/
drwxrwsr-x 7 ilya lab 4.0K Feb  2 15:43 [01;34m..[0m/
drwxrwsr-x 3 ilya lab 4.0K Feb  2 14:27 [01;34mHomo_sapiens[0m/
-rw-rw-r-- 1 ilya lab  217 Feb  2 15:46 illuminaClipping.fa


In [5]:
%%bash

wget ftp://igenome:G3nom3s4u@ussd-ftp.illumina.com/Homo_sapiens/UCSC/hg19/Homo_sapiens_UCSC_hg19.tar.gz \
    --directory-prefix=../ref
tar -xzvf ../ref/Homo_sapiens_UCSC_hg19.tar.gz -C ../ref
rm ../ref/Homo_sapiens_UCSC_hg19.tar.gz

Homo_sapiens/UCSC/hg19/
Homo_sapiens/UCSC/hg19/Annotation/
Homo_sapiens/UCSC/hg19/Annotation/Genes
Homo_sapiens/UCSC/hg19/Annotation/README.txt
Homo_sapiens/UCSC/hg19/Annotation/SmallRNA
Homo_sapiens/UCSC/hg19/Annotation/Variation
Homo_sapiens/UCSC/hg19/Annotation/Archives/
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/genes.gtf
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/ChromInfo.txt
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/refSeqSummary.txt
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/cytoBand.txt
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/refFlat.txt.gz
Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2011-08-30-21-45-18/Genes/knownGene.txt
Homo_sapiens/UCSC/hg19/Annota

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Build `GTF` dataframe from the `lncRNA` annotation file

In [3]:
def gtf_df(filename):
    res = []
    with open(filename, 'rt') as fi:
        for line in fi:
            fields = line.strip().split('\t')
            if fields[2] == 'exon':
                rec = {}
                idfields = fields[8].strip().split(';')
                for idfield in idfields:
                    if idfield:
                        key, val = idfield.split()
                        if key == 'transcript_id' or key == 'exon_number':
                            rec.update({key: val.strip('"')})

                rec.update({'chr': fields[0],
                          'start': int(fields[3]),
                          'end': int(fields[4])})
                res.append(rec)
    return pd.DataFrame.from_records(res)

In [5]:
gtf = gtf_df('../ref/lncRNA.gtf')
gtf

Unnamed: 0,chr,end,exon_number,start,transcript_id
0,chr2,137087174,1,137086878,TCONS_tallLncRNA_00203783
1,chr2,137087502,2,137087403,TCONS_tallLncRNA_00203783
2,chr2,137087222,1,137086878,TCONS_tallLncRNA_00203782
3,chr2,137087502,2,137087381,TCONS_tallLncRNA_00203782
4,chr2,137086987,1,137086771,lnrCXCR4
5,chr2,137087419,2,137087039,lnrCXCR4
6,chr2,137087809,3,137087491,lnrCXCR4
7,chr2,137087483,1,137087011,ENST00000392399.3


## Extract the sequence of the locus annotated in `lncRNA.gtf` plus 500 bp on each side

In [6]:
parser = parse_fasta('../ref/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/chr2.fa')
_, chr2 = next(parser)

def get_seqs(rec):
    return chr2[rec.start:rec.end]

gtf['sequence'] = gtf.apply(get_seqs, axis=1)
gtf['seq_length'] = gtf['end'] - gtf['start']
gtf

Unnamed: 0,chr,end,exon_number,start,transcript_id,sequence,seq_length
0,chr2,137087174,1,137086878,TCONS_tallLncRNA_00203783,CAAAACCTTTATTAGCATTTTGAACAGGTTCAGCTATTACTGAAAC...,296
1,chr2,137087502,2,137087403,TCONS_tallLncRNA_00203783,TTCACATTTTCGATGGTGTCCCTGGGCTCCACTTCAAGGGCAATGG...,99
2,chr2,137087222,1,137086878,TCONS_tallLncRNA_00203782,CAAAACCTTTATTAGCATTTTGAACAGGTTCAGCTATTACTGAAAC...,344
3,chr2,137087502,2,137087381,TCONS_tallLncRNA_00203782,TTCCTTATCCTGGATCTTGGCCTTCACATTTTCGATGGTGTCCCTG...,121
4,chr2,137086987,1,137086771,lnrCXCR4,GTTTATTCAAGGACAAGCAGTCTGAGAAATGGAGTTTTTGAAATAA...,216
5,chr2,137087419,2,137087039,lnrCXCR4,GGTGCAGGGTTGACTCTTTCTGGATGTTGTAGTCAGAAAGAGTGCG...,380
6,chr2,137087809,3,137087491,lnrCXCR4,GTTAGCGGATATGACGAGGCTCCGAAACACCAGTCATGTCCAGCCA...,318
7,chr2,137087483,1,137087011,ENST00000392399.3,TAACAGCCACCCCTCAGGTGCAGGACCAGGTGCAGGGTTGACTCTT...,472


In [7]:
fa_tpl = '>{}'
with open('../ref/ref_locus.fa', 'wt') as fo:
    header = fa_tpl.format('lnrCXCR4')
    fo.write('{}\n{}\n'.format(header, chr2[gtf.start.min()-500:gtf.end.max()+500]))

!head ../ref/ref_locus.fa

>lnrCXCR4
AGGAGTTTCCAGGTGACCCCTGGAAGTCCCAGTGCATTGCAGTCTTAGCACATTGCTCgagaaggtgagggagaagaagagagaaatgaaagaaaatttccagatgaagaaaagacaggaaagacagaggaagaaaggagggagggagattgaataaaagaaagagggagaaggtgaagaaggaaagagagagagagaATATATATAACGCTTTTAGGTGTTACCTTTGATCAGGGCGATTGACCAAGGTCAGCTTTCTTCAACGTGTATTCAGAGGAGGGCTCATGTCCTATAAGGTATTCATTGGTGTTTTACGGGGGAAATTTTTAAAAAGTGGGGCAGGGAAATCCACTGGTCCCACCCATTTGGGAAGTGTTTGgttcagcaggtttctctggtgtagctcctctcagagcctttcgtaaactggagtgcattatggagctccaagatggggccatagtatacaatttctccttacattatttTATTGAGATATTGTTTATTCAAGGACAAGCAGTCTGAGAAATGGAGTTTTTGAAATAATGATCCAGGCCTTTCCTGCAACACTGAGCTGTTTCTTTCCTTTTCTTTTTTAACCATGCAACAAAACCTTTATTAGCATTTTGAACAGGTTCAGCTATTACTGAAACTTGTAATTTCTAAACTTAAGTTGGGGCAAATGGCTATACGGCAGAGTAATGCCATCACTGGGCACTGCGAATGCAAGACTGGAGAATTAACAGCCACCCCTCAGGTGCAGGACCAGGTGCAGGGTTGACTCTTTCTGGATGTTGTAGTCAGAAAGAGTGCGGCCATCTTCCAGCTGCTTGCCTGCAAAGATGAGCCTCTGCTGGTCGGGGCTGGGGGTGGGGGGGTGCCTTCTTTATCCTGGATCTTGGCCTTCACATTTTCCATGGTGTCACTGGGCTCCACTTCCAGGGTGATGGTCTTGCCAGTCAGGGTCTTCACGAAGATCTGCATACCAC

## Build `bowtie2` index for the locus reference

In [8]:
%%bash

bowtie2-build ../ref/ref_locus.fa ../ref/lncRNA_locus

Settings:
  Output files: "../ref/lncRNA_locus.*.bt2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Max bucket size: default
  Max bucket size, sqrt multiplier: default
  Max bucket size, len divisor: 4
  Difference-cover sample period: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  ../ref/ref_locus.fa
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
bmax according to bmaxDivN setting: 509
Using parameters --bmax 382 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 382 --dcv 1024
Constructing suffix-array element generator
Bui

Building a SMALL index
