# Workflow for using the profiler (in Python).

In [28]:
# imports
import os
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from itertools import cycle
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
# Use the fully qualified option key: the short pattern 'max_rows' is resolved by
# regex and, in recent pandas releases, matches more than one option
# (e.g. display.max_rows and styler.render.max_rows), which raises OptionError.
pd.set_option('display.max_rows', 1000)
import os
import math
from matplotlib.backends.backend_pdf import PdfPages
import scipy
from scipy import stats
import subprocess

### RNAstructure
#### In order to use the profiler, you must first sample structures with RNAstructure and then process the output so that the profiler can read it. This must be done separately for each sequence that you want to run.
#### I'm going to use the command line version of RNAstructure, so all instructions below will be in reference to that. Installation instructions can be found here: http://rna.urmc.rochester.edu/Overview/Installation_Instructions_Mac.html

To use RNAstructure on the command line, you must set the DATAPATH environment variable. Change the prefix as necessary based on where you've installed it. The instructions differ between BASH and other shell environments (I use BASH, so that's what I document here):

export DATAPATH=/Users/csimonti3/Work/opt/RNAstructure/data_tables/

In [4]:
# Sample structures for the test N2 rad50 FASTA file with RNAstructure's
# `stochastic` executable.
filename = "/Volumes/users/Corinne_Simonti/projects/RNA/data/fa/test.fa"
outfilename = "/Volumes/users/Corinne_Simonti/projects/RNA/data/ct/test.ct"

# Arguments are: input FASTA, output CT file, then the '--sequence' flag
# (presumably marks the input as a sequence file -- confirm against the
# RNAstructure docs). The CompletedProcess is the cell's displayed value.
subprocess.run(
    [
        '/Users/csimonti3/Work/opt/RNAstructure/exe/stochastic',
        filename,
        outfilename,
        '--sequence',
    ]
)

CompletedProcess(args=['/Users/csimonti3/Work/opt/RNAstructure/exe/stochastic', '/Volumes/users/Corinne_Simonti/projects/RNA/data/fa/test.fa', '/Volumes/users/Corinne_Simonti/projects/RNA/data/ct/test.ct', '--sequence'], returncode=0)

In [19]:
# Convert the sampled CT file into gtboltzmann format using the RNAsp helper script.
fn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/ct/test.ct"
outfn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/gtboltz/test.gtboltz"

# The script takes the input path followed by the output path.
subprocess.run(
    [
        "python",
        "/Users/csimonti3/Work/opt/RNAsp/src/RNAStructure_to_gtboltzmann.py",
        fn,
        outfn,
    ]
)

CompletedProcess(args=['python', '/Users/csimonti3/Work/opt/RNAsp/src/RNAStructure_to_gtboltzmann.py', '/Volumes/users/Corinne_Simonti/projects/RNA/data/ct/test.ct', '/Volumes/users/Corinne_Simonti/projects/RNA/data/ct/test.gtboltz'], returncode=0)

In [31]:
# Convert the gtboltzmann file into sfold format using the RNAsp helper script.
fn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/gtboltz/test.gtboltz"
outfn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/sfold/test.sfold"

# The script takes the input path followed by the output path.
subprocess.run(
    [
        "python",
        "/Users/csimonti3/Work/opt/RNAsp/src/GTBoltzmann_to_sfold.py",
        fn,
        outfn,
    ]
)

CompletedProcess(args=['python', '/Users/csimonti3/Work/opt/RNAsp/src/GTBoltzmann_to_sfold.py', '/Volumes/users/Corinne_Simonti/projects/RNA/data/gtboltz/test.gtboltz', '/Volumes/users/Corinne_Simonti/projects/RNA/data/sfold/test.sfold'], returncode=0)

In [39]:
# Run the profiler on the test file and save its standard output to <outfn>.out.
gtfn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/gtboltz/test.gtboltz"
sffn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/sfold/test.sfold"
seqfn = "/Volumes/users/Corinne_Simonti/projects/RNA/data/fa/test.fa"
outfn = "/Volumes/users/Corinne_Simonti/projects/RNA/results/structures/test"

# Alternative invocations, kept for reference (not executed):
#subprocess.run(["/Users/csimonti3/Work/opt/RNAsp/src/RNAprofile", "-v", "--sc", "-o", outfn, "-e", gtfn,  seqfn])
#subprocess.run(["/Users/csimonti3/Work/opt/RNAsp/src/RNAprofile", "-v", "-g", "--sc", "-o", outfn, "--sfold", sffn,  seqfn])

# Capture stdout instead of letting RNAprofile write the output file itself.
result = subprocess.run(
    ["/Users/csimonti3/Work/opt/RNAsp/src/RNAprofile",
     "-v", "--sc", "-e", gtfn, seqfn],
    stdout=subprocess.PIPE,
)

# Write the captured output in a single call; the context manager guarantees
# the file is closed even if decoding or writing raises.
with open(outfn + ".out", 'w') as outfile:
    outfile.write(result.stdout.decode('utf-8'))

In [25]:
# Show the space-separated shell command for an RNAprofile run that uses "-o".
print(" ".join([
    "/Users/csimonti3/Work/opt/RNAsp/src/RNAprofile",
    "-v", "--sc", "-o", outfn, "-e", gtfn, seqfn,
]))

/Users/csimonti3/Work/opt/RNAsp/src/RNAprofile -v --sc -o /Volumes/users/Corinne_Simonti/projects/RNA/results/structures/test -e /Volumes/users/Corinne_Simonti/projects/RNA/data/gtboltz/test.gtboltz /Volumes/users/Corinne_Simonti/projects/RNA/data/fa/test.fa


### Scale up for  let7/lin4 file.

In [12]:
# Test the DNA -> RNA conversion: every "T" becomes "U".
# str.replace does this in one call, replacing the per-character loop.
seq = "ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTG"
newseq = seq.replace("T", "U")

# Last expression is the cell's displayed value.
newseq

'ACUGACUGACUGACUGACUGACUGACUGACUGACUGACUGACUG'

In [13]:
# Convert sequences to temporary FASTA files.
# For every non-comment row of SEQ_FILE, two single-record FASTA files are
# written: <GENE>N2.fa (from column 2) and <GENE>Hawaii.fa (from column 3).
prefix = "/Volumes/users/Corinne_Simonti/projects/RNA/"

SEQ_FILE = prefix + "results/3UTR_slg_seqs_let7-lin4.txt"


def _write_rna_fasta(path, header, dna_seq):
    """Write a one-record FASTA file at `path`.

    The record header is ">" + `header`; the sequence is `dna_seq` with every
    "T" replaced by "U". No trailing newline is written after the sequence.
    """
    with open(path, 'w') as fasta:
        fasta.write(">" + header + "\n")
        fasta.write(dna_seq.replace("T", "U"))


file_list = []

# Context manager so the input handle is closed once all rows are processed.
with open(SEQ_FILE) as seq_file:
    for line in seq_file:
        # Skip blank lines (which would otherwise fail on seg[1]) and comments.
        if not line.strip() or line[0] == "#":
            continue
        seg = line.split('\t')
        ID = seg[1]
        GENE = ID.split('|')[4]
        NSEQ = seg[2]
        HSEQ = seg[3].rstrip('\n')
        # Each strain gets its own sequence. Previously the Hawaii file was
        # written from NSEQ, so both files held the N2 sequence.
        for strain, strain_seq in (("N2", NSEQ), ("Hawaii", HSEQ)):
            outfn = prefix + "data/fa/" + GENE + strain + ".fa"
            _write_rna_fasta(outfn, ID, strain_seq)
            file_list.append(outfn)

print("%d files written." % len(file_list))

12 files written.


In [17]:
# Run RNAstructure's `stochastic` on every FASTA path in file_list, writing
# one CT file per input under <prefix>data/ct/.
for fn in file_list:
    # File name without its directory part.
    basename = fn.rsplit('/', 1)[-1]
    # Drop only the final extension, keeping any earlier dots in the name.
    GENE = basename.rsplit('.', 1)[0]
    outfn = prefix + "data/ct/" + GENE + ".ct"
    subprocess.run(
        ['/Users/csimonti3/Work/opt/RNAstructure/exe/stochastic', fn, outfn, '--sequence']
    )

In [None]:
# Convert to GTBoltzmann