This repository has been archived by the owner on Jun 11, 2020. It is now read-only.
forked from rohanmaddamsetti/DM0-evolution
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-kallisto.py
115 lines (93 loc) · 4.05 KB
/
run-kallisto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
'''
run-kallisto.py by Rohan Maddamsetti.
This script does two tasks.
1) make FASTA file of all ORFs in REL606.
2) concatenates RNAseq fastq files by genome and biological replicate.
NOTE: this will run on my office computer-- not on my laptop.
'''
from os.path import join, exists, dirname, realpath
from os import makedirs, listdir
from Bio import SeqIO
import pandas as pd
import subprocess
## need shlex so that we don't split on strings in double quotes.
import shlex
def quote_string(x):
''' add double quotes to a string, in case it contains spaces or
other weirdness. This is required for system calls using the
shell, but not when working within python itself. '''
return('\"' + x + '\"')
def generateORF_fasta(ref_file,outf):
with open(outf,'w') as outfh:
rec = SeqIO.read(open(ref_file),"genbank")
for feat in rec.features:
## only consider CDS for now.
if feat.type != 'CDS':
continue
locus_tag = 'NA'
gene = 'NA'
Note = 'NA'
location = str(feat.location)
ORF_seq = feat.location.extract(rec).seq
try:
locus_tag = feat.qualifiers['locus_tag'][0]
except KeyError:
print('warning: locus_tag missing')
try:
gene = feat.qualifiers['gene'][0]
except KeyError:
print('warning: gene name missing')
outfh.write(">locus_tag=%s|gene=%s\n%s\n" % (
locus_tag,
gene,
ORF_seq
))
def generate_kallisto_quant_cmds(RNAseqdir,samples_f,outdir):
cmd_strings = []
sample_df = pd.read_csv(samples_f)
grouped = sample_df.groupby(['Genome','BiologicalReplicate'])
for name, group in grouped:
search_strs = [str(x) for x in group['Admera Health ID']]
input_files = []
for s in search_strs:
in_f = [join(RNAseqdir,x) for x in listdir(RNAseqdir) if s in x]
input_files = input_files + in_f
## use gzcat to speed up kallisto quant.
input_files = ['<(gzcat ' + quote_string(x) + ')' for x in input_files]
input_fstring = ' '.join(input_files)
output_file = join(outdir,'_'.join([str(x) for x in name]))
idx_file = join('..','results','RNAseq-analysis','LCA_ORFs.idx')
cmd = 'kallisto quant -i ' + quote_string(idx_file) + ' -o ' + quote_string(output_file) + ' -b 100 ' + input_fstring
cmd_strings.append(cmd)
return cmd_strings
def main():
srcdir = dirname(realpath(__file__))
assert srcdir.endswith('src')
projdir = dirname(srcdir)
assert projdir.endswith('DM0-evolution')
RNAseq_analysis_dir = join(projdir,"results","RNAseq-analysis")
if not exists(RNAseq_analysis_dir):
makedirs(RNAseq_analysis_dir)
LCAref = join(projdir,'genomes","curated-diffs","LCA.gbk')
ORF_file = join(RNAseq_analysis_dir,"LCA_ORFs.fasta")
## this is surprisingly slow. Only run if the file doesn't exist.
if not exists(ORF_file):
generateORF_fasta(LCAref,ORF_file)
raw_RNAseq_dir = join(projdir,"Kenyon-RNAseq")
RNAseq_samples_f = join(projdir,"data","rohan-formatted","Kenyon-RNAseq-samples.csv")
## run kallisto index.
fasta_input = join(projdir, 'results','RNAseq-analysis','LCA_ORFs.fasta')
idx_output = join(projdir, 'results','RNAseq-analysis','LCA_ORFs.idx')
index_cmd = ' '.join(['kallisto index --make-unique -i', quote_string(idx_output), quote_string(fasta_input)])
## only run if the index doesn't exist.
if not exists(idx_output):
subprocess.run(index_cmd,shell=True,executable='bash')
quant_cmds = generate_kallisto_quant_cmds(raw_RNAseq_dir,RNAseq_samples_f, RNAseq_analysis_dir)
for qcmd in quant_cmds:
outdir = shlex.split(qcmd)[5]
## only run kallisto quant if the outdir doesn't exist.
if not exists(outdir):
print(qcmd)
subprocess.run(qcmd,shell=True,executable='bash')
main()