#! /usr/bin/env python
"""
Make a line plot of all price paths produced by `simAsset.R`,
together with their average value at any given time.

Reads `results.csv` from the current directory (one price path per
row, one column per time step) and writes the plot to `saplot.pdf`.
"""
import csv
import sys


def parse_paths(lines):
    """
    Parse CSV text into a list of price paths.

    :param lines: iterable of CSV lines (e.g., an open file).
    :return: list of paths; each path is a list of `float`.
      Blank lines are skipped.
    :raise ValueError: if a field cannot be converted to `float`.
    """
    # `csv.reader` yields an empty list for a blank line; drop those,
    # since taking `max()` of an empty path would fail later on
    return [[float(item) for item in row]
            for row in csv.reader(lines) if row]


def mean_path(paths):
    """
    Return the list of per-time-step averages across all `paths`.

    If paths have different lengths, only the time steps common to
    all of them are averaged (`zip` stops at the shortest path).
    """
    count = float(len(paths))
    return [sum(values) / count for values in zip(*paths)]


def plot_paths(paths, output):
    """
    Plot every path as a faint line and their average as a thick one,
    then save the figure to file `output`.

    :param paths: non-empty list of price paths (lists of `float`).
    :param output: name of the file to write the figure to.
    """
    # Select the non-interactive `Agg` backend *before* importing
    # `pyplot`: this script only writes a file, and it must also work
    # where no display is available (e.g., when run as a batch job).
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    plt.style.use('ggplot')
    fig = plt.figure()

    # plot "hairy"
    for path in paths:
        plt.plot(range(len(path)), path,
                 linestyle='solid', color='chartreuse', alpha=(1.0/8))

    # the average line gets an x-range of its own length, instead of
    # reusing whatever the last path in the loop above happened to have
    avgs = mean_path(paths)
    plt.plot(range(len(avgs)), avgs,
             linestyle='solid', linewidth=2, color='darkred', alpha=1.0)

    plt.ylim(0, max([0] + [max(path) for path in paths]))

    fig.savefig(output)


def main(input_file='results.csv', output='saplot.pdf'):
    """
    Read price paths from `input_file` and save their plot to `output`.

    Exits with an error message if `input_file` contains no data.
    """
    # `with` guarantees the file is closed even if parsing fails
    with open(input_file) as data:
        paths = parse_paths(data)
    if not paths:
        sys.exit("saplot: no price paths found in '%s'" % input_file)
    plot_paths(paths, output)


if __name__ == '__main__':
    main()
- - \+ - Building a DB from a set of FASTA-format files \texttt{p1.faa} - \texttt{p2.faa} and \texttt{p3.faa}, and querying it is a 3-step - process: -\begin{sh} - cat p1.faa p2.faa p3.faa > db.faa - formatdb -i db.faa - blastpgp -i q.faa -d db.faa -e ... -\end{sh} - - \+ The \texttt{formatdb} step produces output files - \texttt{db.faa.phr}, \texttt{db.faa.pin}, and \texttt{db.faa.psq}; all - these files are \emph{inputs} to the \texttt{blastpgp} program. -\end{frame} - - -\begin{frame}[fragile] - \begin{exercise*}[8.A] - Write a \texttt{blastdb.py} script to build a BLAST DB and query it. - - \+ - The \texttt{blastdb.py} script shall be invoked like this: -\begin{sh} -$ python topblast.py query.faa p1.faa [p2.faa ...] -\end{sh}%$ - where arguments \texttt{new.faa}, \texttt{p1.faa}, etc. are FASTA-format files. - - \+ - The script should build a BLAST DB out of the files {p$N$.faa}. - Then, it should query this database for occurrences of the - proteins in \texttt{query.faa} using \texttt{blastpgp}. - \end{exercise*} -\end{frame} - \begin{frame} - \begin{exercise*}[8.B] - Find out by running the \texttt{blastdb.py} script of Ex.~8.A: + \begin{exercise*}[8.A] - \+ + Write a \texttt{priceplot.py} script that performs the following two steps: \begin{enumerate} - \item What happens if an intermediate step fails and does not - produce complete output? + \item Run the + \href{https://github.com/uzh/gc3pie/blob/master/docs/programmers/tutorials/workflows/downloads/simAsset.R}{\texttt{simAsset.R}} + script (from Exercise~6.D) with the parameters given on the command line, + and + \item Feed the \texttt{results.csv} file it outputs into the + \href{https://github.com/uzh/gc3pie/blob/master/docs/programmers/tutorials/workflows/downloads/saplot.py}{\texttt{saplot.py}} + script and retrieve the produced \texttt{saplot.pdf} file. + \end{enumerate} - \+ - \item After the whole sequence turns to TERMINATED state, what is - the value of its signal and exitcode? 
+ \+ + Run it like \texttt{simAsset.R}, for example: +\begin{semiverbatim} + \$ python priceplot.py 50 0.04 0.1 0.27 10 40 +\end{semiverbatim} - \+ - \item How could you implement a ``cleanup'' feature that removes - intermediate results (e.g., the ``\texttt{.phr}'' files) and - only keeps the output from \texttt{blastpgp} \textbf{if the - whole sequence was successfully executed?} - \end{enumerate} \end{exercise*} \end{frame} @@ -169,36 +129,30 @@ \begin{frame}[fragile] \begin{columns}[t] \begin{column}{0.6\textwidth} - \begin{lstlisting} +\begin{lstlisting}[basicstyle=\footnotesize\ttfamily] class Pipeline~\HL{(StagedTaskCollection)}~: def __init__(self, image): - StagedTaskCollection.__init__(self) + super(Pipeline, self).__init__() self.source = image def stage0(self): - # run 1st step - return Application(...) + # ... def stage1(self): - if self.tasks[0].execution.exitcode != 0: - # set collection signal and exit code, - # and state to TERMINATED - return (0, 1) - else: - # run 2nd step - return Application(...) + # ... # ... - def stage~$N$~(self): + def stage~{\bfseries\itshape X}~(self): # ... \end{lstlisting} \end{column} \begin{column}{0.4\textwidth} \raggedleft - \+\+ - Example of~a \texttt{StagedTaskCollection} - subclass. + \+\+\small + Example: \\ + subclassing a + \texttt{StagedTaskCollection} \end{column} \end{columns} \end{frame} @@ -207,10 +161,10 @@ \begin{frame}[fragile] \begin{columns}[c] \begin{column}{0.6\textwidth} - \begin{lstlisting} + \begin{lstlisting}[basicstyle=\footnotesize\ttfamily] class Pipeline(StagedTaskCollection): def __init__(self, image): - StagedTaskCollection.__init__(self) + super(Pipeline, self).__init__() self.source = image def ~\HL{stage0(self)}~: @@ -220,7 +174,7 @@ # ... # ... - def ~\HL{stage$\mathbf N$(self)}~: + def ~\HL{stage{\bfseries\itshape X}(self)}~: # ... 
\end{lstlisting} \end{column} @@ -239,7 +193,7 @@ \begin{frame}[fragile] \begin{columns}[c] \begin{column}{0.6\textwidth} - \begin{lstlisting} + \begin{lstlisting}[basicstyle=\footnotesize\ttfamily] class Pipeline(StagedTaskCollection): # ... @@ -258,8 +212,8 @@ \begin{column}{0.4\textwidth} \raggedleft - Each \texttt{stage$N$} method can return a \texttt{Task} - instance, that will run as step $N$ in the sequence. + Each \texttt{stage{\itshape X}} method can return a \texttt{Task} + instance, that will run as the $X$-th step in the sequence. \end{column} \end{columns} \end{frame} @@ -268,21 +222,20 @@ \begin{frame}[fragile] \begin{columns}[c] \begin{column}{0.6\textwidth} - \begin{lstlisting} + \begin{lstlisting}[basicstyle=\footnotesize\ttfamily] class Pipeline(StagedTaskCollection): # ... def stage1(self): - ~\HL{if self.tasks[0].execution.exitcode != 0:}~ - # set collection signal and exit code, - # and state to TERMINATED + ~\HL{\textbf{if} \textbf{\color{gray} self}.tasks[0].execution.exitcode != 0:}~ + # bail out return (0, 1) else: # run 2nd step return Application(...) # ... - def stage~$N$~(self): + def stage~{\bfseries\itshape X}~(self): # ... \end{lstlisting} \end{column} @@ -301,21 +254,20 @@ \begin{frame}[fragile] \begin{columns}[c] \begin{column}{0.6\textwidth} - \begin{lstlisting} + \begin{lstlisting}[basicstyle=\footnotesize\ttfamily] class Pipeline(StagedTaskCollection): # ... def stage1(self): - if self.tasks[0].execution.exitcode != 0: - # set collection signal and exit code, - # and state to TERMINATED + if ...: + # bail out ~\HL{return (0, 1)}~ else: # run 2nd step return Application(...) # ... - def stage~$N$~(self): + def stage~{\bfseries\itshape X}~(self): # ... \end{lstlisting} \end{column} @@ -326,22 +278,79 @@ To abort the sequence, return an integer (termination status) or a pair \emph{(signal, exit code)}, instead of a \texttt{Task} instance. 
+ + \+ + This sets the collection's own signal and exit code, and also sets its + state to \texttt{TERMINATED}. \end{column} \end{columns} \end{frame} +\begin{frame}[fragile] + \frametitle{Detour: BLAST, again} + + Another use of the BLAST tool is to search for given ``query'' + proteins in a data base. Large curated DBs are available, but one + may want to build a custom DB. + + \+ + Building a DB from a set of FASTA-format files \texttt{p1.faa}, + \texttt{p2.faa}, and \texttt{p3.faa}, and querying it is a 3-step + process: +\begin{sh} + cat p1.faa p2.faa p3.faa > db.faa + formatdb -i db.faa + blastpgp -i q.faa -d db.faa -e ... +\end{sh} + + \+ The \texttt{formatdb} step produces output files + \texttt{db.faa.phr}, \texttt{db.faa.pin}, and \texttt{db.faa.psq}; all + these files are \emph{inputs} to the \texttt{blastpgp} program. +\end{frame} + + +\begin{frame}[fragile] + \begin{exercise*}[8.B] \emph{(Difficult)} + + Write a \texttt{blastdb.py} script to build a BLAST DB and query it. + + \+ + The \texttt{blastdb.py} script shall be invoked like this: +\begin{sh} +$ python blastdb.py query.faa p1.faa [p2.faa ...] +\end{sh}%$ + where arguments \texttt{query.faa}, \texttt{p1.faa}, etc. are FASTA-format files. + + \+ + The script should build a BLAST DB out of the files \texttt{p$N$.faa}. + Then, it should query this database for occurrences of the + proteins in \texttt{query.faa} using \texttt{blastpgp}. + \end{exercise*} +\end{frame} + + \begin{frame} \begin{exercise*}[8.C] - Rewrite the \texttt{blastdb.py} script from Ex.~8.A to use a - \texttt{StagedTaskCollection} and be sure to check that a step is - successful before proceeding to the next one. + Find out by running the \texttt{blastdb.py} script of Ex.~8.B: \+ - Upon successful completion of the pipeline, move the - \texttt{blastpgp} output into directory - \texttt{/home/ubuntu/results} and then delete all intermediate - files and directories. 
#! /usr/bin/env python

"""
Write a `blastdb.py` script to build a BLAST DB and query it.

The `blastdb.py` script shall be invoked like this:

    $ python blastdb.py query.faa p1.faa [p2.faa ...]

where arguments `query.faa`, `p1.faa`, etc. are FASTA-format files.

The script should build a BLAST DB out of the files `pN.faa`.
Then, it should query this database for occurrences of the
proteins in `query.faa` using `blastpgp`.
"""

import os
from os.path import abspath, basename, join
import sys

from gc3libs import Application
from gc3libs.cmdline import SessionBasedScript
from gc3libs.quantity import GB, minutes
from gc3libs.workflow import StagedTaskCollection


if __name__ == '__main__':
    # NOTE(review): the script class is re-imported from module `ex8b`
    # instead of being used from `__main__` -- presumably so that saved
    # tasks reference an importable module name; confirm before changing.
    from ex8b import BlastDbScript
    BlastDbScript().run()


class BlastDbScript(SessionBasedScript):
    """
    Build a BLAST DB out of FASTA files and query it with `blastpgp`.
    """

    def __init__(self):
        super(BlastDbScript, self).__init__(version='1.0')

    def setup_args(self):
        """Declare positional arguments: the query file, then the DB files."""
        self.add_param('new_faa', help="Query FAA")
        self.add_param('known_faa', nargs='+',
                       help="Samples to compare to the query")

    def setup_options(self):
        """Declare the optional BLAST tuning parameters."""
        self.add_param('--e-value', '-e',
                       type=float, help="Expectation value")
        self.add_param('--output-format', '-o', dest='output_fmt',
                       type=int, help="Output format, int from 0 to 9")

    def new_tasks(self, extra):
        """Return the single 3-stage pipeline that does all the work."""
        return [
            BlastDbTasks(self.params.new_faa, self.params.known_faa,
                         self.params.e_value, self.params.output_fmt)
        ]


class BlastDbTasks(StagedTaskCollection):
    """
    Three-step sequence: concatenate the subject FASTA files, index the
    result with `formatdb`, then search it with `blastpgp`.

    Each stage reads its input from the output directory of the
    previous one.
    """

    def __init__(self, query, subjects, e_value, output_fmt, **extra_args):
        """
        :param query: path to the FASTA file with the query proteins.
        :param subjects: paths to the FASTA files making up the DB.
        :param e_value: value for `blastpgp -e`, or `None` to omit it.
        :param output_fmt: value for `blastpgp -m`, or `None` to omit it.
        :param extra_args: passed unchanged to the parent constructor.
        """
        self.query = query
        self.subjects = subjects
        self.e_value = e_value
        self.output_fmt = output_fmt
        # NOTE(review): attributes are set *before* initializing the
        # parent class on the assumption that its constructor already
        # builds the first stage (i.e., calls `stage0`) -- confirm
        # against the `gc3libs.workflow` documentation.
        StagedTaskCollection.__init__(self, **extra_args)

    def stage0(self):
        """Merge all subject files into a single `db.faa`."""
        return ConcatFastaApp('db.faa', self.subjects)

    def stage1(self):
        """Index the `db.faa` file produced by stage 0."""
        return FormatDbApp(join(self.tasks[0].output_dir, 'db.faa'))

    def stage2(self):
        """Search the DB built by stage 1 for the query proteins."""
        return BlastpgpApp(self.query,
                           join(self.tasks[1].output_dir, 'db.faa'),
                           self.e_value, self.output_fmt)


class ConcatFastaApp(Application):
    """Merge FASTA files."""

    def __init__(self, output_name, input_files):
        """
        :param output_name: name of the merged output file.
        :param input_files: paths of the FASTA files to concatenate.
        """
        input_names = [basename(infile) for infile in input_files]
        Application.__init__(
            self,
            # NOTE(review): a literal `>` in `arguments` would reach
            # `cat` as just another file name, assuming arguments are
            # not interpreted by a shell; capture the standard output
            # through `stdout=` instead.
            arguments=(["cat"] + input_names),
            inputs=input_files,
            outputs=[output_name],
            output_dir=("cat-" + output_name + ".d"),
            stdout=output_name,
            stderr="errors.txt",
            requested_memory=1*GB)


class FormatDbApp(Application):
    """Index a (large) FASTA file."""

    def __init__(self, input_file_path):
        """
        :param input_file_path: path to the FASTA file to index.
        """
        input_name = basename(input_file_path)
        # the empty suffix keeps the FASTA file itself among the
        # outputs, so the next stage finds it next to its index files
        output_names = [input_name + suffix
                        for suffix in ('', '.phr', '.pin', '.psq')]
        Application.__init__(
            self,
            arguments=["formatdb", "-i", input_name],
            inputs=[input_file_path],
            outputs=output_names,
            output_dir=("formatdb-" + input_name + ".d"),
            stdout="formatdb.log",
            stderr="formatdb.log",
            requested_memory=1*GB)


class BlastpgpApp(Application):
    """Search a `formatdb`-indexed DB for the proteins in a query file."""

    def __init__(self, query_file_path, db_files_path, e_value, output_fmt):
        """
        :param query_file_path: path to the FASTA file with the queries.
        :param db_files_path: path to the indexed FASTA file; the
          `.phr`, `.pin` and `.psq` files are expected alongside it.
        :param e_value: value for option `-e`; omitted if `None`.
        :param output_fmt: value for option `-m`; omitted if `None`.
        """
        q = basename(query_file_path)
        db = basename(db_files_path)
        db_files = [db_files_path + suffix
                    for suffix in ('', '.phr', '.pin', '.psq')]
        arguments = ["blastpgp", "-i", q, "-d", db]
        # only pass tuning options the user actually gave, so that
        # `blastpgp` never sees a literal "None" as a value
        if e_value is not None:
            arguments += ["-e", str(e_value)]
        if output_fmt is not None:
            arguments += ["-m", str(output_fmt)]
        arguments += ["-o", "output.txt"]
        Application.__init__(
            self,
            arguments=arguments,
            # stage the query by its actual path, not just its base name
            inputs=([query_file_path] + db_files),
            outputs=["output.txt"],
            output_dir=("blast-" + q + "-" + db + ".d"),
            stdout=None,  # BLAST's option `-o` already does this
            stderr="errors.txt",
            requested_memory=1*GB)