diff --git a/actions/process_pdfs.rb b/actions/process_pdfs.rb index df76aa6..0cbd111 100644 --- a/actions/process_pdfs.rb +++ b/actions/process_pdfs.rb @@ -6,8 +6,8 @@ # See examples/process_pdfs_example.rb for more information. class ProcessPdfs < CloudCrowd::Action - # Split up a large pdf into single-page pdfs. - # The double pdftk shuffle fixes the document xrefs. + # Split up a large pdf into single-page pdfs. Batch them into 'batch_size' + # chunks for processing. The double pdftk shuffle fixes the document xrefs. def split `pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"` FileUtils.rm input_path diff --git a/actions/word_count.rb b/actions/word_count.rb new file mode 100644 index 0000000..f9ba74e --- /dev/null +++ b/actions/word_count.rb @@ -0,0 +1,14 @@ +# A WordCount, the canonical MapReduce Demo. Depends on the 'wc' utility. +class WordCount < CloudCrowd::Action + + # Count the words in a single book. + def process + (`wc #{input_path}`).match(/\A\s*(\d+)/)[1].to_i + end + + # Sum the total word count. 
+ def merge + JSON.parse(input).inject(0) {|sum, count| sum + count } + end + +end \ No newline at end of file diff --git a/cloud-crowd.gemspec b/cloud-crowd.gemspec index e808ba8..8024138 100644 --- a/cloud-crowd.gemspec +++ b/cloud-crowd.gemspec @@ -47,6 +47,7 @@ Gem::Specification.new do |s| s.files = %w( actions/graphics_magick.rb actions/process_pdfs.rb +actions/word_count.rb cloud-crowd.gemspec config/config.example.ru config/config.example.yml @@ -54,6 +55,7 @@ config/database.example.yml EPIGRAPHS examples/graphics_magick_example.rb examples/process_pdfs_example.rb +examples/word_count_example.rb lib/cloud-crowd.rb lib/cloud_crowd/action.rb lib/cloud_crowd/app.rb diff --git a/examples/graphics_magick_example.rb b/examples/graphics_magick_example.rb old mode 100644 new mode 100755 index 62e60d2..351e891 --- a/examples/graphics_magick_example.rb +++ b/examples/graphics_magick_example.rb @@ -1,11 +1,13 @@ -# Inside of a restclient session: -# This is a fancy example that produces black and white, annotated, and blurred -# versions of a list of URLs downloaded from the web. +#!/usr/bin/env ruby -rubygems +require 'restclient' require 'json' -RestClient.post( - 'http://localhost:9173/jobs', +# This example demonstrates the GraphicsMagick action by taking in a list of +# five images, and producing annotated, blurred, and black and white versions +# of each image. 
See actions/graphics_magick.rb + +RestClient.post('http://localhost:9173/jobs', {:job => { 'action' => 'graphics_magick', @@ -39,10 +41,4 @@ } }.to_json} -) - -# status = RestClient.get('http://localhost:9173/jobs/[job_id]') - -# puts JSON.parse(RestClient.get('http://localhost:9173/jobs/[job_id]'))['outputs'].values.map {|v| -# JSON.parse(v).map {|v| v['url']} -# }.flatten.join("\n") \ No newline at end of file +) \ No newline at end of file diff --git a/examples/process_pdfs_example.rb b/examples/process_pdfs_example.rb old mode 100644 new mode 100755 index 337d2ad..fd857fc --- a/examples/process_pdfs_example.rb +++ b/examples/process_pdfs_example.rb @@ -1,3 +1,13 @@ +#!/usr/bin/env ruby -rubygems + +require 'restclient' +require 'json' + +# This example demonstrates a fairly complicated PDF-processing action, designed +# to extract the PDF's text, and produce GIF versions of each page. The action +# (actions/process_pdfs.rb) shows an example of using all three steps, +# split, process, and merge. + RestClient.post( 'http://localhost:9173/jobs', {:job => { diff --git a/examples/word_count_example.rb b/examples/word_count_example.rb new file mode 100755 index 0000000..7f3599b --- /dev/null +++ b/examples/word_count_example.rb @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby -rubygems + +require 'restclient' +require 'json' + +# Let's count all the words in Shakespeare. 
+ +RestClient.post('http://localhost:9173/jobs', + {:job => { + + 'action' => 'word_count', + + 'inputs' => [ + 'http://www.gutenberg.org/dirs/etext97/1ws3010.txt', # All's Well That Ends Well + 'http://www.gutenberg.org/dirs/etext99/1ws3511.txt', # Antony and Cleopatra + 'http://www.gutenberg.org/dirs/etext97/1ws2510.txt', # As You Like It + 'http://www.gutenberg.org/dirs/etext97/1ws0610.txt', # The Comedy of Errors + 'http://www.gutenberg.org/dirs/etext99/1ws3911.txt', # Cymbeline + 'http://www.gutenberg.org/dirs/etext00/0ws2610.txt', # Hamlet + 'http://www.gutenberg.org/dirs/etext00/0ws1910.txt', # Henry IV + 'http://www.gutenberg.org/dirs/etext99/1ws2411.txt', # Julius Caesar + 'http://www.gutenberg.org/dirs/etext98/2ws3310.txt', # King Lear + 'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost + 'http://www.gutenberg.org/dirs/etext98/2ws3410.txt', # Macbeth + 'http://www.gutenberg.org/dirs/etext98/2ws1810.txt', # The Merchant of Venice + 'http://www.gutenberg.org/dirs/etext99/1ws1711.txt', # Midsummer Night's Dream + 'http://www.gutenberg.org/dirs/etext98/3ws2210.txt', # Much Ado About Nothing + 'http://www.gutenberg.org/dirs/etext00/0ws3210.txt', # Othello + 'http://www.gutenberg.org/dirs/etext98/2ws1610.txt', # Romeo and Juliet + 'http://www.gutenberg.org/dirs/etext98/2ws1010.txt', # The Taming of the Shrew + 'http://www.gutenberg.org/dirs/etext99/1ws4111.txt', # The Tempest + 'http://www.gutenberg.org/dirs/etext00/0ws0910.txt', # Titus Andronicus + 'http://www.gutenberg.org/dirs/etext99/1ws2911.txt', # Troilus and Cressida + 'http://www.gutenberg.org/dirs/etext98/3ws2810.txt', # Twelfth Night + 'http://www.gutenberg.org/files/1539/1539.txt' # The Winter's Tale + ] + + }.to_json} +) \ No newline at end of file diff --git a/lib/cloud_crowd/worker.rb b/lib/cloud_crowd/worker.rb index 0daacef..8594928 100644 --- a/lib/cloud_crowd/worker.rb +++ b/lib/cloud_crowd/worker.rb @@ -45,7 +45,7 @@ def complete_work_unit(result) keep_trying_to 
"complete work unit" do data = completion_params.merge({:status => 'succeeded', :output => result}) unit_json = @server["/work/#{data[:id]}"].put(data) - log "finished #{@action_name} in #{data[:time]} seconds" + log "finished #{display_work_unit} in #{data[:time]} seconds" clear_work_unit setup_work_unit(unit_json) end @@ -56,7 +56,7 @@ def fail_work_unit(exception) keep_trying_to "mark work unit as failed" do data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json}) unit_json = @server["/work/#{data[:id]}"].put(data) - log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}" + log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}" clear_work_unit setup_work_unit(unit_json) end @@ -82,6 +82,7 @@ def check_out :name => @name, :terminated => true }) + log 'exiting' end # We expect and require internal communication between the central server @@ -104,6 +105,11 @@ def has_work? @action_name && @input && @options end + # Loggable string of the current work unit. + def display_work_unit + "unit ##{@options['work_unit_id']} (#{@action_name})" + end + # Executes the current work unit, catching all exceptions as failures. def run_work_unit begin @@ -156,7 +162,7 @@ def setup_work_unit(unit_json) @options['job_id'] = unit['job_id'] @options['work_unit_id'] = unit['id'] @options['attempts'] ||= unit['attempts'] - log "fetched work unit ##{@options['work_unit_id']} for #{@action_name}" + log "fetched #{display_work_unit}" return true end