Skip to content

Commit

Permalink
adding a word-count action and example -- it downloads and counts all…
Browse files Browse the repository at this point in the history
… the words in Shakespeare in five seconds
  • Loading branch information
jashkenas committed Sep 4, 2009
1 parent d629ec1 commit a7df97c
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 17 deletions.
4 changes: 2 additions & 2 deletions actions/process_pdfs.rb
Expand Up @@ -6,8 +6,8 @@
# See <tt>examples/process_pdfs_example.rb</tt> for more information.
class ProcessPdfs < CloudCrowd::Action

# Split up a large pdf into single-page pdfs.
# The double pdftk shuffle fixes the document xrefs.
# Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
# chunks for processing. The double pdftk shuffle fixes the document xrefs.
def split
`pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
FileUtils.rm input_path
Expand Down
14 changes: 14 additions & 0 deletions actions/word_count.rb
@@ -0,0 +1,14 @@
# A WordCount, the canonical MapReduce demo. Depends on the 'wc' utility being
# available on the worker's PATH.
class WordCount < CloudCrowd::Action

  # Count the words in a single book.
  #
  # Runs <tt>wc -w</tt> against +input_path+ (assumed to be supplied by
  # CloudCrowd::Action -- confirm against the base class) and returns the word
  # count as an Integer. Raises a RuntimeError if 'wc' produced no count.
  def process
    # Plain 'wc' prints "lines words bytes", so the leading number would be the
    # line count; '-w' makes the first (and only) number the word count.
    # The array form of IO.popen bypasses the shell, so a path containing
    # spaces or shell metacharacters is passed through verbatim.
    output = IO.popen(['wc', '-w', input_path]) { |io| io.read }
    count  = output.to_s[/\A\s*(\d+)/, 1]
    raise "could not read a word count for #{input_path}: #{output.inspect}" unless count
    count.to_i
  end

  # Sum the total word count.
  #
  # Parses +input+ as a JSON array of per-book counts and returns their sum
  # (0 for an empty array).
  def merge
    JSON.parse(input).inject(0) { |sum, count| sum + count }
  end

end
2 changes: 2 additions & 0 deletions cloud-crowd.gemspec
Expand Up @@ -47,13 +47,15 @@ Gem::Specification.new do |s|
s.files = %w(
actions/graphics_magick.rb
actions/process_pdfs.rb
actions/word_count.rb
cloud-crowd.gemspec
config/config.example.ru
config/config.example.yml
config/database.example.yml
EPIGRAPHS
examples/graphics_magick_example.rb
examples/process_pdfs_example.rb
examples/word_count_example.rb
lib/cloud-crowd.rb
lib/cloud_crowd/action.rb
lib/cloud_crowd/app.rb
Expand Down
20 changes: 8 additions & 12 deletions examples/graphics_magick_example.rb 100644 → 100755
@@ -1,11 +1,13 @@
# Inside of a restclient session:
# This is a fancy example that produces black and white, annotated, and blurred
# versions of a list of URLs downloaded from the web.
#!/usr/bin/env ruby -rubygems

require 'restclient'
require 'json'

RestClient.post(
'http://localhost:9173/jobs',
# This example demonstrates the GraphicsMagick action by taking in a list of
# five images, and producing annotated, blurred, and black and white versions
# of each image. See actions/graphics_magick.rb

RestClient.post('http://localhost:9173/jobs',
{:job => {

'action' => 'graphics_magick',
Expand Down Expand Up @@ -39,10 +41,4 @@
}

}.to_json}
)

# status = RestClient.get('http://localhost:9173/jobs/[job_id]')

# puts JSON.parse(RestClient.get('http://localhost:9173/jobs/[job_id]'))['outputs'].values.map {|v|
# JSON.parse(v).map {|v| v['url']}
# }.flatten.join("\n")
)
10 changes: 10 additions & 0 deletions examples/process_pdfs_example.rb 100644 → 100755
@@ -1,3 +1,13 @@
#!/usr/bin/env ruby -rubygems

require 'restclient'
require 'json'

# This example demonstrates a fairly complicated PDF-processing action, designed
# to extract the PDF's text, and produce GIF versions of each page. The action
# (actions/process_pdfs.rb) shows an example of using all three steps,
# split, process, and merge.

RestClient.post(
'http://localhost:9173/jobs',
{:job => {
Expand Down
39 changes: 39 additions & 0 deletions examples/word_count_example.rb
@@ -0,0 +1,39 @@
#!/usr/bin/env ruby

# NOTE: the shebang deliberately carries no interpreter flags. On Linux the
# whole remainder of a shebang line is handed to 'env' as a single argument,
# so "ruby -rubygems" would be looked up as one program name. RubyGems is
# loaded explicitly below instead.
require 'rubygems'
require 'restclient'
require 'json'

# Let's count all the words in Shakespeare.
#
# Posts a single 'word_count' job to the jobs endpoint on localhost:9173. Each
# entry in 'inputs' is the URL of one play's plain text.

RestClient.post('http://localhost:9173/jobs',
  {:job => {

    'action' => 'word_count',

    'inputs' => [
      'http://www.gutenberg.org/dirs/etext97/1ws3010.txt',  # All's Well That Ends Well
      'http://www.gutenberg.org/dirs/etext99/1ws3511.txt',  # Antony and Cleopatra
      'http://www.gutenberg.org/dirs/etext97/1ws2510.txt',  # As You Like It
      'http://www.gutenberg.org/dirs/etext97/1ws0610.txt',  # The Comedy of Errors
      'http://www.gutenberg.org/dirs/etext99/1ws3911.txt',  # Cymbeline
      'http://www.gutenberg.org/dirs/etext00/0ws2610.txt',  # Hamlet
      'http://www.gutenberg.org/dirs/etext00/0ws1910.txt',  # Henry IV
      'http://www.gutenberg.org/dirs/etext99/1ws2411.txt',  # Julius Caesar
      'http://www.gutenberg.org/dirs/etext98/2ws3310.txt',  # King Lear
      'http://www.gutenberg.org/dirs/etext99/1ws1211j.txt', # Love's Labour's Lost
      'http://www.gutenberg.org/dirs/etext98/2ws3410.txt',  # Macbeth
      'http://www.gutenberg.org/dirs/etext98/2ws1810.txt',  # The Merchant of Venice
      'http://www.gutenberg.org/dirs/etext99/1ws1711.txt',  # Midsummer Night's Dream
      'http://www.gutenberg.org/dirs/etext98/3ws2210.txt',  # Much Ado About Nothing
      'http://www.gutenberg.org/dirs/etext00/0ws3210.txt',  # Othello
      'http://www.gutenberg.org/dirs/etext98/2ws1610.txt',  # Romeo and Juliet
      'http://www.gutenberg.org/dirs/etext98/2ws1010.txt',  # The Taming of the Shrew
      'http://www.gutenberg.org/dirs/etext99/1ws4111.txt',  # The Tempest
      'http://www.gutenberg.org/dirs/etext00/0ws0910.txt',  # Titus Andronicus
      'http://www.gutenberg.org/dirs/etext99/1ws2911.txt',  # Troilus and Cressida
      'http://www.gutenberg.org/dirs/etext98/3ws2810.txt',  # Twelfth Night
      'http://www.gutenberg.org/files/1539/1539.txt'        # The Winter's Tale
    ]

  }.to_json}
)
12 changes: 9 additions & 3 deletions lib/cloud_crowd/worker.rb
Expand Up @@ -45,7 +45,7 @@ def complete_work_unit(result)
keep_trying_to "complete work unit" do
data = completion_params.merge({:status => 'succeeded', :output => result})
unit_json = @server["/work/#{data[:id]}"].put(data)
log "finished #{@action_name} in #{data[:time]} seconds"
log "finished #{display_work_unit} in #{data[:time]} seconds"
clear_work_unit
setup_work_unit(unit_json)
end
Expand All @@ -56,7 +56,7 @@ def fail_work_unit(exception)
keep_trying_to "mark work unit as failed" do
data = completion_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
unit_json = @server["/work/#{data[:id]}"].put(data)
log "failed #{@action_name} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
clear_work_unit
setup_work_unit(unit_json)
end
Expand All @@ -82,6 +82,7 @@ def check_out
:name => @name,
:terminated => true
})
log 'exiting'
end

# We expect and require internal communication between the central server
Expand All @@ -104,6 +105,11 @@ def has_work?
@action_name && @input && @options
end

# Short label for the current work unit, suitable for log lines: the unit's id
# (read from @options['work_unit_id']) followed by the action name in parens.
def display_work_unit
  unit_id = @options['work_unit_id']
  action  = @action_name
  "unit ##{unit_id} (#{action})"
end

# Executes the current work unit, catching all exceptions as failures.
def run_work_unit
begin
Expand Down Expand Up @@ -156,7 +162,7 @@ def setup_work_unit(unit_json)
@options['job_id'] = unit['job_id']
@options['work_unit_id'] = unit['id']
@options['attempts'] ||= unit['attempts']
log "fetched work unit ##{@options['work_unit_id']} for #{@action_name}"
log "fetched #{display_work_unit}"
return true
end

Expand Down

0 comments on commit a7df97c

Please sign in to comment.