Permalink
Browse files

Seed Wikipedia data to couchbase in batch t:175

* add delayed_job for async processing
* finish Wikipedia class
* add WikipediaDownloadJob
  • Loading branch information...
1 parent 2c66e76 commit b2d7604910b5ba14eb8bb240a5f175e65edae7e2 @davidjrice davidjrice committed May 14, 2012
View
1 Gemfile
@@ -5,6 +5,7 @@ gem 'typhoeus'
# Bundle edge Rails instead:
# gem 'rails', :git => 'git://github.com/rails/rails.git'
gem 'thin'
+gem 'delayed_job_active_record'
gem 'couchbase', '1.2.0.dp'
View
6 Gemfile.lock
@@ -58,6 +58,11 @@ GEM
yaji (~> 0.2.3)
yajl-ruby (~> 1.1.0)
daemons (1.1.8)
+ delayed_job (3.0.1)
+ activesupport (~> 3.0)
+ delayed_job_active_record (0.3.2)
+ activerecord (> 2.1.0)
+ delayed_job (~> 3.0.0)
diff-lcs (1.1.3)
erubis (2.7.0)
eventmachine (0.12.10)
@@ -194,6 +199,7 @@ DEPENDENCIES
ci_reporter
coffee-rails (~> 3.2.1)
couchbase (= 1.2.0.dp)
+ delayed_job_active_record
factory_girl_rails (~> 1.2)
foreman
heroku
View
3 Procfile
@@ -1,2 +1,3 @@
web: bundle exec rails server thin -p $PORT
-es: elasticsearch -f -D es.config=/usr/local/Cellar/elasticsearch/0.19.2/config/elasticsearch.yml
+es: elasticsearch -f -D es.config=/usr/local/Cellar/elasticsearch/0.19.3/config/elasticsearch.yml
+worker: bundle exec rake jobs:work
View
16 app/jobs/wikipedia_download_job.rb
@@ -0,0 +1,16 @@
+class WikipediaDownloadJob
+
+ def initialize(article_ids)
+ @article_ids = article_ids
+ end
+
+ def perform
+ articles = Wikipedia.fetch( @article_ids )
+ couchbase = Couchbase.connect(ENV["COUCHBASE_URL"])
+ articles.each do |article|
+ id, document = Wikipedia.parse( article )
+ couchbase.set(id.to_s, document)
+ end
+ end
+
+end
View
37 app/models/wikipedia.rb
@@ -1,7 +1,9 @@
class Wikipedia
- BASE_URL = "http://en.wikipedia.org/w/api.php"
- BATCH = 10
+ ARTICLE_TYPES = ["text", "video", "audio"]
+ BASE_URL = "http://en.wikipedia.org/w/api.php"
+ BATCH = 10
+ BATCHES = 10
# return an array of random wikipedia article references
def self.random
@@ -36,10 +38,39 @@ def self.fetch(article_ids)
JSON.parse(response.body)["query"]["pages"].map {|key, value| value }
end
- def self.seed!(number)
+ def self.seed! # Enqueues BATCHES WikipediaDownloadJob jobs, each built from one self.random call.
# 1. iterate through batches of random articles up to number
# 2. cache ID so no duplicate requests
# 3. schedule delayed job for download of each article
+ (1..BATCHES).each do |batch| # NOTE(review): `batch` is unused; the batch count comes from BATCHES, not the "number" mentioned in step 1
+ Delayed::Job.enqueue( WikipediaDownloadJob.new( self.random ) ) # NOTE(review): step 2 (ID caching) is not visible here - confirm whether duplicates across batches matter
+ end
+ end
+
+
+ def self.parse(json)
+ id = json["pageid"]
+ categories = json["categories"].map {|c| c['title'].split(':').last }
+
+ random_type = rand(3)
+ random_quality = rand(100) + 1;
+
+ revision = json["revisions"].first
+
+ document = {
+ :title => json['title'],
+ :url => json['fullurl'],
+ :type => ARTICLE_TYPES[random_type],
+ :is_text => (random_type == 0),
+ :is_video => (random_type == 1),
+ :is_audio => (random_type == 2),
+ :quality => random_quality,
+ :categories => categories,
+ :timestamp => revision['timestamp'],
+ :content => revision['*'],
+ :user => revision['user']
+ }
+ return id, document
end
end
View
1 config/application.rb
@@ -17,6 +17,7 @@ class Application < Rails::Application
# Custom directories with classes and modules you want to be autoloadable.
# config.autoload_paths += %W(#{config.root}/extras)
+ config.autoload_paths += %W(#{config.root}/app/jobs)
# Only load the plugins named here, in the order given (default is alphabetical).
# :all can be used as a placeholder for all plugins not explicitly named.
View
22 db/migrate/20120514155616_create_delayed_jobs.rb
@@ -0,0 +1,22 @@
+class CreateDelayedJobs < ActiveRecord::Migration
+ def self.up
+ create_table :delayed_jobs, :force => true do |table|
+ table.integer :priority, :default => 0 # Allows some jobs to jump to the front of the queue
+ table.integer :attempts, :default => 0 # Provides for retries, but still fail eventually.
+ table.text :handler # YAML-encoded string of the object that will do work
+ table.text :last_error # reason for last failure (See Note below)
+ table.datetime :run_at # When to run. Could be Time.zone.now for immediately, or sometime in the future.
+ table.datetime :locked_at # Set when a client is working on this object
+ table.datetime :failed_at # Set when all retries have failed (actually, by default, the record is deleted instead)
+ table.string :locked_by # Who is working on this object (if locked)
+ table.string :queue # The name of the queue this job is in
+ table.timestamps
+ end
+
+ add_index :delayed_jobs, [:priority, :run_at], :name => 'delayed_jobs_priority'
+ end
+
+ def self.down
+ drop_table :delayed_jobs
+ end
+end
View
32 db/schema.rb
@@ -0,0 +1,32 @@
+# encoding: UTF-8
+# This file is auto-generated from the current state of the database. Instead
+# of editing this file, please use the migrations feature of Active Record to
+# incrementally modify your database, and then regenerate this schema definition.
+#
+# Note that this schema.rb definition is the authoritative source for your
+# database schema. If you need to create the application database on another
+# system, you should be using db:schema:load, not running all the migrations
+# from scratch. The latter is a flawed and unsustainable approach (the more migrations
+# you'll amass, the slower it'll run and the greater likelihood for issues).
+#
+# It's strongly recommended to check this file into your version control system.
+
+ActiveRecord::Schema.define(:version => 20120514155616) do
+
+ create_table "delayed_jobs", :force => true do |t|
+ t.integer "priority", :default => 0
+ t.integer "attempts", :default => 0
+ t.text "handler"
+ t.text "last_error"
+ t.datetime "run_at"
+ t.datetime "locked_at"
+ t.datetime "failed_at"
+ t.string "locked_by"
+ t.string "queue"
+ t.datetime "created_at", :null => false
+ t.datetime "updated_at", :null => false
+ end
+
+ add_index "delayed_jobs", ["priority", "run_at"], :name => "delayed_jobs_priority"
+
+end
View
5 script/delayed_job
@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.join(File.dirname(__FILE__), '..', 'config', 'environment'))
+require 'delayed/command'
+Delayed::Command.new(ARGV).daemonize

0 comments on commit b2d7604

Please sign in to comment.