Permalink
Browse files

Finally scraping the shitty HTML

  • Loading branch information...
1 parent 7240db7 commit 7d0bc9af83f2a9269500f74ccbca71d4c5a27e5d @chendo committed Dec 7, 2011
Showing with 67 additions and 11 deletions.
  1. +27 −0 lib/metlinkr/journey.rb
  2. +18 −8 lib/metlinkr/step.rb
  3. +1 −1 lib/metlinkr/version.rb
  4. +4 −2 metlinkr.gemspec
  5. +17 −0 spec/journey_spec.rb
@@ -0,0 +1,27 @@
+require 'nokogiri'
+class Metlinkr
+ class Journey # Holds the list of steps parsed out of an HTML document.
+ attr_accessor :steps # Array filled by .parse: one Step.parse result per group of three rows.
+
+ def self.parse(html) # Builds a Journey from an HTML string; raises RuntimeError when the row count is not a multiple of 3.
+ journey = new
+
+ doc = Nokogiri::HTML(html)
+
+ rows = doc.xpath("//table[@text-align='top']/tr") # Direct <tr> children of any table carrying a text-align="top" attribute.
+
+ rows.shift # Get rid of header row
+
+ if rows.length % 3 != 0 # Rows are consumed three at a time below, so reject a count that would leave a partial group.
+ raise "Rows not a multiple of 3"
+ end
+
+ journey.steps = [] # NOTE(review): stays empty, with no error, when the XPath matched nothing (0 is a multiple of 3).
+ rows.each_slice(3) do |row_set|
+ journey.steps << Step.parse(row_set)
+ end
+
+ journey # Return the populated journey.
+ end
+ end
+end
View
@@ -18,37 +18,47 @@ def parse(row_set)
parse_arrival_time
parse_duration
+ @row_set = nil
+
self
end
protected
def parse_method
- @method = case @row_set[0].xpath("//td[1]/img").first.attributes['alt'].value
- when /tram/i
+ @method = case @row_set[0].xpath("td[1]/img").first.attributes['alt'].value # Relative path: reads the alt text of the image in this row's first cell (a leading "//" would search the whole document).
+ when /\btram\b/i # Word boundaries require the mode name to appear as a whole word.
:tram
+ when /\btrain\b/i
+ :train
+ when /\bbus\b/i
+ :bus
+ when /\bwalk\b/i
+ :walk
+ else
+ nil # Unrecognised alt text leaves @method nil.
end
end
def parse_origin
- @origin = clean_stop_name(@row_set[0].xpath("td/strong/a").first.content)
+ @origin = clean_stop_name(@row_set[0].xpath("td")[3].content) # Uses the full text of the fourth cell (index 3) of the first row.
end
def parse_destination
- @destination = clean_stop_name(@row_set[2].xpath("td/a").first.content)
+ @destination = clean_stop_name(@row_set[2].xpath("td")[3].content) # Uses the full text of the fourth cell (index 3) of the third row.
end
def parse_route
- @route = @row_set[1].xpath("td/strong").first.content.strip
+ @route = @row_set[1].xpath("td/strong").first.content.strip rescue nil # NOTE(review): the rescue modifier turns any StandardError (e.g. no <strong> in the second row) into a nil route.
end
def parse_departure_time
- @departure_time = clean_time(@row_set[0].xpath("td/span").first.content)
+ @departure_time = clean_time(@row_set[0].xpath("td/span").first.content) rescue nil # NOTE(review): nil when the first row has no td/span, but errors raised inside clean_time are swallowed too.
end
def parse_arrival_time
# Why the FUCK is that div there?
- @arrival_time = clean_time(@row_set[2].xpath("td/div/span").first.content)
+ @arrival_time = clean_time(@row_set[2].xpath("td/div/span").first.content) rescue nil # NOTE(review): nil when the third row has no td/div/span, but errors raised inside clean_time are swallowed too.
end
def parse_duration
@@ -57,7 +67,7 @@ def parse_duration
end
def clean_stop_name(stop)
- stop.gsub!(/(\d+)-/, 'Stop \1 - ').strip
+ stop.gsub(/^(From Stop)|(Get off at( stop)?)|(To)\b/i, '').gsub(/(\d+)-/, 'Stop \1 - ').strip # Removes the instruction phrase, then rewrites "12-" as "Stop 12 - ". NOTE(review): "^" binds only to the first alternative and "\b" only to the last, so "Get off at" and any word-final "to" are removed anywhere in the string - confirm that is intended.
end
def clean_time(time)
@@ -1,3 +1,3 @@
-module Metlinkr
+class Metlinkr # Declared as a class, matching the "class Metlinkr" opening in lib/metlinkr/journey.rb; Ruby raises TypeError when one constant is opened as both a module and a class.
VERSION = "0.0.1"
end
View
@@ -19,6 +19,8 @@ Gem::Specification.new do |s|
s.require_paths = ["lib"]
# specify any dependencies here; for example:
- # s.add_development_dependency "rspec"
- # s.add_runtime_dependency "rest-client"
+ s.add_development_dependency "rspec"
+ s.add_runtime_dependency "capybara-mechanize"
+ s.add_runtime_dependency "nokogiri"
+
end
View
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe Metlinkr::Journey do
+ let(:raw_shitty_html) do
+ File.read(File.dirname(__FILE__) + "/fixtures/multiple_journey.html") # Fixture path is resolved relative to this spec file's directory.
+ end
+
+ subject do
+ Metlinkr::Journey.parse(raw_shitty_html)
+ end
+
+ it "parses journey effectively" do
+ subject.steps.length.should == 7 # The only assertion: the fixture is expected to yield seven steps.
+
+ pp subject.steps # Debug output only; nothing is asserted about the contents of the steps.
+ end
+end

0 comments on commit 7d0bc9a

Please sign in to comment.