Commit
Fix crawler to skip duplicated items correctly
iwamot committed Oct 9, 2014
1 parent 676b73e commit 64a2661
Showing 3 changed files with 35 additions and 1 deletion.
2 changes: 1 addition & 1 deletion lib/fastladder/crawler.rb
@@ -180,7 +180,7 @@ def cut_off(items)
     end
 
     def reject_duplicated(feed, items)
-      items.reject { |item| feed.items.exists?(["guid = ? and digest = ?", item.id, item.digest]) }
+      items.uniq { |item| item.guid }.reject { |item| feed.items.exists?(["guid = ? and digest = ?", item.guid, item.digest]) }
     end
 
     def delete_old_items_if_new_items_are_many(new_items_size)
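For reference, a minimal sketch (not part of the commit; the Item struct below is a hypothetical stand-in for Fastladder's Item model) of the Array#uniq-with-block behavior the fix relies on: uniq keeps only the first item for each guid within a single crawl batch, and the existing exists? check then rejects items whose guid and digest are already stored for the feed.

    # Hypothetical stand-in for the Item model, for illustration only.
    Item = Struct.new(:guid, :digest)

    items = [
      Item.new("guid-1", "aaa"),
      Item.new("guid-1", "bbb"),  # same guid later in the batch: dropped by uniq
      Item.new("guid-2", "ccc"),
    ]

    # Array#uniq with a block keeps the first element per block value (here, the guid).
    items.uniq { |item| item.guid }.map(&:digest)  # => ["aaa", "ccc"]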
4 changes: 4 additions & 0 deletions spec/factories/items.rb
@@ -29,4 +29,8 @@
   factory :item_without_guid, parent: :item do
     guid nil
   end
+
+  factory :item_has_fixed_guid, parent: :item do
+    guid "guid"
+  end
 end
30 changes: 30 additions & 0 deletions spec/lib/fastladder/crawler_spec.rb
@@ -0,0 +1,30 @@
+require 'spec_helper'
+
+describe 'Fastladder::Crawler' do
+  let(:crawler) { Fastladder::Crawler.new(Rails.logger) }
+  let(:feed) { FactoryGirl.create(:feed) }
+
+  context 'when some items have same guid' do
+    let(:items) { FactoryGirl.build_list(:item_has_fixed_guid, 2) }
+
+    describe '#reject_duplicated' do
+      it 'takes the first of them' do
+        expect(crawler.send(:reject_duplicated, feed, items)).to eq(items.take(1))
+      end
+    end
+  end
+
+  context 'when items are duplicated' do
+    let(:items) { FactoryGirl.build_list(:item_has_fixed_guid, 1) }
+    before {
+      FactoryGirl.create(:item_has_fixed_guid, feed: feed)
+      items.each { |item| item.create_digest }
+    }
+
+    describe '#reject_duplicated' do
+      it 'rejects them' do
+        expect(crawler.send(:reject_duplicated, feed, items)).to be_empty
+      end
+    end
+  end
+end
