Skip to content

Commit

Permalink
Gathering links and emails works
Browse files Browse the repository at this point in the history
  • Loading branch information
changs committed Oct 2, 2011
1 parent e8bb79e commit c514d74
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 9 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Expand Up @@ -3,3 +3,5 @@ source "http://rubygems.org"
gem "anemone"
gem "nokogiri"
gem "sinatra"
gem 'rest-client'
gem 'data_mapper'
46 changes: 46 additions & 0 deletions Gemfile.lock
@@ -1,24 +1,70 @@
GEM
remote: http://rubygems.org/
specs:
addressable (2.2.6)
anemone (0.6.1)
nokogiri (>= 1.3.0)
robots (>= 0.7.2)
bcrypt-ruby (2.1.4)
data_mapper (1.1.0)
dm-aggregates (= 1.1.0)
dm-constraints (= 1.1.0)
dm-core (= 1.1.0)
dm-migrations (= 1.1.0)
dm-serializer (= 1.1.0)
dm-timestamps (= 1.1.0)
dm-transactions (= 1.1.0)
dm-types (= 1.1.0)
dm-validations (= 1.1.0)
dm-aggregates (1.1.0)
dm-core (~> 1.1.0)
dm-constraints (1.1.0)
dm-core (~> 1.1.0)
dm-core (1.1.0)
addressable (~> 2.2.4)
dm-migrations (1.1.0)
dm-core (~> 1.1.0)
dm-serializer (1.1.0)
dm-core (~> 1.1.0)
fastercsv (~> 1.5.4)
json (~> 1.4.6)
dm-timestamps (1.1.0)
dm-core (~> 1.1.0)
dm-transactions (1.1.0)
dm-core (~> 1.1.0)
dm-types (1.1.0)
bcrypt-ruby (~> 2.1.4)
dm-core (~> 1.1.0)
fastercsv (~> 1.5.4)
json (~> 1.4.6)
stringex (~> 1.2.0)
uuidtools (~> 2.1.2)
dm-validations (1.1.0)
dm-core (~> 1.1.0)
fastercsv (1.5.4)
json (1.4.6)
mime-types (1.16)
nokogiri (1.5.0)
rack (1.3.3)
rack-protection (1.1.1)
rack
rest-client (1.6.3)
mime-types (>= 1.16)
robots (0.10.1)
sinatra (1.3.0)
rack (~> 1.3)
rack-protection (~> 1.1)
tilt (~> 1.3)
stringex (1.2.2)
tilt (1.3.3)
uuidtools (2.1.2)

PLATFORMS
ruby

DEPENDENCIES
anemone
data_mapper
nokogiri
rest-client
sinatra
44 changes: 35 additions & 9 deletions crawler/crawler.rb
Expand Up @@ -5,14 +5,23 @@

require 'anemone'
require 'nokogiri'
require 'rest_client'
require 'json'
require 'set'

# Crawl target taken from the first command-line argument.
# NOTE(review): reassigned further down from the server's /start response,
# so this value is not used by anything visible here.
domain = ARGV[0]
# Matches plain e-mail addresses such as user@example.com (case-insensitive).
email_regex = /[\w+\-.]+@[a-z\d\-.]+\.[a-z]+/i
# NOTE(review): replaced by a Set a few lines below; this Array is discarded.
arr_mails = Array.new
# Matches obfuscated addresses written as "user [ at ] example.com";
# group 1 is the part before " [ at ] ", group 2 is the host part after it.
email_regex2 = /([\w+\-.]+) \[ at \] ([a-z\d\-.]+\.[a-z]+)/i

# Base URL of the server that hands out the crawl parameters.
server_url = 'http://127.0.0.1:4567'
# Fetch the parameters from GET /start and parse the response as JSON.
response = RestClient.get server_url + '/start'
params = JSON.parse(response)
# The crawl target is the "domain" entry of that JSON object.
domain = params["domain"]
# Unique e-mail addresses collected during the crawl.
arr_mails = Set.new
# NOTE(review): only read by the first "Links:" puts near the end of the
# script; nothing visible here appends to it.
out_links = Array.new
# Unique links pointing outside the crawled domain (filled in on_every_page).
links = Set.new

Anemone.crawl(domain) do |anemone|

anemone.focus_crawl do |page|
domain_links = page.links.select do |x|
x.to_s.downcase.include? domain.downcase
Expand All @@ -24,14 +33,31 @@
anemone.user_agent = "ChgCrawler"
anemone.on_every_page do |page|
puts page.url
next unless page.html?
page.doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = page.to_absolute(URI(u)) rescue next
links << abs.to_s unless page.in_domain?(abs)
end

mails = page.body.scan(email_regex)
mails.each do |mail|
arr_mails.push(mail) unless arr_mails.index(mail)
end
end
mails.each { |mail| arr_mails.add(mail) }
# Obfuscated addresses ("user [ at ] host"): because email_regex2 has two
# capture groups, scan returns one [local, host] pair per match.
# Build each address from its own pair. Once scan has returned, $1/$2 only
# describe the LAST match, so relying on them would add the same address
# for every element and drop all the others found on the page.
mails = page.body.scan(email_regex2)
mails.each { |local, host| arr_mails.add(local + '@' + host) }

end
end

puts "Links: #{out_links.uniq!}"
puts "Links: #{links.to_a}"
puts "Emails found in #{domain}"
puts arr_mails

# Report the collected e-mail addresses to the server as a JSON document.
RestClient.post server_url + '/email',
{ 'emails' => arr_mails.to_a }.to_json, :content_type => :json, :accept => :json

# Report the collected out-of-domain links the same way.
# `links` is the Set filled during the crawl, so it is converted with
# `links.to_a`; the bare name `links_to_a` is undefined and would raise
# NameError only after the whole crawl had already finished.
RestClient.post server_url + '/links',
{ 'links' => links.to_a }.to_json, :content_type => :json, :accept => :json

p arr_mails.to_a


16 changes: 16 additions & 0 deletions server/server.rb
@@ -0,0 +1,16 @@
require 'sinatra'
require 'data_mapper'
# to_json / JSON.parse are used below; load json explicitly instead of
# relying on another library pulling it in.
require 'json'

helpers do
  # Parses the request body as JSON.
  # Halts with 400 when the body is not valid JSON, so a bad client request
  # does not surface as a 500 from an unhandled JSON::ParserError.
  def json_body
    JSON.parse(request.body.read)
  rescue JSON::ParserError => e
    halt 400, "Invalid JSON: #{e.message}"
  end
end

# Simple liveness page.
get '/' do
  'Hello world!'
end

# Hands the crawl parameters to the crawler as a JSON object; the crawler
# reads the "domain" entry.
get '/start' do
  content_type :json
  { :domain => 'http://www.put.poznan.pl/', :key2 => 'value2' }.to_json
end

# Receives the e-mail addresses found by the crawler and logs them.
post '/email' do
  c = json_body
  puts "Received: #{c}"
end

# Receives the out-of-domain links found by the crawler and logs them.
# The crawler POSTs to this path; without the route Sinatra answers 404,
# which RestClient turns into an exception on the crawler side.
post '/links' do
  c = json_body
  puts "Received: #{c}"
end

0 comments on commit c514d74

Please sign in to comment.