Skip to content

Commit

Permalink
feat(html2rss): add initial version of the html2rss gem
Browse files Browse the repository at this point in the history
  • Loading branch information
gildesmarais committed Jun 3, 2018
1 parent c49f982 commit 219cac8
Show file tree
Hide file tree
Showing 19 changed files with 683 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
/.bundle/
/.yardoc
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/

# rspec failure tracking
.rspec_status
4 changes: 4 additions & 0 deletions .rspec
@@ -0,0 +1,4 @@
--format documentation
--color
--order random
--require spec_helper
6 changes: 6 additions & 0 deletions Gemfile
@@ -0,0 +1,6 @@
source 'https://rubygems.org'

# Resolve `github: 'user/repo'` shorthands to HTTPS URLs on github.com.
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }

# Specify your gem's dependencies in html2rss.gemspec
gemspec
53 changes: 53 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,53 @@
PATH
remote: .
specs:
html2rss (0.1.0)
faraday (~> 0.15)
nokogiri (~> 1.8)
sanitize (~> 4.6)

GEM
remote: https://rubygems.org/
specs:
byebug (10.0.2)
crass (1.0.4)
diff-lcs (1.3)
faraday (0.15.2)
multipart-post (>= 1.2, < 3)
mini_portile2 (2.3.0)
multipart-post (2.0.0)
nokogiri (1.8.2)
mini_portile2 (~> 2.3.0)
nokogumbo (1.5.0)
nokogiri
rspec (3.7.0)
rspec-core (~> 3.7.0)
rspec-expectations (~> 3.7.0)
rspec-mocks (~> 3.7.0)
rspec-core (3.7.1)
rspec-support (~> 3.7.0)
rspec-expectations (3.7.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.7.0)
rspec-mocks (3.7.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.7.0)
rspec-support (3.7.1)
sanitize (4.6.5)
crass (~> 1.0.2)
nokogiri (>= 1.4.4)
nokogumbo (~> 1.4)
vcr (4.0.0)

PLATFORMS
ruby

DEPENDENCIES
bundler (~> 1.16)
byebug
html2rss!
rspec (~> 3.0)
vcr (~> 4.0)

BUNDLED WITH
1.16.2
45 changes: 45 additions & 0 deletions README.md
@@ -0,0 +1,45 @@
# Html2rss

Requests an HTML document and converts it to an RSS feed via a config object.
The config contains the URL to scrape and the selectors needed to extract
the required information. This gem provides some extractors (e.g. extract
the information from an HTML attribute).

Please always check the website's Terms of Service first to make sure you are
allowed to scrape its content!

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'html2rss'
```

And then execute:

$ bundle

Or install it yourself as:

$ gem install html2rss

## Usage example with a YAML file

Create a YAML config file. Find an example at `spec/config.test.yml`.

`Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')` returns

an `RSS::Rss` object.

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.

## License

The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
15 changes: 15 additions & 0 deletions bin/console
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby

# Interactive console: starts an IRB session with html2rss and byebug
# already loaded.

require 'bundler/setup'
require 'html2rss'
require 'byebug'

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require 'irb'
IRB.start(__FILE__)
8 changes: 8 additions & 0 deletions bin/setup
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Development setup script: installs the dependencies with Bundler.

# Abort on command errors (-e), on unset variables (-u) and on failures
# anywhere inside a pipeline (pipefail).
set -euo pipefail
# Split words on newlines and tabs only, not on spaces.
IFS=$'\n\t'
# Print each line as it is read (-v) and each command as it is executed (-x).
set -vx

bundle install

# Do any other automated setup that you need to do here
40 changes: 40 additions & 0 deletions html2rss.gemspec
@@ -0,0 +1,40 @@
# Gem specification for html2rss: metadata, packaged files and dependencies.

# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'html2rss/version'

Gem::Specification.new do |spec|
spec.name = 'html2rss'
spec.version = Html2rss::VERSION
spec.authors = ['Gil Desmarais']
spec.email = ['html2rss@desmarais.de']

spec.summary = 'Generate RSS feeds by scraping websites by providing a config.'
spec.description = 'Create your config object, include the url to scrape,
some selectors and get a RSS2 feed in return.'
spec.homepage = 'https://github.com/gildesmarais/html2rss'
spec.license = 'MIT'

# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
# to allow pushing to a single host or delete this section to allow pushing to any host.
# NOTE(review): the value below is still a TODO placeholder string — set a
# real host or remove this section before the first release.
if spec.respond_to?(:metadata)
spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
else
raise 'RubyGems 2.0 or newer is required to protect against ' \
'public gem pushes.'
end

# Package every file tracked by git, except those below test/, spec/ or
# features/.
spec.files = `git ls-files -z`.split("\x0").reject do |f|
f.match(%r{^(test|spec|features)/})
end
# Every packaged file below exe/ is exposed as an executable, by basename.
spec.bindir = 'exe'
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
spec.require_paths = ['lib']

# Runtime dependencies.
spec.add_dependency 'nokogiri', '~> 1.8'
spec.add_dependency 'sanitize', '~> 4.6'
spec.add_dependency 'faraday', '~> 0.15'
# Development-only dependencies.
spec.add_development_dependency 'bundler', '~> 1.16'
spec.add_development_dependency 'rspec', '~> 3.0'
spec.add_development_dependency 'vcr', '~> 4.0'
spec.add_development_dependency 'byebug'
end
16 changes: 16 additions & 0 deletions lib/html2rss.rb
@@ -0,0 +1,16 @@
require 'html2rss/config'
require 'html2rss/feed_builder'
require 'html2rss/version'
require 'yaml'

# Entry points for building an RSS feed from a scraping configuration.
module Html2rss
  # Builds the feed for one named entry of a YAML config file.
  #
  # The complete parsed document is handed to Config: Config itself looks up
  # the 'feeds' section and reads the top-level 'headers', so pre-selecting
  # 'feeds' here would make both lookups miss.
  #
  # @param file [String] path to the YAML config file
  # @param name [String, Symbol] key of the feed below the 'feeds' section
  # @return [Object] the result of FeedBuilder#rss for that feed
  def self.feed_from_yaml_config(file, name)
    # YAML.load_file opens and closes the file itself, so no handle is leaked.
    yaml = YAML.load_file(file).freeze
    feed(Config.new(yaml, name))
  end

  # Builds the feed described by an already constructed config object.
  #
  # @param config [Config] configuration of a single feed
  # @return [Object] the result of FeedBuilder#rss
  def self.feed(config)
    FeedBuilder.new(config).rss
  end
end
54 changes: 54 additions & 0 deletions lib/html2rss/config.rb
@@ -0,0 +1,54 @@
module Html2rss
  # Read-only accessors for the configuration of a single feed.
  #
  # The config is a Hash with string keys: a 'feeds' section holding one
  # entry per feed (each with 'channel' and 'selectors'), plus optional
  # top-level 'headers'.
  class Config
    attr_reader :feed_config, :channel_config

    # @param config [Hash] the complete configuration
    # @param name [String, Symbol] key of the feed inside the 'feeds' section
    # @raise [KeyError] when the 'feeds' section or the named feed is missing
    def initialize(config, name)
      @config = config
      # fetch names the missing key instead of failing later on nil.
      @feed_config = @config.fetch('feeds').fetch(name.to_s)
      @channel_config = @feed_config['channel']
    end

    # @return [String] the channel author, 'html2rss' by default
    def author
      channel_config.fetch('author', 'html2rss')
    end

    # @return [Integer, nil] the channel ttl, or nil when none is configured
    def ttl
      channel_config['ttl']&.to_i
    end

    # @return [String] the channel title, with a generic default
    def title
      channel_config.fetch('title', 'html2rss generated title')
    end

    # @return [String] the channel language, 'en' by default
    def language
      channel_config.fetch('language', 'en')
    end

    # @return [String] the channel description, with a generic default
    def description
      channel_config.fetch('description', 'A description of my html2rss feed.')
    end

    # @return [String, nil] the configured channel URL
    def url
      channel_config['url']
    end
    alias link url

    # @return [Hash] the top-level 'headers', empty when none are configured
    def headers
      @config.fetch('headers', {})
    end

    # Options of one selector, extended with the channel config under the
    # 'channel' key.
    #
    # @param name [String] selector name
    # @return [Hash, nil] nil when no such selector is configured
    def options(name)
      feed_config.dig('selectors', name)&.merge('channel' => channel_config)
    end

    # @param name [String] selector name
    # @return [String, nil] the 'selector' value configured for that name
    def selector(name)
      feed_config.dig('selectors', name, 'selector')
    end

    # @return [Array<Symbol>] all configured selector names except :items
    def attribute_names
      feed_config.fetch('selectors', {}).keys.map(&:to_sym) - [:items]
    end
  end
end
47 changes: 47 additions & 0 deletions lib/html2rss/feed_builder.rb
@@ -0,0 +1,47 @@
require 'digest'
require 'rss'
require_relative 'item'

module Html2rss
  # Builds an RSS 2.0 feed from a feed config: channel attributes come from
  # the config, items are scraped from the configured URL.
  class FeedBuilder
    attr_reader :config

    # @param feed_config [Config] configuration of the feed to build
    def initialize(feed_config)
      @config = feed_config
    end

    # @return [Object] the feed object produced by RSS::Maker.make('2.0')
    def rss
      RSS::Maker.make('2.0') do |maker|
        add_channel_to_maker(maker)

        feed_items.each { |feed_item| add_item_to_items(feed_item, maker.items) }
      end
    end

    private

    # Copies the channel attributes from the config to the maker's channel
    # and stamps generator and build date.
    def add_channel_to_maker(maker)
      %i[language author title description link ttl].each do |attribute_name|
        maker.channel.public_send("#{attribute_name}=", config.public_send(attribute_name))
      end

      maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
      maker.channel.lastBuildDate = Time.now.to_s
    end

    # @return [Array<Item>] the items scraped from the configured URL
    def feed_items
      Item.from_url config.url, config
    end

    # Adds one scraped item to the maker's items, copying every configured
    # attribute.
    def add_item_to_items(feed_item, items)
      items.new_item do |rss_item|
        config.attribute_names.each do |attribute_name|
          rss_item.public_send("#{attribute_name}=", feed_item.public_send(attribute_name))
        end

        # The guid depends only on the title, so it is set once after the
        # loop instead of being recomputed for every attribute.
        rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
      end
    end
  end
end
49 changes: 49 additions & 0 deletions lib/html2rss/item.rb
@@ -0,0 +1,49 @@
require 'faraday'
require 'open-uri'
require 'nokogiri'
require_relative 'item_extractor'

module Html2rss
  # One feed item, backed by the node it was found in. Attribute readers are
  # not defined statically: any selector name known to the config is answered
  # through method_missing by running the configured extractor on the node.
  class Item
    attr_reader :xml, :config

    # Requests the URL and wraps every node matching the 'items' selector.
    #
    # @param url [String] page to request
    # @param config [Config] supplies request headers and selectors
    # @return [Array<Item>]
    def self.from_url(url, config)
      response = Faraday.new(url: url, headers: config.headers).get
      document = Nokogiri::HTML(response.body)

      document.css(config.selector('items')).map do |node|
        new(node, config)
      end
    end

    # @param xml [Object] the node this item extracts its values from
    # @param config [Config] the feed configuration
    def initialize(xml, config)
      @xml = xml
      @config = config
    end

    # Responds to every attribute name the config knows about.
    def respond_to_missing?(method_name, _include_private = false)
      return true if config.attribute_names.include?(method_name)

      super
    end

    # Extracts the value for the attribute named like the called method,
    # using the configured extractor ('text' when none is given).
    def method_missing(method_name, *_args)
      attribute_config = config.options(method_name.to_s)
      return super unless attribute_config

      extractor_name = attribute_config['extractor'] || 'text'
      extractor = ItemExtractor.const_get(extractor_name.upcase.to_sym)

      post_process(method_name, extractor.call(xml, attribute_config))
    end

    # Converts an extracted value: :link becomes a URI, :updated is parsed
    # as a time and rendered back to a string, anything else is returned
    # unchanged.
    def post_process(method_name, value)
      return URI(value) if method_name == :link
      return Time.parse(value).to_s if method_name == :updated

      value
    end
  end
end
25 changes: 25 additions & 0 deletions lib/html2rss/item_extractor.rb
@@ -0,0 +1,25 @@
require 'sanitize'

module Html2rss
  # Extraction strategies. Every extractor is called with the item's node
  # and the selector options (which include the channel config under
  # 'channel') and returns the extracted value.
  module ItemExtractor
    # Text of the nodes matching the selector.
    TEXT = proc { |xml, options| xml.css(options['selector']).text }

    # Value of `attr(options['attribute'])` on the nodes matching the selector.
    ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }

    # The href of the matched node, resolved against the channel URL.
    # URI.join copes with hrefs that carry a query string, are relative or
    # are already absolute; assigning the raw href to `uri.path` raises for
    # all of those and would also keep the channel URL's own query.
    HREF = proc do |xml, options|
      href = xml.css(options['selector']).attr('href').to_s
      URI.join(options['channel']['url'], href)
    end

    # Markup of the matched nodes, sanitized with Sanitize's RELAXED config;
    # every link additionally gets rel="nofollow noopener noreferrer".
    HTML = proc do |xml, options|
      html = xml.css(options['selector']).to_s

      Sanitize.fragment(
        html,
        Sanitize::Config.merge(
          Sanitize::Config::RELAXED,
          add_attributes: {
            'a' => { 'rel' => 'nofollow noopener noreferrer' }
          }
        )
      )
    end
  end
end
3 changes: 3 additions & 0 deletions lib/html2rss/version.rb
@@ -0,0 +1,3 @@
module Html2rss
  # Version of the gem as a frozen string; read by the gemspec and used in
  # the generator tag of built feeds.
  VERSION = -'0.0.1'
end

0 comments on commit 219cac8

Please sign in to comment.