Skip to content

Commit

Permalink
UrlFetchTitle: title のエンコーディングを適切に変換する
Browse files Browse the repository at this point in the history
fixed #44

charlock_holmes という gem を使って文字コードを自動判別し、Nokogiri
にそれを伝える。
文字コードの UTF-8 への変換は Nokogiri が自動的に行う。
  • Loading branch information
ochaochaocha3 committed Apr 15, 2015
1 parent 778003b commit 17e02cf
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 4 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ gem 'lumberjack', '~> 1.0'
gem 'sysexits', '~> 1.2'
gem 'rest-client', '~> 1.8'
gem 'nokogiri', '~> 1.6'
gem 'charlock_holmes', '~> 0.7'

group :development, :test do
gem 'pry', '~> 0.10'
Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ GEM
astrolabe (1.3.0)
parser (>= 2.2.0.pre.3, < 3.0)
buftok (0.2.0)
charlock_holmes (0.7.3)
cinch (2.2.5)
coderay (1.1.0)
coveralls (0.8.0)
Expand Down Expand Up @@ -122,6 +123,7 @@ PLATFORMS

DEPENDENCIES
activesupport (~> 4.2)
charlock_holmes (~> 0.7)
cinch
coveralls
lumberjack (~> 1.0)
Expand Down
13 changes: 10 additions & 3 deletions lib/rgrb/plugin/url_fetch_title/generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
require 'nokogiri'
require 'active_support'
require 'active_support/core_ext/numeric/conversions'
require 'charlock_holmes'
require 'rgrb/version'
require 'rgrb/plugin/configurable_generator'
require 'rgrb/plugin/url_fetch_title/max_redirects_reached'
Expand Down Expand Up @@ -58,7 +59,7 @@ def fetch_title(url)

case content_type
when 'text/html', 'application/xhtml+xml'
title = extract_title(http_get(url))
title = extract_title(http_get(url), url)
title_text = title.empty? ? '(タイトルなし)' : title

if n_redirects > 0
Expand Down Expand Up @@ -156,9 +157,15 @@ def http_get(url)

# HTML からタイトルを抽出する
# @param [String] body HTML コード
# @param [String] url URL
# @return [String] タイトル。存在しない場合は空文字列
def extract_title(body)
doc = Nokogiri::HTML(body)
def extract_title(body, url = nil)
detection = CharlockHolmes::EncodingDetector.detect(body)
fail(
ArgumentError, 'エンコーディングを判別できませんでした'
) unless detection

doc = Nokogiri::HTML(body, url, detection[:encoding])
title_element = doc.at_css('title')

title_element ? title_element.content : ''
Expand Down
10 changes: 10 additions & 0 deletions spec/rgrb/plugin/url_fetch_title/data/cre_ne_jp-shift_jis.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="Shift_JIS" />
<title>クリエイターズネットワーク</title>
</head>
<body>
<p><a href="http://www.cre.ne.jp/">クリエイターズネットワーク ホームページ</a></p>
</body>
</html>
27 changes: 26 additions & 1 deletion spec/rgrb/plugin/url_fetch_title/generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
let(:default_prefix) { 'Fetch title: ' }
let(:default_suffix) { '[end]' }
let(:format) do
lambda { |str| "#{default_prefix}#{str}#{default_suffix}" }
->str { "#{default_prefix}#{str}#{default_suffix}" }
end

before do
Expand Down Expand Up @@ -79,6 +79,31 @@
end
end

# Example group: fetching the title of an HTML page whose markup declares
# Shift_JIS (see the fixture's <meta charset>). Checks that fetch_title
# returns the <title> text as the expected Japanese string.
context 'HTML ファイル (Shift_JIS)' do
let(:url) { 'http://www.cre.ne.jp/' }
# Absolute path of the fixture, resolved relative to this spec file.
let(:html_path) do
File.expand_path('data/cre_ne_jp-shift_jis.html', File.dirname(__FILE__))
end
# Fixture contents, used as the stubbed GET response body.
# NOTE(review): File.read uses the default external encoding here; presumably
# the fixture bytes are really Shift_JIS on disk — confirm.
let(:body) { File.read(html_path) }

before do
# Response attributes shared by both stubbed requests: 200 with text/html.
response = {
status: 200,
headers: {
'Content-Type' => 'text/html',
}
}
# Both HEAD and GET are stubbed; only the GET stub carries the body.
stub_request(:head, url).to_return(response)
stub_request(:get, url).
to_return(response.merge(body: body))
end

subject { generator.fetch_title(url) }
it 'エンコーディングが適切に変換された <title> タグの内容を含む' do
# `format` wraps the expected title with the default prefix and suffix.
expect(subject).to eq(format['クリエイターズネットワーク'])
end
end

context 'HTML ファイル(リダイレクト)' do
let(:status_codes) { [301, 302, 303, 307, 200] }
let(:url) do
Expand Down

0 comments on commit 17e02cf

Please sign in to comment.