diff --git a/Gemfile b/Gemfile index 162e1fa5..5b40d858 100644 --- a/Gemfile +++ b/Gemfile @@ -7,6 +7,7 @@ gem 'lumberjack', '~> 1.0' gem 'sysexits', '~> 1.2' gem 'rest-client', '~> 1.8' gem 'nokogiri', '~> 1.6' +gem 'charlock_holmes', '~> 0.7' group :development, :test do gem 'pry', '~> 0.10' diff --git a/Gemfile.lock b/Gemfile.lock index 2842e4cf..c14e6c0d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -12,6 +12,7 @@ GEM astrolabe (1.3.0) parser (>= 2.2.0.pre.3, < 3.0) buftok (0.2.0) + charlock_holmes (0.7.3) cinch (2.2.5) coderay (1.1.0) coveralls (0.8.0) @@ -122,6 +123,7 @@ PLATFORMS DEPENDENCIES activesupport (~> 4.2) + charlock_holmes (~> 0.7) cinch coveralls lumberjack (~> 1.0) diff --git a/lib/rgrb/plugin/url_fetch_title/generator.rb b/lib/rgrb/plugin/url_fetch_title/generator.rb index 5ac36343..50990623 100644 --- a/lib/rgrb/plugin/url_fetch_title/generator.rb +++ b/lib/rgrb/plugin/url_fetch_title/generator.rb @@ -6,6 +6,7 @@ require 'nokogiri' require 'active_support' require 'active_support/core_ext/numeric/conversions' +require 'charlock_holmes' require 'rgrb/version' require 'rgrb/plugin/configurable_generator' require 'rgrb/plugin/url_fetch_title/max_redirects_reached' @@ -58,7 +59,7 @@ def fetch_title(url) case content_type when 'text/html', 'application/xhtml+xml' - title = extract_title(http_get(url)) + title = extract_title(http_get(url), url) title_text = title.empty? ? '(タイトルなし)' : title if n_redirects > 0 @@ -156,9 +157,15 @@ def http_get(url) # HTML からタイトルを抽出する # @param [String] body HTML コード + # @param [String] url URL # @return [String] タイトル。存在しない場合は空文字列 - def extract_title(body) - doc = Nokogiri::HTML(body) + def extract_title(body, url = nil) + detection = CharlockHolmes::EncodingDetector.detect(body) + fail( + ArgumentError, 'エンコーディングを判別できませんでした' + ) unless detection + + doc = Nokogiri::HTML(body, url, detection[:encoding]) title_element = doc.at_css('title') title_element ? title_element.content : '' diff --git a/spec/rgrb/plugin/url_fetch_title/data/cre_ne_jp-shift_jis.html b/spec/rgrb/plugin/url_fetch_title/data/cre_ne_jp-shift_jis.html new file mode 100644 index 00000000..2561b4b1 --- /dev/null +++ b/spec/rgrb/plugin/url_fetch_title/data/cre_ne_jp-shift_jis.html @@ -0,0 +1,10 @@ + + + + + NGC^[Ylbg[N + + +

NGC^[Ylbg[N z[y[W

+ + diff --git a/spec/rgrb/plugin/url_fetch_title/generator_spec.rb b/spec/rgrb/plugin/url_fetch_title/generator_spec.rb index 35726739..118d0d83 100644 --- a/spec/rgrb/plugin/url_fetch_title/generator_spec.rb +++ b/spec/rgrb/plugin/url_fetch_title/generator_spec.rb @@ -40,7 +40,7 @@ let(:default_prefix) { 'Fetch title: ' } let(:default_suffix) { '[end]' } let(:format) do - lambda { |str| "#{default_prefix}#{str}#{default_suffix}" } + ->str { "#{default_prefix}#{str}#{default_suffix}" } end before do @@ -79,6 +79,31 @@ end end + context 'HTML ファイル (Shift_JIS)' do + let(:url) { 'http://www.cre.ne.jp/' } + let(:html_path) do + File.expand_path('data/cre_ne_jp-shift_jis.html', File.dirname(__FILE__)) + end + let(:body) { File.read(html_path) } + + before do + response = { + status: 200, + headers: { + 'Content-Type' => 'text/html', + } + } + stub_request(:head, url).to_return(response) + stub_request(:get, url). + to_return(response.merge(body: body)) + end + + subject { generator.fetch_title(url) } + it 'エンコーディングが適切に変換された タグの内容を含む' do + expect(subject).to eq(format['クリエイターズネットワーク']) + end + end + context 'HTML ファイル(リダイレクト)' do let(:status_codes) { [301, 302, 303, 307, 200] } let(:url) do