UrlFetchTitle: RestClient による処理に書き換えた

fixed #22, fixed #42
cre-ne-jp · Apr 11, 2015 · 59fd184 · 59fd184
1 parent b358959
commit 59fd184
Show file tree

Hide file tree

Showing 2 changed files with 197 additions and 148 deletions.
diff --git a/lib/rgrb/plugin/url_fetch_title/generator.rb b/lib/rgrb/plugin/url_fetch_title/generator.rb
@@ -2,9 +2,8 @@
 
 require 'uri'
 require 'socket'
-require 'net/http'
-require 'net/http/persistent'
-require 'mechanize'
+require 'rest_client'
+require 'nokogiri'
 require 'active_support'
 require 'active_support/core_ext/numeric/conversions'
 require 'rgrb/version'
@@ -14,160 +13,164 @@ module RGRB
   module Plugin
     # ウェブページタイトル自動取得プラグイン
     module UrlFetchTitle
-      # UrlFetchTitle の出力テキスト生成器
+      # UrlFetchTitle の出力テキスト生成器。
+      #
+      # RestClient を利用して得た HTML コードを Nokogiri
+      # で解析し、title タグの内容を抽出する。
       class Generator
         include ConfigurableGenerator
 
-        # HTTP ステータスの Reason-Phrase
-        # @see https://www.iana.org/assignments/http-status-codes/http-status-codes-1.csv
-        # @see http://tools.ietf.org/html/rfc7231#section-6
-        HTTP_STATUS_REASON_PHRASE = {
-          100 => 'Continue',
-          101 => 'Switching Protocols',
-          102 => 'Processing',
-          200 => 'OK',
-          201 => 'Created',
-          202 => 'Accepted',
-          203 => 'Non-Authoritative Information',
-          204 => 'No Content',
-          205 => 'Reset Content',
-          206 => 'Partial Content',
-          207 => 'Multi-Status',
-          208 => 'Already Reported',
-          226 => 'IM Used',
-          300 => 'Multiple Choices',
-          301 => 'Moved Permanently',
-          302 => 'Found',
-          303 => 'See Other',
-          304 => 'Not Modified',
-          305 => 'Use Proxy',
-          307 => 'Temporary Redirect',
-          308 => 'Permanent Redirect',
-          400 => 'Bad Request',
-          401 => 'Unauthorized',
-          402 => 'Payment Required',
-          403 => 'Forbidden',
-          404 => 'Not Found',
-          405 => 'Method Not Allowed',
-          406 => 'Not Acceptable',
-          407 => 'Proxy Authentication Required',
-          408 => 'Request Timeout',
-          409 => 'Conflict',
-          410 => 'Gone',
-          411 => 'Length Required',
-          412 => 'Precondition Failed',
-          413 => 'Payload Too Large',
-          414 => 'URI Too Long',
-          415 => 'Unsupported Media Type',
-          416 => 'Range Not Satisfiable',
-          417 => 'Expectation Failed',
-          422 => 'Unprocessable Entity',
-          423 => 'Locked',
-          424 => 'Failed Dependency',
-          426 => 'Upgrade Required',
-          428 => 'Precondition Required',
-          429 => 'Too Many Requests',
-          431 => 'Request Header Fields Too Large',
-          500 => 'Internal Server Error',
-          501 => 'Not Implemented',
-          502 => 'Bad Gateway',
-          503 => 'Service Unavailable',
-          504 => 'Gateway Timeout',
-          505 => 'HTTP Version Not Supported',
-          506 => 'Variant Also Negotiates',
-          507 => 'Insufficient Storage',
-          508 => 'Loop Detected',
-          510 => 'Not Extended',
-          511 => 'Network Authentication Required'
-        }
-
+        # Interprets the configuration data and sets up the plugin
+        # @param [Hash] config_data plugin configuration data
+        # @return [self]
         def configure(config_data)
           @no_ssl_verify = config_data['NoSSLVerify'] || []
-          @open_timeout = config_data['OpenTimeout']
-          @read_timeout = config_data['ReadTimeout']
+          @open_timeout = config_data['OpenTimeout'] || 5 # fallback if unset
+          @read_timeout = config_data['ReadTimeout'] || 5 # fallback if unset
           @reply_prefix = config_data['ReplyPrefix'] || 'Fetch title: '
           @reply_suffix = config_data['ReplySuffix'] || ''
+          @user_agent = (
+            config_data['UserAgent'] ||
+            "RGRB/%s (Creator's Network IRC bot)"
+          ) % RGRB::VERSION # a "%s" in the template becomes the version
+
+          self # returned so that calls can be chained
         end
 
         # 誰かが発言した URL にアクセスし、ページの title タグを取得する
+        #
+        # First obtains the content metadata with an HTTP HEAD request.
+        # For (X)HTML, the content of the title tag is then retrieved.
+        # Otherwise the type and size of the resource are reported.
+        #
+        # @todo Make it possible to skip certificate verification
+        #   (but is that feature really needed?).
+        # @todo Count the redirects so that the number can be displayed.
+        #
         # @param [String] url タイトルを取得するページのURL
-        # @return [String] 取得したタイトル
+        # @return [String] reply text with the fetched title or metadata
         def fetch_title(url)
-          agent = new_agent(url)
           body =
             begin
-              page = agent.get(url)
-
-              if page.respond_to?(:title)
-                if page.title && !page.title.empty?
-                  page.title
-                else
-                  '(タイトルなし)'
-                end
+              # TODO: use n_redirects when counting the redirects
+              headers, n_redirects = http_head(url)
+              # Content-Type may be absent; to_s keeps nil from raising here
+              content_type = headers[:content_type].to_s.split(';').first
+              case content_type
+              when 'text/html', 'application/xhtml+xml'
+                title = extract_title(http_get(url))
+                title.empty? ? '(タイトルなし)' : title
               else
-                http_header_to_body(page.response)
+                "(#{extract_content_data(headers).join('; ')})"
               end
-            rescue SocketError
-              '(サーバーに接続できませんでした)'
-            rescue Timeout::Error
-              '(タイムアウト)'
-            rescue Mechanize::ResponseCodeError => response_code_error
-              response_code_to_body(response_code_error.response_code.to_i)
             rescue => e
-              "(#{e.class}: #{e})"
+              fetch_error_to_message(e)
             end
 
           "#{@reply_prefix}#{body}#{@reply_suffix}"
         end
 
-        # HTTP ヘッダをメッセージに変換する
-        # @param [Mechanize::Headers] header HTTP ヘッダ
+        private
+
+        # Returns the error message for a failed title fetch
+        # @param [StandardError] error the error object
         # @return [String]
-        def http_header_to_body(header)
-          contents = [
-            header['Content-Type'].split(';').first
-          ]
+        def fetch_error_to_message(error)
+          body =
+            case error
+            when SocketError
+              'サーバーに接続できませんでした'
+            when Errno::ECONNREFUSED
+              '接続が拒否されました'
+            when RestClient::RequestTimeout, Timeout::Error
+              'タイムアウト'
+            when RestClient::RequestFailed,
+                 RestClient::ExceptionWithResponse
+              http_code = error.http_code
+              reason_phrase =
+                RestClient::STATUSES[http_code] || '不明なエラー'
+
+              "HTTP #{http_code} #{reason_phrase}"
+            when RestClient::SSLCertificateNotVerified
+              "不正なSSL証明書"
+            else
+              error.message # any other error: show its own message
+            end
 
-          if content_length = header['Content-Length']
-            contents << content_length.to_i.to_s(:human_size)
-          end
+          "!! #{body}" # every error text is prefixed with "!! "
+        end
 
-          "(#{contents.join('; ')})"
+        # Issues an HTTP HEAD request
+        # @param [String] url URL
+        # @param [Integer] n_redirects number of redirects followed so far
+        # @return [Array<Hash, Integer>] on HTTP 200 OK, an array of the
+        #   HTTP headers and the redirect count
+        def http_head(url, n_redirects = 0)
+          RestClient::Request.execute(
+            method: :head,
+            url: url,
+            headers: { user_agent: @user_agent }, # sent as User-Agent
+            open_timeout: @open_timeout,
+            read_timeout: @read_timeout
+          ) do |response, request, result, &block|
+            case response.code
+            when 200
+              [response.headers, n_redirects]
+            else
+              response.return!(request, result, &block) # RestClient default
+            end
+          end
         end
-        private :http_header_to_body
 
-        # HTTP ステータスコードをメッセージに変換する
-        # @param [Fixnum] response_code HTTP ステータスコード
-        # @return [String]
-        def response_code_to_body(response_code)
-          reason_phrase =
-            if HTTP_STATUS_REASON_PHRASE.key?(response_code)
-              HTTP_STATUS_REASON_PHRASE[response_code]
+        # Issues an HTTP GET request
+        # @param [String] url URL
+        # @return [String] on HTTP 200 OK, the body of the content
+        def http_get(url)
+          RestClient::Request.execute(
+            method: :get,
+            url: url,
+            headers: { user_agent: @user_agent }, # sent as User-Agent
+            open_timeout: @open_timeout,
+            read_timeout: @read_timeout
+          ) do |response, request, result, &block|
+            case response.code
+            when 200
+              response.body
             else
-              '不明なエラー'
+              response.return!(request, result, &block) # RestClient default
             end
+          end
+        end
 
-          "(HTTP #{response_code} #{reason_phrase})"
+        # Extracts the title from an HTML document
+        # @param [String] body HTML source
+        # @return [String] the title, or an empty string if there is none
+        def extract_title(body)
+          title_node = Nokogiri::HTML(body).at_css('title')
+          return '' unless title_node
+
+          title_node.content
         end
-        private :response_code_to_body
-
-        # 新しいエージェントを作り、設定して返す
-        # @param [String] url タイトル取得元の URL
-        # @return [Mechanize]
-        def new_agent(url)
-          agent = Mechanize.new
-          agent.user_agent =
-            "RGRB/#{RGRB::VERSION} (Creators Network IRC bot)"
-          agent.open_timeout = @open_timeout
-          agent.read_timeout = @read_timeout
-
-          if no_ssl_verify?(url)
-            agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-          end
 
-          agent
+        # Extracts meta information from the HTTP headers as an array
+        # @param [Hash] headers HTTP headers
+        # @return [Array] media type and human-readable size, in that
+        #   order; an entry is omitted when its header is absent
+        def extract_content_data(headers)
+          meta = []
+
+          if headers.key?(:content_type)
+            # keep only the part before the first ";" (drops parameters)
+            meta << headers[:content_type].split(';').first
+          end
+
+          if headers.key?(:content_length)
+            # the size is formatted via to_s(:human_size)
+            meta << headers[:content_length].to_i.to_s(:human_size)
+          end
+
+          meta.compact
         end
-        private :new_agent
 
         # 与えられた URL が、自己署名証明書を許可するドメインか調べる
         # @param [String] url 調べる URL
@@ -181,7 +184,6 @@ def no_ssl_verify?(url)
 
           hostname.end_with?(*@no_ssl_verify)
         end
-        private :no_ssl_verify?
       end
     end
   end