Skip to content
This repository
file 94 lines (72 sloc) 2.579 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
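# Description:
# Looks up a query on English Wikipedia and replies with the article's first substantial paragraph and its URL.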
#
# Dependencies:
# "htmlparser": "1.7.6"
# "soupselect": "0.2.0"
# "underscore": "1.3.3"
# "underscore.string": "2.3.0"
#
# Configuration:
# None
#
# Commands:
# hubot wiki me <query> - Searches for <query> on Wikipedia.
#
# Author:
# h3h

_ = require("underscore")
_s = require("underscore.string")
Select = require("soupselect").select
HTMLParser = require "htmlparser"

# Hubot entry point: registers the "wiki [me] <query>" responder.
#
# robot - The Hubot robot instance the listener is attached to.
#
# The third capture group of the pattern is the query text. The reply is the
# text handed back by wikiMe, followed by the URL when one is provided.
module.exports = (robot) ->
  robot.respond /(wiki)( me)? (.*)/i, (msg) ->
    query = msg.match[3]
    wikiMe robot, query, (summary, link) ->
      msg.send summary
      if link
        msg.send link

# Fetches the Wikipedia article for `query` and reports the outcome via `cb`.
#
# robot - Hubot robot; only its `http` client is used here.
# query - Raw query text; turned into an article title and URL.
# cb    - Callback invoked as cb(text) on error, on a 301 response (with the
#         Location header as text) or when no article exists; otherwise as
#         cb(text, articleURL).
wikiMe = (robot, query, cb) ->
  title = makeTitleFromQuery query
  articleURL = makeArticleURL title

  request = robot.http(articleURL).header('User-Agent', 'Hubot Wikipedia Script')
  request.get() (err, res, body) ->
    if err
      return cb "Sorry, the tubes are broken."

    # A 301 is not followed; the redirect target is handed back as the text.
    return cb res.headers.location if res.statusCode is 301

    articleMissing = /does not have an article/.test body
    if articleMissing
      return cb "Wikipedia has no idea what you're talking about."

    summary = findBestParagraph parseHTML(body, "p")
    # Fall back to a generic prompt when no usable paragraph was found.
    summary = "Have a look for yourself:" unless summary
    cb summary, articleURL

# Utility Methods

# Collects the nodes of type `nodeType` found at or below `root`.
#
# root     - A DOM-like node with optional `type` and `children` properties.
# nodeType - The `type` value to look for.
#
# Returns [root] when root itself matches (its children are not searched).
# Otherwise returns one entry per child holding that child's own result, so
# the array is nested and callers are expected to flatten it. Returns [] when
# root is missing or has no children.
childrenOfType = (root, nodeType) ->
  if root?.type is nodeType
    return [root]

  kids = root?.children
  return [] unless kids?.length > 0

  nested = []
  for kid in kids
    nested.push childrenOfType(kid, nodeType)
  nested

# Picks the first paragraph that reads like real prose and returns its
# cleaned-up text.
#
# paragraphs - Array of DOM-like paragraph nodes (as produced by parseHTML).
#
# For each paragraph, in order, the text nodes are concatenated and then:
#   1. parentheticals are removed, at any nesting depth,
#   2. numeric citation markers such as "[12]" are removed,
#   3. runs of whitespace are collapsed to a single space,
#   4. HTML entities are unescaped.
# A paragraph is accepted once the cleaned text contains at least 35 ASCII
# letters; shorter ones are skipped.
#
# Returns the cleaned text of the first accepted paragraph, or null when there
# are no paragraphs or none qualifies.
findBestParagraph = (paragraphs) ->
  for paragraph in paragraphs
    textNodes = _.flatten childrenOfType(paragraph, 'text')
    text = (textNode.data for textNode in textNodes).join ''

    # Each pass strips only innermost parentheticals, so repeat until a pass
    # changes nothing. Every replacement shortens the string, so this ends.
    loop
      stripped = text.replace(/\s*\([^()]*?\)/g, '')
      break if stripped is text
      text = stripped

    text = text.replace(/\[[\d\s]+\]/g, '') # remove citations
    # Squash whitespace after the removals above so they cannot leave
    # doubled spaces behind.
    text = text.replace(/\s{2,}/g, ' ')
    text = _s.unescapeHTML(text) # get rid of nasties

    # Skip paragraphs with fewer than 35 ASCII letters.
    letterCount = text.replace(/[^a-zA-Z]/g, '').length
    return text if letterCount >= 35

  null

# Builds the English Wikipedia article URL for `title`.
#
# title - Article title; it is percent-encoded with encodeURIComponent.
#
# Returns the URL as a String.
makeArticleURL = (title) ->
  encodedTitle = encodeURIComponent title
  "https://en.wikipedia.org/wiki/#{encodedTitle}"

# Turns a free-form query into an article title.
#
# query - Raw query text.
#
# Trims the query, replaces every space character with an underscore, then
# upper-cases the first character.
makeTitleFromQuery = (query) ->
  trimmed = _s.trim query
  underscored = trimmed.replace(/[ ]/g, '_')
  strCapitalize underscored

# Parses an HTML string and returns the nodes matching a selector.
#
# html     - HTML source text.
# selector - Selector string passed to soupselect's `select`.
#
# Returns whatever `Select` yields for the parsed DOM.
parseHTML = (html, selector) ->
  # The handler's completion callback is a no-op; the DOM is read from the
  # handler after the parse call below.
  noop = ->
  options = ignoreWhitespace: true
  domBuilder = new HTMLParser.DefaultHandler noop, options

  htmlParser = new HTMLParser.Parser domBuilder
  htmlParser.parseComplete html

  Select domBuilder.dom, selector

# Upper-cases the first character of `str` and leaves the rest unchanged.
#
# str - The String to capitalize.
#
# Returns the capitalized String (an empty string stays empty).
strCapitalize = (str) ->
  initial = str.charAt 0
  remainder = str.substring 1
  initial.toUpperCase() + remainder
Something went wrong with that request. Please try again.