forked from sparklemotion/mechanize
/
wikipedia_links_to_philosophy.rb
159 lines (113 loc) · 3.12 KB
/
wikipedia_links_to_philosophy.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
require 'mechanize'
require 'tsort'
##
# This example implements the alt-text of http://xkcd.com/903/ which states:
#
# Wikipedia trivia: if you take any article, click on the first link in the
# article text not in parentheses or italics, and then repeat, you will
# eventually end up at "Philosophy".
class WikipediaLinksToPhilosophy

  ##
  # Captures the article name from a page <title> of the form
  # "Article name - Wikipedia".

  TITLE_PATTERN = /(.*) - Wikipedia/

  ##
  # Matches hrefs that must never be followed: namespaced pages such as
  # "File:" or "Special:" and explicit disambiguation pages.

  EXCLUDED_HREF = %r{/wiki/\w+:|\(disambiguation\)}

  ##
  # CSS selector for links that are direct children of a body paragraph.
  # "> p > a" rejects italics because italic links are nested in another tag.

  PARAGRAPH_LINKS = '.mw-content-ltr > p > a[href^="/wiki/"]'

  ##
  # CSS selector for links in a top-level bulleted list, used as a fallback
  # on pages (such as disambiguation pages) without a usable paragraph link.

  LIST_LINKS = '.mw-content-ltr > ul > li > a[href^="/wiki/"]'

  ##
  # Builds the Mechanize agent and the Wikipedia URLs used by the search.
  # No page is fetched here.

  def initialize
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari' # Wikipedia blocks "mechanize"

    @history = @agent.history

    # https is used directly so the first request is not sent in plain text.
    # NOTE(review): an http->https redirect presumably also adds an entry to
    # the agent history and would skew the page count -- confirm.
    @wiki_url   = URI 'https://en.wikipedia.org'
    @search_url = @wiki_url + '/w/index.php'
    @random_url = @wiki_url + '/wiki/Special:Random'

    @page     = nil
    @title    = nil
    @seen     = nil
    @dead_end = false
  end

  ##
  # Retrieves the title of the current page, stores it in @title and returns
  # it. The title is nil when the page title does not look like
  # "Something - Wikipedia".

  def extract_title
    @title = @page.title.to_s[TITLE_PATTERN, 1]
  end

  ##
  # Retrieves the initial page. If +query+ is not given a random page is
  # chosen

  def fetch_first_page query
    query ? search(query) : random
  end

  ##
  # The search is finished if we've seen the page before, the current page
  # has no link we can follow, or we've reached Philosophy

  def finished?
    @seen || @dead_end || @title == 'Philosophy'
  end

  ##
  # Follows the first non-parenthetical, non-italic link in the main body of
  # the article.
  #
  # Prints the current title before moving on. Returns the new title, or nil
  # when the search stops because the target page was already visited or
  # because the current page offers no link to follow.

  def follow_first_link
    puts @title

    link_node = first_article_link || first_list_link

    unless link_node
      # Nothing to click on: record it so #finished? ends the search instead
      # of building a link from nil, which would raise.
      @dead_end = true
      return
    end

    # convert a Nokogiri HTML element back to a mechanize link
    link = Mechanize::Page::Link.new link_node, @agent, @page

    # NOTE(review): loop detection depends on the agent's history; if that
    # history is size-capped, very long chains may escape detection -- confirm
    # against Mechanize's max_history setting.
    @seen = @agent.visited?(link)
    return if @seen

    @page = link.click

    extract_title
  end

  ##
  # Is +link_node+ in an open parenthetical section?
  #
  # Joins the text of every sibling that precedes +link_node+ and reports
  # whether it holds more "(" than ")".

  def in_parenthetical? link_node
    preceding_text = link_node.parent.children.
      take_while { |node| node != link_node }.
      map { |node| node.text }.
      join

    preceding_text.count('(') > preceding_text.count(')')
  end

  ##
  # Prints the result of the search: how it ended, followed by the number of
  # pages in the agent's history.

  def print_result
    if @seen
      puts "[Loop detected]"
    elsif @dead_end
      puts "[No link to follow]"
    else
      puts @title
    end

    puts

    # subtract initial search or Special:Random
    puts "After #{@agent.history.length - 1} pages"
  end

  ##
  # Retrieves a random page from wikipedia

  def random
    @page = @agent.get @random_url

    extract_title
  end

  ##
  # Entry point. Starts from the article found for +query+ (or a random
  # article when +query+ is nil), follows links until #finished? and prints
  # the result.

  def run query = nil
    fetch_first_page query

    follow_first_link until finished?

    print_result
  end

  ##
  # Searches for +query+ on wikipedia

  def search query
    @page = @agent.get @search_url, search: query

    extract_title
  end

  private

  ##
  # First paragraph link that is neither a special/disambiguation page nor
  # inside an open parenthetical, or nil when there is none.

  def first_article_link
    @page.root.css(PARAGRAPH_LINKS).find do |node|
      !EXCLUDED_HREF.match?(node['href']) && !in_parenthetical?(node)
    end
  end

  ##
  # First link of a top-level list, or nil when there is none.
  # Disambiguation page? try the first item in the list.

  def first_list_link
    @page.root.css(LIST_LINKS).first
  end

end
# Run the search only when this file is executed directly (not when it is
# required); the first command-line argument, if any, is the search query.
if __FILE__ == $PROGRAM_NAME
  WikipediaLinksToPhilosophy.new.run(ARGV.shift)
end