In [None]:
pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.5.1-py2.py3-none-any.whl (254 kB)
[?25l[K     |█▎                              | 10 kB 16.7 MB/s eta 0:00:01[K     |██▋                             | 20 kB 20.3 MB/s eta 0:00:01[K     |███▉                            | 30 kB 24.1 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 17.5 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 9.3 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 9.8 MB/s eta 0:00:01[K     |█████████                       | 71 kB 8.0 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 8.9 MB/s eta 0:00:01[K     |███████████▋                    | 92 kB 9.7 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 8.6 MB/s eta 0:00:01[K     |██████████████▏                 | 112 kB 8.6 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 8.6 MB/s eta 0:00:01[K     |████████████████▊               | 133 kB 8.6 MB/s eta 0:00:01

In [None]:
import scrapy

class BaikeScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class GraphNode(scrapy.Item):
    """
    :name neo4j节点名称，用于展示用
    :props neo4j节点各属性键值对
    """
    name = scrapy.Field()
    props = scrapy.Field()

In [None]:
# -*- coding: utf-8 -*-
import re
import os
import codecs
from bs4 import BeautifulSoup
from scrapy import Request
from scrapy.selector import Selector


class BaiduSpiderSpider(scrapy.Spider):
    name = 'baidu_spider'
    allowed_domains = ['baike.baidu.com']
    start_urls = [
        'https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC%E8%88%AA%E7%A9%BA%E8%88%AA%E5%A4%A9%E5%A4%A7%E5%AD%A6']

    root_path = './data/'
    visited_urls = set()
    url_pattern = 'https://baike.baidu.com{}'


    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.baidu_parse)

    def baidu_parse(self, response):
        # self.save_page_content(response.body) # 保存获取的词条页面信息
        self.parse_page_content(response.body)  # 保存解析的info框内容

        soup = BeautifulSoup(response.body, 'lxml')
        links = soup.find_all('a', href=re.compile('/item/*'))
        for link in links:
            if link not in self.visited_urls and len(self.visited_urls) < 100:
                yield Request(self.url_pattern.format(link["href"]), callback=self.baidu_parse)
                self.visited_urls.add(self.url_pattern.format(link["href"]))
                print('index: %d, visit %s' % (len(self.visited_urls), self.url_pattern.format(link["href"])))

        # yield self.get_baidu_info(response.body) # 解析信息通过pipline存入neo4j存入数据库

    def get_baidu_info(self, content):
        """
        :param content:
        :return:
        :将百度词条页面Info框内信息,词条描述、词条标签解析并放入item
        """
        selector = Selector(text=content)
        title = ''.join(selector.xpath('//h1/text()').extract()).replace('/', '')
        names = selector.xpath('//dt[contains(@class,"basicInfo-item name")]').extract()
        values = selector.xpath('//dd[contains(@class,"basicInfo-item value")]').extract()
        item = GraphNode()
        item['name'] = title
        item['props'] = {}
        for i, name in enumerate(names):
            temp = Selector(text=name).xpath('//dt/text()|//dt/a/text()').extract()
            name = ''.join(temp).replace('\n', '')
            name = ''.join(name.split())
            temp = Selector(text=values[i]).xpath('//dd/text()|//dd/a/text()').extract()
            value = ''.join(temp).replace('\n', '')
            if value is not None:
                item['props'][name] = value

        # 获取词条描述信息
        desc = selector.xpath('//div[@class="lemma-summary"]//div//text()').extract()
        description = re.sub('\[[0-9]+\]', '', ''.join(desc).replace('\n', ''))
        item['props']['词条描述'] = description

        # 获取词条标签
        labels = selector.xpath('//dd[@id="open-tag-item"]//text()').extract()
        label = ''.join(labels).replace('\n', '').replace(' ', '')
        item['props']['词条标签'] = label
        return item

    def parse_page_content(self, content):
        """
        :param content:
        :return:
        :将百度词条页面信息解析并保存为txt文件
        :主要解析三部分信息：1.info框，2.词条描述，3.词条标签
        """
        selector = Selector(text=content)

        # 获取info框信息
        names = selector.xpath('//dt[contains(@class,"basicInfo-item name")]').extract()
        values = selector.xpath('//dd[contains(@class,"basicInfo-item value")]').extract()
        lines = ''
        for i, name in enumerate(names):
            temp = Selector(text=name).xpath('//dt/text()|//dt/a/text()').extract()
            name = ''.join(temp).replace('\n', '')
            name = ''.join(name.split())
            temp = Selector(text=values[i]).xpath('//dd/text()|//dd/a/text()').extract()
            value = ''.join(temp).replace('\n', '')
            if name != '' and value != '':
                lines += name + '$$' + value + '\n'

        # 获取词条描述信息
        desc = selector.xpath('//div[@class="lemma-summary"]//div//text()').extract()
        description = re.sub('\[[0-9]+\]', '', ''.join(desc).replace('\n', ''))
        lines += '词条描述' + '$$' + description + '\n'

        # 获取词条标签
        labels = selector.xpath('//dd[@id="open-tag-item"]//text()').extract()
        label = ''.join(labels).replace('\n', '').replace(' ', '')
        lines += '词条标签' + '$$' + label + '\n'

        # 存储信息
        path = os.path.join(self.root_path, 'baidu_infos')  # 创建文件存放路径
        if not os.path.exists(path):
            os.mkdir(path)
        title = ''.join(selector.xpath('//h1/text()').extract()).replace('/', '')
        f = codecs.open(os.path.join(path, title + '.txt'), 'w', encoding='utf-8')
        f.write(lines)
        f.close()

    def save_page_content(self, content):
        """
        :param content: response.body
        :return: None
        :将爬取的页面内容保存到磁盘
        """
        selector = Selector(text=content)
        title = selector.xpath('//title/text()').extract()[0]  # 获取文件标题
        path = os.path.join(self.root_path, 'baidu_pages')  # 创建文件存放路径
        if not os.path.exists(path):
            os.mkdir(path)
        f = codecs.open(os.path.join(path, title + '.html'), 'w', encoding='utf-8')
        f.write(content.decode('utf-8', errors='ignore'))
        f.close()

In [None]:
pip install log

[31mERROR: Could not find a version that satisfies the requirement log (from versions: none)[0m
[31mERROR: No matching distribution found for log[0m


In [None]:
! ls

code_test  sample_data


In [None]:
from urllib import request
from urllib import parse
from lxml import etree
import re
import json
import codecs
import logging
import time

logger = logging.getLogger(__name__)
pattern = re.compile("[\u4e00-\u9fa50-9]+")


def load_keywords(fp):
	with codecs.open(fp, 'r', encoding='utf-8') as f:
		return list(set(key.strip() for key in f.readlines() if key.strip() != ''))


def extract(url, name):
  url = url + parse.quote(name)
  property = dict()
  property['name'] = name
  req = request.Request(url=url)
  response = request.urlopen(req, timeout=10)
  html = response.read()
  html = etree.HTML(html)
  property_name_list = []
  property_value_list = []
  if html.xpath('//div[@class="basic-info cmn-clearfix"]/dl'):
	  for t in html.xpath('//div[@class="basic-info cmn-clearfix"]/dl'):
		  for dt in t.xpath('./dt[@class="basicInfo-item name"]'):
			  property_name = ''.join(pattern.findall(''.join(dt.xpath('./text()'))))
			  property_name_list.append(property_name)
		  for dd in t.xpath('./dd[@class="basicInfo-item value"]'):
			  property_value = ''.join(dd.xpath('.//text()')).replace("\n", "")
			  property_value_list.append(property_value)
  for i in range(len(property_value_list)):
	  property.setdefault(property_name_list[i], property_value_list[i])
  return property


def main():
	base_url = "http://baike.baidu.com/item/"
	keywords = load_keywords('organ.csv')
	for k in keywords:
		property = extract(base_url, k)
		if property == {}:
			logger.error("Keyword = '{}' not found in 百度百科.".format(k))
			continue
		fpath = 'data/baidu_{}.json'.format(k)
		fp = codecs.open(fpath, 'w', encoding='utf-8')
		json.dump(property, fp)
		fp.close()
		time.sleep(0.5)


if __name__ == '__main__':
	main()

timeout: ignored

In [None]:
import os
from bs4 import BeautifulSoup
import requests
import logging 
import time
import codecs
import json

logger = logging.getLogger(__name__)


def load_keywords(fp):
	with codecs.open(fp, 'r', encoding='utf-8') as f:
		return list(set(key.strip() for key in f.readlines() if key.strip() != ''))


def extract(base_url, keyword):
	property = {}
	property['name'] = keyword

	url = base_url + keyword.strip().replace(' ', '_')
	html = requests.get(url, timeout=15).text
	soup = BeautifulSoup(html, 'lxml')

	table = soup.find('table', class_="infobox")
	try:
		trs = table.find_all('tr')
	except:
		return {}
	for i in trs:
		try:
			tag = i.find('th').getText()
			tag = tag.replace('\n', '')
		except:
			tag = "None"
		try:
			val = i.find('td').getText()
			val = val.replace('\n', '')
		except:
			val = 'None'
		property[tag] = val
	return property


def main():
	en_base_url = 'https://en.wikipedia.org/wiki/'
	keywords = load_keywords('organ.csv')
	count=0
	for k in keywords:
		count=count+1
		fpath = 'data/enwiki_{}.json'.format(k)
		print('{}/{}...'.format(count,len(keywords)))
		if os.path.exists(fpath):
			continue
		property = extract(en_base_url, k)
		if property == {}:
			continue
		logger.info("Keyword = '{}' found in 英文维基百科.".format(k))
		fp = codecs.open(fpath, 'w', encoding='utf-8')
		json.dump(property, fp)
		fp.close()
		time.sleep(2.5)


if __name__ == '__main__':
	main()

110/75140...
111/75140...
112/75140...
113/75140...
114/75140...
115/75140...
116/75140...
117/75140...
118/75140...
119/75140...
120/75140...
121/75140...
122/75140...
123/75140...
124/75140...
125/75140...
126/75140...
127/75140...
128/75140...
129/75140...
130/75140...
131/75140...


KeyboardInterrupt: ignored

In [None]:
import bs4
import requests
import random
    
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def visit(page):
    r = requests.get(page)
    html = bs4.BeautifulSoup(r.content, 'html.parser') #parse page
    title = html.select("#firstHeading")[0].text
    title = title.replace(" ", "_")
    paragraphs = html.select("p")
    links = []
    nono1 = ":"; nono2 = "[update]"; nono3 = "User Contributions"
    for para in paragraphs:
        linx = para.findAll('a')
        try:
            for i in linx:
                if (not hasNumbers("{}".format(i))) and ((nono1 or nono2 or nono3) not in i):
                    i = i.get('title').replace(" ", "_")
                    links.append(i)
        except:
            pass
    return html, title, links

def write(file, line):
    f = open(file, "a")
    f.write(line)
    f.close()

def iterr(pages, start_page, index):
    #TOTAL_PAGES = 6257751
    MAX_PAGES = 10000

    html, title, links = visit(start_page)
    
    write("pages_visited.txt", "{}\n".format(title))
    for i in links:
        write("master.txt", "{} {}\n".format(title,i))

    while index < MAX_PAGES:
        if index % 1000 == 0:
            print("------------index = {}------------".format(index))
        maxx = len(links)
        tries = 0
        while tries <= maxx:
            try: # html failure
                tries += 1
                if maxx>1:
                    j = links[random.randint(0,maxx-1)] # pick random page to go to
                else:
                    j = None
                if j not in pages and j is not None:
                    url = "https://en.wikipedia.org/wiki/{}".format(j)
                    index+=1
                    
                    html, title, links = visit(url)
                    write("pages_visited.txt", "{}\n".format(title))
                    for i in links:
                        write("master.txt", "{} {}\n".format(title,i))
                    pages.append(title) # track redirects instead of shown title
                    print("{}".format(title.replace("_", " ")))
                    
                #elif tries == maxx or maxx == 0: # if all pages have been visited or no links (say, just charts)
                else:
                    url = "https://en.wikipedia.org/wiki/Special:Random"
                    index+=1
                    
                    html, title, links = visit(url)
                    for i in links:
                        write("master.txt", "{} {}\n".format(title,i))
                    pages.append(title)
                    print("-{}".format(title.replace("_", " ")))
            except:
                pass
    print("finished")

pages = [] # pages visited
try:
    fe = open("pages_visited.txt", "r")
    for line in fe:
        titlee = line.split()[0]
        pages.append(titlee)
    fe.close()
except:
    pass
index = len(pages)
print("visited {} pages total".format(index))
iterr(pages, "https://en.wikipedia.org/wiki/{}".format(pages[1]), index)

visited 0 pages total


IndexError: ignored

In [None]:
import bs4
import requests
import random
#import os

# try: # start from fresh file
#     os.remove("master.txt")
# except:
#     pass
    
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def recur(pages, start_page, index):
    #TOTAL_PAGES = 6257751

    r = requests.get(start_page) # pull page
    html = bs4.BeautifulSoup(r.content, 'html.parser') #parse page
    title = html.select("#firstHeading")[0].text
    
    links = [] # links on page
    paragraphs = html.select("p")
    for para in paragraphs:
        linx = para.findAll('a')
        try:
            for i in linx:
                if not hasNumbers("{}".format(i)) and i != "[update]":
                    links.append(i.get('title').replace(" ", "_"))
                    f = open("master.txt", "a")
                    f.write("{} {}\n".format(title.replace(" ", "_"), i.get('title').replace(" ", "_")))
                    f.close()
        except:
            pass
        
    #if index != TOTAL_PAGES: # if visited everything (SHOULD BE A BIG SLOWDOWN MECHANIC INSTEAD)
    sentinel = 0
    while sentinel == 0:
        maxx = len(links)
        tries = 0
        
        j = links[random.randint(0,len(links))] # pick random page to go to (cant do more than one at a time apparently)
        try:
            tries += 1
            if j not in pages:
                url = "https://en.wikipedia.org/wiki/{}".format(j)
                print("{}".format(j))
                pages.append(j) # track redirects instead of page titles
                index+=1
                recur( pages, url, index)    
                sentinel = 1 # just in case you can
            elif tries == maxx or maxx == 0: # if all pages have been visited or no links (say, just charts)
                url = "https://en.wikipedia.org/wiki/Special:Random"
                print("------------visiting random page------------")
                index+=1
                recur( pages, url, index)
        except:
            pass

pages = ["Philosophy"] # pages visited
try:
    fe = open("master.txt", "r")
    for line in fe:
        titlee = line.split()[0]
        if titlee not in pages:
            pages.append(titlee)
    fe.close()
except:
    pass
print(pages)
index = 1;
recur(pages, "https://en.wikipedia.org/wiki/Philosophy", index)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
  if 'href' in link.attrs:
    print(link.attrs['href'])

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
      print(link.attrs['href'])

### 返回更加精细化的处理
- 一个函数getLinks，可以用一个 /wiki/<词条名称>形式的词条URL作为参数，然后以同样的形式返回一个列表，里面包含所有的词条URL；
- 一个主函数，以某个起始词条为参数调用getLinks，然后从返回的URL列表里随机选择一个词条链接，再次调用getLinks，直到你主动停止程序，或者在新的页面上没有词条链接。

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
  html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
  bs = BeautifulSoup(html, 'html.parser')
  return bs.find('div', {'id':'bodyContent'}).find_all('a',
      href=re.compile('^(/wiki/)((?!:).)*$'))
  
links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
  newArticle = links[random.randint(0, len(links)-1)].attrs['href']
  print(newArticle)
  links = getLinks(newArticle)

## 收集整个网站的数据

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
  global pages
  html = urlopen('https://zh.wikipedia.org/{}'.format(pageUrl))
  bs = BeautifulSoup(html, 'html.parser')
  try:
    print(bs.h1.get_text())
    print(bs.find(id = 'mw-content-text').find_all('p')[0])
    print(bs.find(id = 'ca-edit').find('span').find('a').attrs['href'])
  except AttributeError:
    print('页面缺少一些属性，请不要担心！')
  for link in bs.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
      if link.attrs['href'] not in pages:
        # 新页面
        newPage = link.attrs['href']
        print('-'*20)
        print(newPage)
        pages.add(newPage)
        getLinks(newPage)

getLinks('')

维基百科:首页
<p><span id="no-edit-0"></span>
<style data-mw-deduplicate="TemplateStyles:r64506932">.mw-parser-output .new{color:inherit}.mw-parser-output #column-feature-more .column-feature-more-header a{font-weight:bold;color:#474747}.mw-parser-output #column-feature-more{margin-top:1.2em;clear:left}.mw-parser-output #column-feature-more ul{list-style:none;margin-left:0}.mw-parser-output #column-feature-more li{font-size:.9em;color:#474747}.mw-parser-output .mp-2012-block-nav-footer{color:#666;font-size:.85em;clear:both;text-align:right}.mw-parser-output #mp-2012-banner a{color:#17b}.mw-parser-output #mp-2012-banner>div{line-height:120px}.mw-parser-output #mp-2012-banner>div>div{line-height:1.5em;vertical-align:middle}.mw-parser-output #mp-2012-banner-title{float:left;margin-left:40px;width:240px}.mw-parser-output #mp-2012-banner-title>div{display:inline-block;font-family:sans-serif}.mw-parser-output #mp-2012-banner-title h1{font-size:2em;margin:0;padding:0;line-height:1.6em}.mw-parser-ou

KeyboardInterrupt: ignored

### 整合函数，实现不同类型的网络爬虫

In [14]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# 获取页面中所有内链的列表
def getInternalLinks(bs, includeUrl):
  includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
        urlparse(includeUrl).netloc)
  internalLinks = []
  # 找出所有以“/"开头的链接
  for link in bs.find_all('a', href=re.compile('^(/|.*' + includeUrl + ')')):
    if link.attrs['href'] is not None:
      if link.attrs['href'] not in internalLinks:
        if(link.attrs['href'].startswith('/')):
          internalLinks.append(includeUrl+link.attrs['href'])
        else:
          internalLinks.append(link.attrs['href'])
  return internalLinks


# 获取页面中所有外链的列表
def getExternalLinks(bs, excludeUrl):
  externalLinks = []
  # 找出所有以'http'或'www'开头且不包含当前URL的链接
  for link in bs.find_all('a',
    href=re.compile('^(http|www)((?! ' + excludeUrl + ').) *$')):
    if link.attrs['href'] is not None:
      if link.attrs['href'] not in externalLinks:
        externalLinks.append(link.attrs['href'])
  return externalLinks


# 收集在网站上发现的所有外链列表
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
  html = urlopen(siteUrl)
  domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                            urlparse(siteUrl).netloc)
  bs = BeautifulSoup(html, 'html.parser')
  internalLinks = getInternalLinks(bs, domain)
  externalLinks = getExternalLinks(bs, domain)

  for link in externalLinks:
    if link not in AllExtLinks:
      allExtLinks.add(link)
      print(link)
  
  for link in internalLinks:
    if link not in allIntLinks:
      allIntLinks.add(link)
      getAllExternalLinks(link)




def getRandomExternalLink(startingPage):
  html = urlopen(startingPage)
  bs = BeautifulSoup(html, 'html.parser')
  externalLinks = getExternalLinks(bs,
        urlparse(startingPage).netloc)
  if len(externalLinks) == 0:
    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(urlparse(startingPage).scheme,
        urlparse(startingPage).netloc)
    internalLinks = getInternalLinks(bs, domain)
    return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
  else:
    return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite)                               :
  externalLinks = getRandomExternalLink(startingSite)
  print('Random external link is :{}'.format(externalLink))
  followExternalOnly(externalLink)

# followExternalOnly('http://oreilly.com')
allIntLinks.add('https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E6%96%87%E5%8C%96')
# getAllExternalLinks('https://zh.wikipedia.org/wiki/Wikipedia:%E9%A6%96%E9%A1%B5')

In [15]:
print(allIntLinks)

{'https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E6%96%87%E5%8C%96'}


## 改写代码，根据所需的爬取信息对代码进行融合

In [None]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())


pages = set()
def getLinks(pageUrl):
  global pages
  html = urlopen('https://zh.wikipedia.org/{}'.format(pageUrl))
  bs = BeautifulSoup(html, 'html.parser')
  try:
    print(bs.h1.get_text())
    print(bs.find(id = 'mw-content-text').find_all('p')[0])
    print(bs.find(id = 'ca-edit').find('span').find('a').attrs['href'])
  except AttributeError:
    print('页面缺少一些属性，请不要担心！')
  for link in bs.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
      if link.attrs['href'] not in pages:
        # 新页面
        newPage = link.attrs['href']
        print('-'*20)
        print(newPage)
        pages.add(newPage)
        getLinks(newPage)

getLinks('')

# 获取页面中所有内链的列表
def getInternalLinks(bs, includeUrl):
  includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
        urlparse(includeUrl).netloc)
  internalLinks = []
  # 找出所有以“/"开头的链接
  for link in bs.find_all('a', href=re.compile('^(/|.*' + includeUrl + ')')):
    if link.attrs['href'] is not None:
      if link.attrs['href'] not in internalLinks:
        if(link.attrs['href'].startswith('/')):
          internalLinks.append(includeUrl+link.attrs['href'])
        else:
          internalLinks.append(link.attrs['href'])
  return internalLinks


# 获取页面中所有外链的列表
def getExternalLinks(bs, excludeUrl):
  externalLinks = []
  # 找出所有以'http'或'www'开头且不包含当前URL的链接
  for link in bs.find_all('a',
    href=re.compile('^(http|www)((?! ' + excludeUrl + ').) *$')):
    if link.attrs['href'] is not None:
      if link.attrs['href'] not in externalLinks:
        externalLinks.append(link.attrs['href'])
  return externalLinks


# 收集在网站上发现的所有外链列表
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
  html = urlopen(siteUrl)
  domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                            urlparse(siteUrl).netloc)
  bs = BeautifulSoup(html, 'html.parser')
  internalLinks = getInternalLinks(bs, domain)
  externalLinks = getExternalLinks(bs, domain)

  for link in externalLinks:
    if link not in AllExtLinks:
      allExtLinks.add(link)
      print(link)
  
  for link in internalLinks:
    if link not in allIntLinks:
      allIntLinks.add(link)
      getAllExternalLinks(link)


def getRandomExternalLink(startingPage):
  html = urlopen(startingPage)
  bs = BeautifulSoup(html, 'html.parser')
  externalLinks = getExternalLinks(bs,
        urlparse(startingPage).netloc)
  if len(externalLinks) == 0:
    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(urlparse(startingPage).scheme,
        urlparse(startingPage).netloc)
    internalLinks = getInternalLinks(bs, domain)
    return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
  else:
    return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite)                               :
  externalLinks = getRandomExternalLink(startingSite)
  print('Random external link is :{}'.format(externalLink))
  followExternalOnly(externalLink)

# followExternalOnly('http://oreilly.com')
allIntLinks.add('https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E6%96%87%E5%8C%96')
# getAllExternalLinks('https://zh.wikipedia.org/wiki/Wikipedia:%E9%A6%96%E9%A1%B5')