In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Wikipedia:Protection_policy#semi
#mw-head
#searchInput
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
#cite_note-1
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
http://baconbros.com/
#cite_note-2
#cite_note-actor-3
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/X-Men:_First_Class
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/w

## Retrieving Articles Only

In [2]:
from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy
/wiki/The_Guardian
/wi

## Random Walk

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Access_Hollywood
/wiki/TV_One_(American_TV_channel)
/wiki/WWIN_(AM)
/wiki/WPRS-FM
/wiki/WFRE
/wiki/WFRB-FM
/wiki/FM_broadcasting
/wiki/Band_I
/wiki/Band_VI
/wiki/Band_I
/wiki/DVB-T2
/wiki/Mobile_broadband
/wiki/ISSN_(identifier)
/wiki/ISO_31-4
/wiki/ISO/TR_11941
/wiki/ISO_5775
/wiki/ISO_2711
/wiki/Quality_function_deployment
/wiki/ISLISP
/wiki/OpenLisp
/wiki/Genera_(operating_system)
/wiki/MIT_Computer_Science_and_Artificial_Intelligence_Laboratory
/wiki/Adversarial_machine_learning
/wiki/Deep_learning
/wiki/Cluster_analysis
/wiki/Regression_analysis
/wiki/Linear_combination
/wiki/Multilinear_algebra
/wiki/Geodesic
/wiki/Cauchy_stress_tensor
/wiki/Tensor_contraction
/wiki/Raising_and_lowering_indices#An_example_from_Minkowski_spacetime
/wiki/Vector_(mathematics_and_physics)
/wiki/Rotation_vector
/wiki/Thomas_Curtright
/wiki/MGP_(identifier)
/wiki/Alma_mater
/wiki/Oxford_Latin_Dictionary
/wiki/Alexander_Souter
/wiki/Regius_Professor_of_Humanity
/wiki/John_Behr
/wiki/Gregory_of_Naz

KeyboardInterrupt: 

## Recursively crawling an entire site

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Protection_policy#extended
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Reliable_sources/Perennial_sources
/wiki/Wikipedia:Reliable_sources
/wiki/Wikipedia:WikiProject_Reliability
/wiki/Wikipedia:WRE
/wiki/File:People_icon.svg
/wiki/Special:WhatLinksHere/File:People_icon.svg
/wiki/Help:What_links_here
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
/wiki/Wikipedia:Protection_policy#move
/wiki/Wikipedia:WikiProject_Parliamentary_Procedure
/wiki/File:People_icon_dead.svg
/wiki/User:StevenDH
/wiki/Wikipedia:User_pages
/wiki/Wikipedia:FUW
/wiki/Wikipedia:Protection_policy#template
/wiki/Wikipedia:Party_and_person
/wiki/File:Essay.svg
/wiki/File:Essay.png
/wiki/File:To_Commons.svg
/wiki/Special:WhatLinksHere/File:To_Commons.svg
/wiki/User_ta

KeyboardInterrupt: 

## Collecting Data Across an Entire Site

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

Main Page
<p><b><a href="/wiki/Scorpion" title="Scorpion">Scorpions</a></b> are <a href="/wiki/Predation" title="Predation">predatory</a> <a href="/wiki/Arachnid" title="Arachnid">arachnids</a> of the <a href="/wiki/Order_(biology)" title="Order (biology)">order</a> Scorpiones. They have eight legs, a pair of <a href="/wiki/Chela_(organ)" title="Chela (organ)">grasping pincers</a> and a narrow, segmented tail, often carried in a characteristic forward curve over the back and always ending with a <a href="/wiki/Stinger" title="Stinger">stinger</a>. There are over 2,500 described <a href="/wiki/Species" title="Species">species</a>. They mainly live in <a href="/wiki/Desert" title="Desert">deserts</a> but have adapted to a wide range of environments. Most species <a href="/wiki/Viviparity" title="Viviparity">give birth to live young</a>, and the female cares for the juveniles while their <a href="/wiki/Exoskeleton" title="Exoskeleton">exoskeletons</a> harden, transporting them on her back

Help:What links here
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
Wikipedia:Project namespace
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Protection_policy#move
Wikipedia:Protection policy
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:WikiProject_Parliamentary_Procedure
Wikipedia:WikiProject Parliamentary Procedure
<p><b>WikiProject Parliamentary Procedure</b> is devoted to improving the quality and comprehensiveness of articles on topics related to [[parliamentary proyOpen main menu
Wikipedia's 
M
Search
Wikipedia:WikiProject Parliamentary Procedure
Language's 
Watch
Edit
This WikiProject is believed to be inactive.
If you are not currently a member of the project, you may still want to consider joining it. This status should be changed if any 

List of ISO 639-1 codes
<p>ISO 639 is a standardized nomenclature used to classify languages. Each language is assigned a two-letter (639-1) and three-letter (<span class="nowrap">639-2</span> and <span class="nowrap">639-3</span>) lowercase abbreviation, amended in later versions of the nomenclature.
</p>
This page is missing something! Continuing.
--------------------
/wiki/File:Question_book-new.svg
File:Question book-new.svg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/en/9/99/Question_book-new.svg" title="Question book-new.svg">Original file</a> ‎<span class="fileInfo">(SVG file, nominally 512 × 399 pixels, file size: 6 KB)</span>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Scalable_Vector_Graphics
Scalable Vector Graphics
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/SVG_(disambiguation)
SVG (disambiguation)
<p><b><a class="mw-redirect" href="/wiki/SVG" title="SVG">SVG</a></b> 

Wikipedia:Shortcut
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Keyboard_shortcuts
Wikipedia:Keyboard shortcuts
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:WikiProject_Kansas
Wikipedia:WikiProject Kansas
<p><span style="font-size:100%;font-weight:bold;border: none; margin: 0; padding:0; padding-bottom:.1em; color:#FFD700;"><a class="image" href="/wiki/File:Seal_of_Kansas.svg"><img alt="Seal of Kansas.svg" data-file-height="600" data-file-width="600" decoding="async" height="48" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/48px-Seal_of_Kansas.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/72px-Seal_of_Kansas.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/96px-Seal_of_Kansas.svg.png 2x" width="48"/></a><br/><i>Welcome</i></span>
</p>
This page is mi

File:User-info.svg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/en/c/ce/User-info.svg" title="User-info.svg">Original file</a> ‎<span class="fileInfo">(SVG file, nominally 48 × 48 pixels, file size: 20 KB)</span>
</p>
This page is missing something! Continuing.
--------------------
/wiki/File:Full-protection-shackle.svg
File:Full-protection-shackle.svg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/en/4/44/Full-protection-shackle.svg" title="Full-protection-shackle.svg">Original file</a> ‎<span class="fileInfo">(SVG file, nominally 512 × 512 pixels, file size: 729 bytes)</span>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Criteria_for_speedy_deletion#F8
Wikipedia:Criteria for speedy deletion
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Revision_deletion
Wikipedia:Revision deletion
<p class="mw-empty-elt">
</p>
This page is missing something! Continu

Wikipedia:WikiProject Politics
<p>Welcome to <b>WikiProject Politics</b>. A <a href="/wiki/Wikipedia:WikiProject" title="Wikipedia:WikiProject">WikiProject</a> is a group of pages in the "Wikipedia" <a href="/wiki/Wikipedia:Namespace" title="Wikipedia:Namespace">article namespace</a> that are devoted to the management of a specific topic or family of topics within Wikipedia; and, simultaneously, a <a href="/wiki/Wikipedia:WikiProject_Politics/Participants" title="Wikipedia:WikiProject Politics/Participants">group of editors</a> who use those pages to collaborate on encyclopedic work. It is not a place to write encyclopedia articles directly, but a resource to help coordinate and organize the writing and editing of those articles. This group focuses on articles in the broad field of <a href="/wiki/Politics" title="Politics">Politics</a>.  This page and its subpages contain their suggestions; it is hoped that this project will help to focus the efforts of other Wikipedians. You may sign 

Wikipedia:WikiProject Vital Articles
<p>Welcome to Wikiproject Vital Articles! The goal of this <a href="/wiki/Wikipedia:WikiProject" title="Wikipedia:WikiProject">project</a> is to better organize <a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia's</a> "<a class="mw-redirect" href="/wiki/Wikipedia:VA" title="Wikipedia:VA">vital articles</a>." This page and its subpages can be used to focus our efforts to better cover this critical area of the project. Many of Wikipedia's most-visited pages are in need of much cleanup and maintenance. It is essential to keep these articles up-to-date. If you would like to help, please inquire on the <a href="/wiki/Wikipedia_talk:WikiProject_Vital_Articles" title="Wikipedia talk:WikiProject Vital Articles">talk page</a> and see the goals and tasks below.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:WikiProject_Council/Directory
Wikipedia:WikiProject Council/Directory
<p>WikiProject directory
</p>
This page is

Help:Editing
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Editing_policy
Wikipedia:Editing policy
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:List_of_guidelines#Editing
Wikipedia:List of guidelines
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:GUIDELINE
Wikipedia:Policies and guidelines
<p>Wikipedia's <b>policies and guidelines</b> are developed by the community to describe best practices, clarify principles, resolve conflicts, and otherwise further our goal of creating a free, reliable encyclopedia. There is no need to read any policy or guideline pages to start editing. The <a href="/wiki/Wikipedia:Five_pillars" title="Wikipedia:Five pillars">five pillars</a> are a popular summary of the most pertinent principles.
</p>
This page is missing something! Continuing.
--------------------
/wiki/

Nation
<p>A <b>nation</b> is a <a href="/wiki/Community" title="Community">community</a> of people formed on the basis of a common <a href="/wiki/Language" title="Language">language</a>, history, <a class="mw-redirect" href="/wiki/Ethnicity" title="Ethnicity">ethnicity</a>, or a common <a href="/wiki/Culture" title="Culture">culture</a>, and, in many cases, a shared territory. A nation is more overtly political than an <a href="/wiki/Ethnic_group" title="Ethnic group">ethnic group</a>;<sup class="reference" id="cite_ref-black_1-0"><a href="#cite_note-black-1">[1]</a></sup><sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup> it has been described as "a fully mobilized or institutionalized ethnic group".<sup class="reference" id="cite_ref-FOOTNOTEEller1997_3-0"><a href="#cite_note-FOOTNOTEEller1997-3">[3]</a></sup> Some nations are equated with ethnic groups (see <a href="/wiki/Ethnic_nationalism" title="Ethnic nationalism">ethnic nationalism</a> and <a href="/wiki

Wikipedia:WikiProject Norway
<p><i>You might have been looking for <a href="/wiki/Wikipedia:What_Wikipedia_is_not" title="Wikipedia:What Wikipedia is not">Wikipedia:What Wikipedia is not</a>, <a href="/wiki/Wikipedia:Notability" title="Wikipedia:Notability">Wikipedia:Notability</a> or <a href="/wiki/Wikipedia:WikiProject_New_Orleans" title="Wikipedia:WikiProject New Orleans">Wikipedia:WikiProject New Orleans</a>.</i>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Notability
Wikipedia:Notability
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:NPOV
Wikipedia:Neutral point of view
<p>All encyclopedic content on <a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia</a> must be written from a <b>neutral point of view</b> (<b>NPOV</b>), which means representing fairly, proportionately, and, as far as possible, without editorial bias, all the significant <a href="/wiki/Point_of_view_(philosop

Wikipedia talk:Did you know
<p><br/>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Main_Page/Errors
Wikipedia:Main Page/Errors
<p><span id="coordinates"><a href="/wiki/Wikipedia:Main_Page/Errors/Administrator_instructions" title="Wikipedia:Main Page/Errors/Administrator instructions">Administrator instructions</a></span></p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Main_Page
Main Page
<p><b><a href="/wiki/Scorpion" title="Scorpion">Scorpions</a></b> are <a href="/wiki/Predation" title="Predation">predatory</a> <a href="/wiki/Arachnid" title="Arachnid">arachnids</a> of the <a href="/wiki/Order_(biology)" title="Order (biology)">order</a> Scorpiones. They have eight legs, a pair of <a href="/wiki/Chela_(organ)" title="Chela (organ)">grasping pincers</a> and a narrow, segmented tail, often carried in a characteristic forward curve over the back and always ending with a <a href="/wiki/Stinger" title="Stinger">s

Real Life
<p><b>Real Life</b> may refer to:
</p>
This page is missing something! Continuing.
--------------------
/wiki/Real_Life_(1979_film)
Real Life (1979 film)
<p><i><b>Real Life</b></i> is a 1979 American <a href="/wiki/Comedy_film" title="Comedy film">comedy film</a> starring <a href="/wiki/Albert_Brooks" title="Albert Brooks">Albert Brooks</a> (in his <a class="mw-redirect" href="/wiki/Directorial_debut" title="Directorial debut">directorial debut</a>), who also co-authored the screenplay. It is a <a href="/wiki/Parody" title="Parody">spoof</a> of the 1973 <a href="/wiki/Reality_television" title="Reality television">reality television</a> program <i><a href="/wiki/An_American_Family" title="An American Family">An American Family</a></i> and portrays a documentary filmmaker named Albert Brooks who attempts to live with and film a <a href="/wiki/Dysfunctional_family" title="Dysfunctional family">dysfunctional family</a> for one full year.
</p>
This page is missing something! Cont

Category:Wikipedia naming conventions
<p><b>Category:Wikipedia naming conventions</b> is for pages that have received general consensus as English Wikipedia  guidelines to supplement and explain the <a href="/wiki/Wikipedia:Article_titles" title="Wikipedia:Article titles">Wikipedia:Article titles</a> policy.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Category:Naming_conventions
Category:Naming conventions
<p>This category has the following 12 subcategories, out of 12 total.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Help:Categories
Help:Categories
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:FAQ/Readers
Wikipedia:FAQ/Readers
<p><b>FAQs (Frequently Asked Questions)</b>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:FAQ
Wikipedia:FAQ
<p><b>FAQs (Frequently Asked Questions)</b>
</p>
This page is missing something! Contin

Wikipedia:Policies and guidelines
<p>Wikipedia's <b>policies and guidelines</b> are developed by the community to describe best practices, clarify principles, resolve conflicts, and otherwise further our goal of creating a free, reliable encyclopedia. There is no need to read any policy or guideline pages to start editing. The <a href="/wiki/Wikipedia:Five_pillars" title="Wikipedia:Five pillars">five pillars</a> are a popular summary of the most pertinent principles.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:Principles
Wikipedia:Principles
<p>All of these pages are intended to be informative and helpful to editors. None of these pages themselves are official policies or guidelines of the English Wikipedia community (see <a class="mw-redirect" href="/wiki/Wikipedia:Local_consensus" title="Wikipedia:Local consensus">WP: local consensus</a> for details).
</p>
This page is missing something! Continuing.
--------------------
/wiki/English_Wikipedi

Category:Wikipedia content guidelines
<p><br/></p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:List_of_guidelines#Content
Wikipedia:List of guidelines
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:List_of_policies
Wikipedia:List of policies
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia_talk:List_of_policies
Wikipedia talk:List of policies and guidelines
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/File:Peacedove.svg
File:Peacedove.svg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/commons/6/6c/Peacedove.svg" title="Peacedove.svg">Original file</a> ‎<span class="fileInfo">(SVG file, nominally 714 × 729 pixels, file size: 24 KB)</span>
</p>
This page is missing something! Continuing.
--------------------
/wiki/Special:WhatLinksHere/File:Peacedove.svg
Pag

Style guide
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/List_of_fashion_magazines
List of fashion magazines
<p>This is a <b>list of <a href="/wiki/Notability_in_the_English_Wikipedia" title="Notability in the English Wikipedia">notable</a> fashion magazines</b>.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Notability_in_the_English_Wikipedia
Notability in the English Wikipedia
<p>In the <a href="/wiki/English_Wikipedia" title="English Wikipedia">English version</a> of the online encyclopedia <a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia</a>, <a href="/wiki/Notability" title="Notability">notability</a> is a criterion to determine whether a topic merits a separate Wikipedia article. It is described in the guideline "<a href="/wiki/Wikipedia:Notability" title="Wikipedia:Notability">Wikipedia:Notability</a>". In general, notability is an attempt to assess whether the topic has "gained sufficient

Almanac (disambiguation)
<p>An <b><a href="/wiki/Almanac" title="Almanac">almanac</a></b> is an annual publication containing tabular information in a particular field or fields often arranged according to the calendar.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Nautical_almanac
Nautical almanac
<p>A <b>nautical almanac</b> is a publication describing the positions of a selection of <a class="mw-redirect" href="/wiki/Celestial_bodies" title="Celestial bodies">celestial bodies</a> for the purpose of enabling navigators to use <a href="/wiki/Celestial_navigation" title="Celestial navigation">celestial navigation</a> to determine the position of their ship while at sea. The Almanac specifies for each whole hour of the year the position on the Earth's surface (in <a href="/wiki/Declination" title="Declination">declination</a> and <a class="mw-redirect" href="/wiki/Greenwich_meridian" title="Greenwich meridian">Greenwich</a> <a href="/wiki/Hour_angle" title=

Periodicity
<p><b>Periodicity</b> or <b>periodic</b> may refer to:
</p>
This page is missing something! Continuing.
--------------------
/wiki/Bott_periodicity_theorem
Bott periodicity theorem
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Mathematics
Mathematics
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Mathematics_(disambiguation)
Mathematics (disambiguation)
<p><b><a href="/wiki/Mathematics" title="Mathematics">Mathematics</a></b> is a field of knowledge.
</p>
This page is missing something! Continuing.
--------------------
/wiki/Mathematics_(album)
Mathematics (album)
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/Help:Referencing_for_beginners
Help:Referencing for beginners
<p class="mw-empty-elt">
</p>
This page is missing something! Continuing.
--------------------
/wiki/File:Webcomic_xkcd_-_Wikipedian_protester.pn

Free software
<p><b>Free software</b> (or <b>libre software</b>)<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup><sup class="reference" id="cite_ref-:0_2-0"><a href="#cite_note-:0-2">[2]</a></sup> is <a href="/wiki/Software" title="Software">computer software</a> distributed under terms that allow users to run the software for any purpose as well as to study, change, and distribute it and any adapted versions.<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup><sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup><sup class="reference" id="cite_ref-def_5-0"><a href="#cite_note-def-5">[5]</a></sup><sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[6]</a></sup> Free software is a matter of <a href="/wiki/Liberty" title="Liberty">liberty</a>, not price; all users are legally free to do what they want with their copies of a free software (including profiting from them) regardless of how much is paid to obtain

Wikipedia talk:WikiProject Libraries/Members
<p>I wanted to add my name to the list, and so doing, found Ludvikus' request to organize the list alphabetically. In so doing, I decided to go the extra mile and made a couple of changes: 
</p>
This page is missing something! Continuing.
--------------------
/wiki/Wikipedia:WikiProject_Libraries/Resources
Wikipedia:WikiProject Libraries/Resources


IndexError: list index out of range

## Crawling across the Internet

In [6]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
#페이지에서 발견된 내부 링크를 모두 목록으로 만듭니다.
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    # / 로 시작하는 링크를 모두 찾습니다.
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
            
#Retrieves a list of all external links found on a page
#페이지에서 발견된 외부 링크를 모두 목록으로 만듭니다.
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
    #현재 URL을 포함하지 않으면서 http나 www로 시작하는 링크를 모두 찾습니다.
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
    
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)
            
followExternalOnly('http://oreilly.com')

Random external link is: https://www.amazon.com/OReilly-Media-Inc/dp/B087YYHL5C/ref=sr_1_2?dchild=1&keywords=oreilly&qid=1604964116&s=mobile-apps&sr=1-2


HTTPError: HTTP Error 503: Service Unavailable

## Collect all External Links from a Site

In [7]:
#사이트에서 찾은 외부 URL을 모두 리스트로 수집합니다.
# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print('11111>>link')
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            print('22222>>link')
            getAllExternalLinks(link)


allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

11111>>link
https://www.oreilly.com
11111>>link
https://learning.oreilly.com/accounts/login-check/
11111>>link
https://www.oreilly.com/online-learning/try-now.html
11111>>link
https://www.oreilly.com/online-learning/teams.html
11111>>link
https://www.oreilly.com/online-learning/business.html
11111>>link
https://www.oreilly.com/online-learning/government.html
11111>>link
https://www.oreilly.com/online-learning/academic.html
11111>>link
https://www.oreilly.com/online-learning/individuals.html
11111>>link
https://www.oreilly.com/online-learning/features.html
11111>>link
https://www.oreilly.com/online-learning/feature-certification.html
11111>>link
https://www.oreilly.com/online-learning/intro-interactive-learning.html
11111>>link
https://www.oreilly.com/online-learning/live-events.html
11111>>link
https://www.oreilly.com/online-learning/feature-answers.html
11111>>link
https://www.oreilly.com/radar/
11111>>link
https://www.oreilly.com/content-marketing-solutions.html
11111>>link
https://w

11111>>link
https://www.oreilly.com/about/
11111>>link
http://www.oreilly.com
11111>>link
http://www.oreilly.com/about/sebastopol_directions.html
11111>>link
http://www.oreilly.com/about/boston_directions.html
11111>>link
http://www.oreilly.com.cn/
11111>>link
http://www.oreilly.co.jp
22222>>link
11111>>link
http://oreilly.com/oreilly/privacy.html
22222>>link
11111>>link
https://www.oreilly.com/terms/
11111>>link
https://learning.oreilly.com/membership-agreement/
11111>>link
https://www.safaribooksonline.com/membership-agreement/
11111>>link
https://www.safaribooksonline.com/terms/
11111>>link
https://feedback-form.truste.com/watchdog/request
11111>>link
https://edpo.com/gdpr-data-request/
11111>>link
https://www.oreilly.com/privacy-cookies.html
11111>>link
https://www.oreilly.com/privacy-categories-disclosed-for-business-purpose.html
11111>>link
https://www.oreilly.com/privacy-categories-disclosed-for-valuable-consideration.html
11111>>link
https://www.privacyshield.gov
11111>>link
ht

HTTPError: HTTP Error 404: Not Found