<h1 style="color:#2288ff;">Wikipedia Web Crawl Case Study - Udacity</h1>

Aim: Starting from a random Wikipedia article, repeatedly follow the first valid link on each page and see whether the chain of articles eventually reaches the Philosophy page.

In [36]:
import requests
from bs4 import BeautifulSoup
import time
import urllib

### The <code>continue_crawl</code> function

Definition: <code>continue_crawl(search_history, target_url)</code>

<code>continue_crawl</code> should return <code>True</code> or <code>False</code> following these rules:

* If the most recent article in the search_history is the target article, the search should stop and the function should return False.
* If the list is more than 30 URLs long, the function should return False.
* If the list has a cycle in it, the function should return False.
* Otherwise the search should continue and the function should return True.

In [37]:
def continue_crawl(search_history, target_url, max_steps=30):
    """Decide whether the crawl should fetch another article.

    Args:
        search_history: List of the URLs visited so far, oldest first. The
            last entry may be ``None`` when the previous article yielded no
            link to follow.
        target_url: URL of the article the crawl is trying to reach.
        max_steps: Largest history length that is still allowed to continue;
            a history holding more than this many entries stops the crawl.

    Returns:
        ``True`` when the crawl should go on. ``False`` when the history is
        empty, the last article produced no link, the target was reached, the
        history is longer than ``max_steps``, or the last URL was already
        visited earlier in the history.
    """
    # Nothing to inspect: indexing [-1] on an empty list would raise.
    if not search_history:
        print("There is nothing to crawl from! Abort.")
        return False

    latest = search_history[-1]

    # A None entry marks a dead end; continuing would request a non-URL.
    if latest is None:
        print("You have reached an article with no links! Abort.")
        return False

    if latest == target_url:
        print("You found what you were looking for! Mission Accomplished.")
        return False

    if len(search_history) > max_steps:
        print("This charade has gone on for too long! Abort.")
        return False

    # The newest URL showing up earlier in the chain means we are looping.
    if latest in search_history[:-1]:
        print("You are getting into recursive hell! Abort.")
        return False

    return True

### The <code>while</code> loop
```python
while continue_crawl(article_chain, target_url):
    # download html of last article in article_chain
    # find the first link in that html
    # add the first link to article_chain
    # delay for about two seconds
```

In [40]:
def find_first_link(url):
    """Return the absolute URL of the first body link of a Wikipedia article.

    The page at ``url`` is downloaded and parsed. Only ``<a>`` tags that are
    direct children of a ``<p>`` sitting directly under the
    ``mw-parser-output`` element (inside ``mw-content-text``) are considered.

    Args:
        url: Address of the article to download.

    Returns:
        The first such link resolved against ``https://en.wikipedia.org/``,
        or ``None`` when the expected containers are missing or no paragraph
        holds a usable link.
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule on its own.
    from urllib.parse import urljoin

    # The timeout keeps an unresponsive server from stalling the crawl forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Either container may be absent (e.g. on a non-article page), so each
    # lookup is guarded instead of chaining .find() on a possible None.
    body = soup.find(id='mw-content-text')
    content = body.find(class_="mw-parser-output") if body is not None else None
    if content is None:
        return None

    for paragraph in content.find_all("p", recursive=False):
        # href=True skips anchors without a target so they cannot end the
        # search with an empty result.
        anchor = paragraph.find("a", href=True, recursive=False)
        if anchor is not None and anchor['href']:
            return urljoin('https://en.wikipedia.org/', anchor['href'])

    return None

def web_crawl(article_chain, target_url, delay=2):
    """Follow first links from the last article in the chain until told to stop.

    The chain is extended in place, one URL per iteration, for as long as
    ``continue_crawl`` returns ``True``. The final chain is printed.

    Args:
        article_chain: List of URLs visited so far; must already contain the
            starting URL. It is mutated by appending each newly found link.
        target_url: URL of the article the crawl is trying to reach.
        delay: Seconds to pause after each appended link before the next
            iteration.
    """
    while continue_crawl(article_chain, target_url):
        first_link = find_first_link(article_chain[-1])

        # find_first_link returns None on a dead end; appending it would make
        # the next iteration try to download a non-URL.
        if not first_link:
            print("We've arrived at an article with no links! Abort.")
            break

        article_chain.append(first_link)
        time.sleep(delay)

    print(article_chain)

### Running the program

In [42]:
# Both endpoints are identical for every run, so they are set once up front.
start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"

# Ten separate crawls, each starting from a fresh one-element chain.
for run_index in range(10):
    article_chain = [start_url]
    web_crawl(article_chain, target_url)

You are getting into recursive hell! Abort.
['https://en.wikipedia.org/wiki/Special:Random', 'https://en.wikipedia.org/wiki/Formation_(geology)', 'https://en.wikipedia.org/wiki/Lithostratigraphy', 'https://en.wikipedia.org/wiki/Stratigraphy', 'https://en.wikipedia.org/wiki/Geology', 'https://en.wikipedia.org/wiki/Ancient_Greek', 'https://en.wikipedia.org/wiki/Greek_language', 'https://en.wikipedia.org/wiki/Modern_Greek', 'https://en.wikipedia.org/wiki/Colloquialism', 'https://en.wikipedia.org/wiki/Vernacular', 'https://en.wikipedia.org/wiki/Language', 'https://en.wikipedia.org/wiki/Communication', 'https://en.wikipedia.org/wiki/Meaning_(semiotics)', 'https://en.wikipedia.org/wiki/Semiotics', 'https://en.wikipedia.org/wiki/Research', 'https://en.wikipedia.org/wiki/Knowledge', 'https://en.wikipedia.org/wiki/Fact', 'https://en.wikipedia.org/wiki/Evidence', 'https://en.wikipedia.org/wiki/Logical_assertion', 'https://en.wikipedia.org/wiki/Logic', 'https://en.wikipedia.org/wiki/Ancient_Greek

You are getting into recursive hell! Abort.
['https://en.wikipedia.org/wiki/Special:Random', 'https://en.wikipedia.org/wiki/Swimming_(sport)', 'https://en.wikipedia.org/wiki/Swimming_pool', 'https://en.wikipedia.org/wiki/Human_swimming', 'https://en.wikipedia.org/wiki/Propulsion', 'https://en.wikipedia.org/wiki/Technology', 'https://en.wikipedia.org/wiki/Ancient_Greek', 'https://en.wikipedia.org/wiki/Greek_language', 'https://en.wikipedia.org/wiki/Modern_Greek', 'https://en.wikipedia.org/wiki/Colloquialism', 'https://en.wikipedia.org/wiki/Vernacular', 'https://en.wikipedia.org/wiki/Language', 'https://en.wikipedia.org/wiki/Communication', 'https://en.wikipedia.org/wiki/Meaning_(semiotics)', 'https://en.wikipedia.org/wiki/Semiotics', 'https://en.wikipedia.org/wiki/Research', 'https://en.wikipedia.org/wiki/Knowledge', 'https://en.wikipedia.org/wiki/Fact', 'https://en.wikipedia.org/wiki/Evidence', 'https://en.wikipedia.org/wiki/Logical_assertion', 'https://en.wikipedia.org/wiki/Logic', 'h