### Webcrawler

In [137]:
import urllib
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd

In [138]:
# starting point of the crawl: the English Wikipedia article on
# "Black Country, New Road"
url = "https://en.wikipedia.org/wiki/Black_Country,_New_Road"

# one independent list per output column; the crawl appends to all three
# together so that equal indices describe the same link
url_info = {column: [] for column in ('source_url', 'link_url', 'link_title')}

In [139]:
# Crawl limits: keep going until at least MAX_LINKS links are recorded, taking
# at most LINKS_PER_PAGE new links from any single page. If a page yields
# fewer than LINKS_PER_PAGE links, a later page can push the total slightly
# past MAX_LINKS.
MAX_LINKS = 100
LINKS_PER_PAGE = 10

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
with urllib.request.urlopen(url) as response:
    parsed_page = BeautifulSoup(response.read(), 'html.parser')

# every link that was already recorded, or already tried and found unreachable
attempted_links = set(url_info['link_url'])

while len(url_info['link_url']) < MAX_LINKS:

    # href of every <a> tag that has a non-empty one, converted from a
    # relative URL to an absolute URL using the page URL it appears on
    url_links = [
        urllib.parse.urljoin(url, anchor.attrs['href'])
        for anchor in parsed_page.find_all('a')
        if anchor.attrs.get('href')
    ]

    num_scraped = 0
    next_page = None  # parsed page of the most recently collected link
    source = urllib.parse.urlparse(url)

    # loop over list of absolute URLs
    for link in url_links:

        parsed_link = urllib.parse.urlparse(link)

        # check if link is in same domain as source, skip if not
        if parsed_link.netloc != source.netloc:
            continue

        # check if link has same path as source (same page), skip if it does
        if parsed_link.path == source.path:
            continue

        # check if link has been collected (or tried) already, skip if it has
        if link in attempted_links:
            continue
        attempted_links.add(link)

        # download the linked page; the `with` block closes the connection,
        # and an unreachable link is skipped instead of aborting the crawl
        try:
            with urllib.request.urlopen(link) as link_response:
                link_page = BeautifulSoup(link_response.read(), 'html.parser')
        except urllib.error.URLError:
            continue

        # retrieve title of url; pages without a <title> get an empty title
        title_tag = link_page.title
        link_title = title_tag.get_text() if title_tag is not None else ''

        url_info['source_url'].append(url)
        url_info['link_url'].append(link)
        url_info['link_title'].append(link_title)
        next_page = link_page

        num_scraped += 1
        if num_scraped == LINKS_PER_PAGE:  # collected enough links on this page
            break

    # A page that produced no new link would otherwise be revisited forever
    # (or raise IndexError if nothing has been collected yet), so stop here.
    if next_page is None:
        break

    # continue from the last link collected; its page was already downloaded
    # and parsed above, so reuse it rather than fetching it a second time
    url = url_info['link_url'][-1]
    parsed_page = next_page

In [141]:
# build the table from the collected columns, replacing each link_title with
# the same text wrapped in literal double-quote characters
url_df = pd.DataFrame(url_info).assign(
    link_title=lambda frame: '"' + frame['link_title'] + '"'
)

In [142]:
# write the table to webcrawler.csv (comma-separated, UTF-8) without the
# DataFrame's row index
url_df.to_csv(
    path_or_buf='webcrawler.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)