In [None]:
%matplotlib inline

# imports
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# create handle to BeautifulSoup instance; the with-block closes the file
# once the parser has consumed it instead of leaving the handle open
with open("./data/gu-home.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# save all anchors from the page
all_anchors = soup.find_all('a')
all_anchors[:10]

In [None]:
# of the first 20 links, print all the ones with text (not linked images)
for anchor in all_anchors[:20]:
    label = anchor.text.strip()
    if not label:
        continue  # nothing to show for anchors without visible text
    # .get() falls back to '' so an anchor without an href attribute
    # does not abort the loop with a KeyError
    print(label + '\n' + anchor.get('href', '') + '\n')

In [None]:
# create a list of (text, href) tuples, one per anchor;
# .get() yields '' for anchors that have no href instead of raising KeyError
links = [(a.text, a.get('href', '')) for a in all_anchors]
links[:20]

In [None]:
# use urlparse to get the FQDNs (site addresses)
from urllib.parse import urlparse

# each tuple is (text, href, netloc of the href);
# .get() yields '' for anchors without an href (instead of raising KeyError),
# which in turn gives an empty netloc
links = [
    (a.text, a.get('href', ''), urlparse(a.get('href', '')).netloc)
    for a in all_anchors
]

# display example tuple
links[4:5]

In [None]:
# build a pandas DataFrame from the list of anchor tuples
import pandas as pd
pd.options.display.max_rows = 100  # show up to 100 rows when a frame is displayed

df = pd.DataFrame(data=links, columns=['title', 'href', 'site'])
df

In [None]:
# view quick summary statistics for the DataFrame's columns
df.describe()

In [None]:
# keep only the rows whose site value is not the empty string,
# then preview the first 10 of them
df = df.loc[df['site'].ne('')]
df.head(n=10)

In [None]:
# count how often each site occurs, most frequent first
dist = df['site'].value_counts(sort=True, ascending=False)
dist.head(n=10)

In [None]:
# switch matplotlib to the 'ggplot' style (optional step)
plt.style.use('ggplot')

# draw the site frequencies as a bar chart
dist.plot.bar(figsize=(14, 6));

In [None]:
# plot only those that are seen more than once;
# the trailing semicolon suppresses the Axes repr in the cell output
dist[dist > 1].plot(kind='bar');