# Exercise 3

Web scraping and graph construction

- Scrape the links from Wikipedia articles
- Arrange them in a graph (pages as nodes, links as edges)

In [1]:
from urllib.error import URLError
from urllib.request import urlopen

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from bs4 import BeautifulSoup

In [2]:
# Module-level size limit read by insert_link (node count of the graph) and
# create_graph (length of the look-up list).
max_nodes = 10000

def get_links_wikipedia(name):
    """Return the article names linked from one English Wikipedia page.

    Downloads ``https://en.wikipedia.org/wiki/<name>``, parses it with
    BeautifulSoup and collects the ``href`` of every anchor that starts with
    ``/wiki/`` and does not contain one of the excluded markers below.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL.

    Returns:
        A list of link targets with the leading ``/wiki/`` removed, in
        document order. Duplicates are kept.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    prefix = '/wiki/'
    # Substrings that identify functional (non-article) pages.
    excluded_markers = (
        'File:', 'Category:', 'Wikipedia:', 'Special:', 'Help:',
        'Portal:', 'Talk:', 'Main_Page', 'Template',
    )

    url = 'https://en.wikipedia.org/wiki/' + name
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")

    links = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Keep only site-internal article links. startswith() is also safe
        # for an empty href, where indexing href[0] would raise IndexError.
        if not href.startswith(prefix):
            continue
        if any(marker in href for marker in excluded_markers):
            continue
        links.append(href[len(prefix):])
    return links

def insert_link(graph, name):
    """Add a page and all pages it links to into ``graph``.

    Looks up the links of the Wikipedia article ``name``, adds every linked
    page that is not yet a node, and connects ``name`` to each linked page.

    The module-level ``max_nodes`` limit is only checked once, before the
    page is fetched, so a single call may push the graph past that limit.

    Args:
        graph: Graph that is modified in place.
        name: Page name to look up.

    Returns:
        A tuple ``(graph, new_inserted)`` where ``new_inserted`` lists the
        names of the nodes that were newly added from the page's links, in
        the order they were found. It is empty when the graph is already
        full.
    """
    if len(graph.nodes) >= max_nodes:
        return graph, []

    new_inserted = []
    for link in get_links_wikipedia(name):
        if link not in graph.nodes:
            graph.add_node(link)
            new_inserted.append(link)
        graph.add_edge(name, link)

    # add_edge only creates `name` implicitly when the page has at least one
    # link; make sure the page itself is a node even when it has none.
    graph.add_node(name)

    return graph, new_inserted


def create_graph(start):
    """Build a link graph by crawling Wikipedia breadth-first from ``start``.

    Pages are processed in the order they were discovered. Crawling stops
    when the look-up list reaches the module-level ``max_nodes``, when every
    discovered page has been processed, or when a page cannot be fetched.

    Args:
        start: Name of the page the crawl starts from.

    Returns:
        A tuple ``(graph, look_up)``: the graph built so far and the list of
        page names discovered so far, in discovery order.

    Raises:
        urllib.error.URLError: If the start page itself cannot be fetched.
    """
    graph = nx.Graph()
    graph, first_links = insert_link(graph, start)
    look_up = list(first_links)

    i = 0
    while len(look_up) < max_nodes:
        # Explicit bounds check instead of catching IndexError, which would
        # also hide unrelated IndexErrors raised while scraping a page.
        if i >= len(look_up):
            print(i)
            return graph, look_up

        page = look_up[i]
        try:
            graph, new_links = insert_link(graph, page)
        except URLError:
            # The page could not be fetched: return what was collected.
            return graph, look_up

        look_up.extend(new_links)
        if new_links:
            print(f'Found {len(new_links)} new links at {page}. Total {len(look_up)} after scraping {i} links.')
        else:
            print(f'Found  no new links at {page}.')

        i += 1

    return graph, look_up

def create_nodes(start, max_nodes=1000):
    """Create a graph whose nodes are pages reachable from ``start``.

    Pages are discovered breadth-first: the links of ``start`` first, then
    the links of each discovered page in discovery order. Only nodes are
    created; no edges are added.

    Args:
        start: Name of the page the crawl starts from.
        max_nodes: Maximum number of nodes in the returned graph.

    Returns:
        A graph with the first ``max_nodes`` distinct pages in discovery
        order (``start`` included). It has fewer nodes when the crawl runs
        out of pages before the limit is reached.
    """
    pages = [start]
    seen = {start}  # O(1) membership test; also avoids re-fetching a page
    i = 0
    # Stop once enough distinct pages are known or nothing is left to crawl,
    # so pages[i] can never run past the end of the list.
    while len(pages) < max_nodes and i < len(pages):
        for page in get_links_wikipedia(pages[i]):
            if page not in seen:
                seen.add(page)
                pages.append(page)
        i += 1

    G = nx.Graph()
    # The last page crawled may have overshot the limit; max() keeps a
    # non-positive limit from turning into a negative slice bound.
    G.add_nodes_from(pages[:max(max_nodes, 0)])
    return G

def create_edges(graph):
    """Connect the existing nodes of ``graph`` according to their page links.

    For every node the corresponding Wikipedia page is fetched, and an edge
    is added to each linked page that is already a node of the graph. No new
    nodes are created. The number of matching links is printed per node.

    Args:
        graph: Graph whose node names are Wikipedia page names; modified in
            place.
    """
    for page in graph.nodes:
        # Only links that point at pages already in the graph become edges.
        targets = [
            target
            for target in get_links_wikipedia(page)
            if target in graph.nodes
        ]
        graph.add_edges_from((page, target) for target in targets)
        print(f'Added {len(targets)} edges to node {page}')

In [None]:
# Collect the nodes starting from the TU Dortmund article (default node
# limit), then connect them according to the links between their pages.
graph = create_nodes(start='Technical_University_of_Dortmund')
create_edges(graph)

Added 224 to node Technical_University_of_Dortmund
Added 504 to node Public_university
Added 329 to node Rector_(academia)
Added 335 to node Dortmund
Added 135 to node North_Rhine-Westphalia
Added 409 to node Germany
Added 170 to node Urban_area
Added 42 to node Suburb
Added 150 to node German_language
Added 110 to node University
Added 84 to node Ruhr_area
Added 399 to node Europe
Added 72 to node Physics
Added 107 to node Electrical_engineering
Added 54 to node Chemistry
Added 212 to node Economics
Added 49 to node Support-vector_machine
Added 3 to node RapidMiner
Added 25 to node Natural_science
Added 100 to node Engineering
Added 31 to node Pedagogy
Added 61 to node Special_education
Added 36 to node Journalism
Added 2 to node Federation_of_International_Robot-soccer_Association
Added 178 to node Ruhr_University_Bochum
Added 198 to node University_of_Duisburg-Essen
Added 8 to node Zeitgeist
Added 15 to node Monorail
Added 12 to node H-Bahn
Added 61 to node Mathematics
Added 72 to n

In [None]:
# Draw the link graph on a single small axes; node labels are switched off.
fig = plt.figure(figsize=(4, 4), dpi=100)
ax = fig.add_subplot(1, 1, 1)
nx.draw(graph, ax=ax, with_labels=False, font_weight='bold')

# Alternative layout, kept for reference:
#ax = fig.add_subplot(122)
#nx.draw_kamada_kawai(graph, ax = ax, with_labels=True, font_weight='bold')

In [None]:
# Number of edges in the graph (the edge view supports len() directly).
len(graph.edges)