# Scraping Wikipedia Pages
### Author: Sam Eure
### Date: May 13, 2021
#### [Code in github](https://github.com/euresa/statistics/blob/master/Python%20Projects/Wikipedia_Scraping/wiki_scraper.ipynb)

In this notebook, I created a short Python script for scraping Wikipedia pages in order to obtain the hyperlinks listed in each section of the Wikipedia pages. Given a link to a Wikipedia page, this script should do the following:

1) Find the title of the page 

2) Find the header of each section

3) Gather the plain text found in each section

4) Grab all the hyperlinks referenced in the paragraphs of each section (if present)

In the second part of this notebook, I organize this data into a Pandas dataframe and then do some basic NLP on the documents I found.

### Functions and Imports

In [1]:
import re

import requests  
from bs4 import BeautifulSoup


################### INPUTS ###########################

HTML_LINK = "https://en.wikipedia.org/wiki/Statistics"

################## END OF INPUTS #####################

#Scraping
def get_soup_doc(html_link, parser = 'html.parser'):
    '''Downloads a web page and parses it into a BeautifulSoup document.

    Parameters:
        html_link: URL of the page to download.
        parser: name of the parser handed to BeautifulSoup
            (defaults to 'html.parser').

    Returns:
        The BeautifulSoup document built from the body of the response.
    '''
    # Honour the arguments instead of the module-level HTML_LINK and a
    # hard-coded parser, so the function can scrape any page it is given.
    response = requests.get(html_link)
    soup_doc = BeautifulSoup(response.content, parser)
    return soup_doc

def get_title(soup_doc):
    '''Returns the text of the element whose id is "firstHeading" (the page title).'''
    heading = soup_doc.find(id='firstHeading')
    return heading.text

def get_headers(soup):
    '''Returns the 'mw-headline' <span> elements, used here as the section headers of the page.'''
    return soup.find_all('span', attrs='mw-headline')

def remove_footnotes(text):
    '''Strips bracketed footnote markers such as "[12]" from the text and returns the result.'''
    # Lazy match so each "[...]" is removed separately; "." does not cross newlines.
    bracketed = re.compile(r"\[.*?\]+")
    return bracketed.sub('', text)

def get_indices(soup, string_elements):
    '''Returns a list with the starting index of each string inside str(soup).

    Parameters:
        soup: the document to search; it is converted with str() first.
        string_elements: the strings to locate.

    Returns:
        One index per string, in the same order as string_elements. Each
        index is the position of the first occurrence of that string.

    Raises:
        ValueError: if one of the strings does not occur in the document.
    '''
    soup_string = str(soup)
    # Search for the strings that were passed in, not a module-level global.
    return [soup_string.index(element) for element in string_elements]

def get_index(text, element):
    '''Returns the index of the first occurrence of element in text, or None if it is absent.

    Only the "not found" case (ValueError from .index) is turned into None;
    any other error, such as passing an object without .index, propagates
    instead of being silently hidden.
    '''
    try:
        return text.index(element)
    except ValueError:
        return None
    
def combine(lists):
    '''Flattens one level: returns a new list holding the items of every inner list, in order.'''
    return [item for inner in lists for item in inner]

def collect_pattern_pairs(text, start_char, end_char):
    '''Returns a list of every span of text running from start_char through end_char.

    Parameters:
        text: the string to scan.
        start_char: marker that opens a span (may be longer than one character).
        end_char: marker that closes a span (may be longer than one character).

    Returns:
        The matched spans in order of appearance, each including both its
        start and end marker. The search for the next span resumes right
        after the previous end marker. A start marker with no end marker
        after it ends the scan; that unterminated fragment is not returned.
    '''
    collection = []
    cursor = 0
    while True:
        start = text.find(start_char, cursor)
        if start == -1:
            break
        # Look for the end marker from the start marker onwards.
        end = text.find(end_char, start)
        if end == -1:
            break  # unterminated span: nothing complete left to collect
        cursor = end + len(end_char)
        collection.append(text[start:cursor])
    return collection

def get_raw_paragraphs(soup, h_indices, i):
    '''Returns the unprocessed "<p>...</p>" HTML paragraphs of section i.

    Parameters:
        soup: the document; it is converted with str() before slicing.
        h_indices: starting index of each section header within str(soup).
        i: position of the wanted section in h_indices.

    Returns:
        A list of raw HTML strings, one per "<p>...</p>" pair found between
        header i and header i+1 (or the end of the document for the last
        header).
    '''
    soup_string = str(soup)
    section_start = h_indices[i]
    try:
        section_end = h_indices[i+1]
    except IndexError:
        # Last header: its section runs to the end of the document.
        section_end = None
    raw_text_i = soup_string[section_start:section_end]
    return collect_pattern_pairs(raw_text_i, "<p>", "</p>")

def get_paragraphs(soup, h_indices, i):
    '''Returns the paragraphs of section i as plain text.

    The raw HTML paragraphs come from get_raw_paragraphs and each one is
    passed through remove_HTML.
    '''
    return [remove_HTML(raw) for raw in get_raw_paragraphs(soup, h_indices, i)]

def remove_pattern_pair(text, start_char, end_char):
    '''Removes every span from start_char through end_char and returns the remaining text.

    Parameters:
        text: the string to clean.
        start_char: marker that opens a span to delete.
        end_char: marker that closes a span to delete.

    Returns:
        The text with each start-to-end span (markers included) removed.
        If a start marker has no end marker after it, removal stops there
        and the rest of the text is returned unchanged.
    '''
    while True:
        start = text.find(start_char)
        if start == -1:
            break
        # Look for the end marker from the start marker onwards.
        end = text.find(end_char, start)
        if end == -1:
            break  # unterminated span: leave the remainder as is
        text = text[:start] + text[end+len(end_char):]
    return text

def remove_substrings(text, substring_list):
    '''Removes all occurrences of all substrings in a list from a text string. Returns new string.

    Each entry is treated as literal text, not as a regular expression, so
    entries containing characters such as "." or "(" are removed exactly as
    written. Entries are processed in list order.
    '''
    for substring in substring_list:
        text = text.replace(substring, "")
    return text

def remove_HTML(text):
    '''Returns the text of an HTML fragment with its markup and bracketed footnotes stripped.

    Works in three passes: drop a few literal tags, drop everything between
    "<" and ">", then drop "[...]" footnote markers.
    '''
    literal_tags = ['</a>', '</sup>', '<p>', '</p>']
    without_literals = remove_substrings(text, literal_tags)
    without_tags = remove_pattern_pair(without_literals, '<', ">")
    return remove_footnotes(without_tags)

def clean_wiki_links(hlinks):
    '''Completes hyperlinks to other Wikipedia pages.

    Every entry containing "/wiki/" is cut at its first double quote (which
    drops trailing attributes such as title="..."). If what remains is a
    site-relative "/wiki/..." path, it is prefixed with
    "https://en.wikipedia.org". Other entries are left untouched.

    Parameters:
        hlinks: list of link strings; it is modified in place.

    Returns:
        The same list object, after cleaning.
    '''
    for i, link in enumerate(hlinks):
        if '/wiki/' not in link:
            continue
        # partition() copes with links that have no quote at all.
        href = link.partition('"')[0]
        # Only prefix relative paths, so absolute URLs are not doubled up.
        if href.startswith('/wiki/'):
            href = 'https://en.wikipedia.org' + href
        hlinks[i] = href
    return hlinks

def remove_internal_links(hlinks):
    '''Removes links that reference different parts of the Wikipedia page (those starting with "#").

    Parameters:
        hlinks: list of link strings; it is filtered in place.

    Returns:
        The same list object with the "#..." entries removed.
    '''
    # Rebuild the contents in one pass: calling remove() while iterating
    # would skip the entry right after each removed one. startswith() is
    # also safe on empty strings, where indexing [0] would raise.
    hlinks[:] = [link for link in hlinks if not link.startswith('#')]
    return hlinks
    
def clean_cite_notes(hlinks):
    '''Drops every link containing "#cite_note-" (citation references).

    The list is filtered in place and the same list object is returned.
    '''
    hlinks[:] = [link for link in hlinks if '#cite_note-' not in link]
    return hlinks

def clean_hyperlinks(html_hyperlinks):
    '''Turns raw "<a href=...>" tag strings into a list of cleaned hyperlinks.

    The anchor markup is stripped first, then the links go through
    clean_wiki_links, clean_cite_notes and remove_internal_links, in that order.
    '''
    markup = ['<a href="', '">']
    stripped = [remove_substrings(tag, markup) for tag in html_hyperlinks]
    return remove_internal_links(clean_cite_notes(clean_wiki_links(stripped)))

def get_hyperlink(text):
    '''Returns the cleaned hyperlinks found in a piece of HTML.

    Every span from '<a href=' through the next '>' is collected and then
    passed to clean_hyperlinks.
    '''
    return clean_hyperlinks(collect_pattern_pairs(text, '<a href=', '>'))

def show_some_text(text, max_chars=80):
    '''Returns the first max_chars characters of a string followed by "...".

    Parameters:
        text: the string to preview.
        max_chars: how many leading characters to keep (80 by default).

    Returns:
        The leading slice with "..." appended. The ellipsis is appended even
        when the text is shorter than max_chars.
    '''
    return text[:max_chars] + "..."

### Scraping the Data

In [2]:
# Download the page and find where each section header starts in the raw HTML.
soup = get_soup_doc(HTML_LINK)
headers = get_headers(soup)
header_strings = [str(h) for h in headers]
h_indices = get_indices(soup, header_strings)

# Per-section results; all three lists are index-aligned with `headers`.
section_headers = []
para_list = []
hyperlinks = []
print('PAGE TITLE:',get_title(soup))
printer_count = 0
for i, head in enumerate(headers):
    paragraphs = get_raw_paragraphs(soup, h_indices, i)
    hrefs = [get_hyperlink(p) for p in paragraphs]
    paras = [remove_HTML(p) for p in paragraphs]
    section_headers.append(head.text)
    para_list.append(paras)
    hyperlinks.append(combine(hrefs))
    if printer_count >= 3:
        continue  # only preview the first three sections
    printer_count += 1
    print("\n","#"*100, '\n\tSECTION:', head.text)
    for p in paras:
        print('\n', show_some_text(p))
    print('\nLINKS:', show_some_text(str(combine(hrefs))))
        

PAGE TITLE: Statistics

 #################################################################################################### 
	SECTION: Introduction

 Statistics is a mathematical body of science that pertains to the collection, an...

 In applying statistics to a problem, it is common practice to start with a popul...

 When a census is not feasible, a chosen subset of the population called a sample...

LINKS: ['https://en.wikipedia.org/wiki/Data', 'https://en.wikipedia.org/wiki/Mathematic...

 #################################################################################################### 
	SECTION: Mathematical statistics

 Mathematical statistics is the application of mathematics to statistics. Mathema...

LINKS: ['https://en.wikipedia.org/wiki/Mathematics', 'https://en.wikipedia.org/wiki/Mat...

 #################################################################################################### 
	SECTION: History

 The early writings on statistical interference date back to 

Some of the sections don't have any paragraphs associated with them. This is because I assigned paragraphs to more specific subsections as opposed to sections as a whole. I'll organize this data into a dataframe now.

### Organizing the Data

I'll organize the sections into a Pandas dataframe.

In [3]:
import pandas as pd


def join_strings(string_list, join_char = ' \n '):
    """Concatenates the strings in string_list, placing join_char between neighbours."""
    joined = join_char.join(string_list)
    return joined

def get_words_from_paragraphs(p_list):
    '''Joins the paragraphs with join_strings and splits the result on single spaces.

    Because the split is on " " only, the returned list can contain empty
    strings and the "\\n" separators inserted between paragraphs.
    '''
    return join_strings(p_list).split(" ")

def add_count_feature(df, feature):
    '''Adds a "<feature>_count" column holding the length of each value in column feature.

    Parameters:
        df: the dataframe to extend; it is modified in place.
        feature: name of an existing column whose values support len().

    Returns:
        The same dataframe, now containing the new count column.
    '''
    # Map over the single column rather than applying row-wise: it is
    # simpler and also works when the dataframe has no rows.
    df[feature+"_count"] = df[feature].map(len)
    return df


#Start processing
# One row per section; the list-valued columns hold that section's links and paragraphs.
wiki_df = pd.DataFrame({'section': section_headers, "hyperlinks": hyperlinks, 'paragraphs': para_list})
wiki_df['words'] = wiki_df.apply(lambda row : get_words_from_paragraphs(row.paragraphs), axis=1)
for list_column in ('hyperlinks', 'paragraphs', 'words'):
    wiki_df = add_count_feature(wiki_df, list_column)
wiki_df = wiki_df.set_index('section')
# Keep only the sections that actually have paragraphs.
has_paragraphs = wiki_df.paragraphs_count != 0
wiki_df = wiki_df[has_paragraphs]
wiki_df.head()

Unnamed: 0_level_0,hyperlinks,paragraphs,words,hyperlinks_count,paragraphs_count,words_count
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Introduction,"[https://en.wikipedia.org/wiki/Data, https://e...",[Statistics is a mathematical body of science ...,"[Statistics, is, a, mathematical, body, of, sc...",18,3,346
Mathematical statistics,"[https://en.wikipedia.org/wiki/Mathematics, ht...",[Mathematical statistics is the application of...,"[Mathematical, statistics, is, the, applicatio...",3,1,27
History,[https://en.wikipedia.org/wiki/Mathematics_in_...,[The early writings on statistical interferenc...,"[The, early, writings, on, statistical, interf...",51,8,758
Sampling,[https://en.wikipedia.org/wiki/Design_of_exper...,"[When full census data cannot be collected, st...","[When, full, census, data, cannot, be, collect...",10,3,242
Experimental and observational studies,"[https://en.wikipedia.org/wiki/Causality, http...",[A common goal for a statistical research proj...,"[A, common, goal, for, a, statistical, researc...",10,1,201


Now that I have the plain text from each of the sections, I can do some natural language processing to find the most popular words from each section. 

### NLP to Find Popular Words

I'll use spaCy to help with some of the natural language processing, such as identifying 'stop words'.

In [4]:
import spacy


def remove_punctuation(words):
    '''Returns the words that are not themselves a punctuation or whitespace token.

    Only entries exactly equal to one of the listed tokens are dropped;
    punctuation inside a longer word is left alone.
    '''
    unwanted = ('.', ',', ')', '(', '/', ']', '[', '\n', ' ', ';', ':', '"', "'", '\n \n ', '-')
    kept = []
    for token in words:
        if token in unwanted:
            continue
        kept.append(token)
    return kept

def get_top_n_strings(str_list, n=3):
    '''Finds the n most frequent strings in a list of strings.

    Parameters:
        str_list: the strings to count.
        n: how many of the most frequent strings to return.

    Returns:
        A list of (string, count) tuples ordered from most to least
        frequent. It holds fewer than n tuples when the input has fewer
        than n distinct strings, and is empty for an empty input.
    '''
    if len(str_list) == 0:
        return []
    word_df = pd.DataFrame({'words': str_list})
    word_vc = word_df.value_counts()
    # DataFrame.value_counts() is indexed by 1-tuples of the counted values,
    # so counts must be read by position (.iloc), never by integer label.
    n_available = min(n, len(word_vc))
    top_strings = [(word_vc.index[i][0], word_vc.iloc[i]) for i in range(n_available)]
    return top_strings

def remove_stop_words(doc, lemmas=False):
    '''Returns the tokens of doc that are not flagged as stop words (token.is_stop).

    With lemmas=False each kept token contributes its .text; with
    lemmas=True it contributes its .lemma_ (the base form of the word,
    e.g. "swim" for "swimming").
    '''
    attribute = "lemma_" if lemmas else "text"
    return [getattr(token, attribute) for token in doc if not token.is_stop]

def find_popular_words(nlp, words_list, n=3, lemmas=False):
    """Returns the n most frequent words of words_list that are neither stop words nor punctuation.

    The words are joined with single spaces, lower-cased and run through
    nlp. The resulting tokens are filtered by remove_stop_words and
    remove_punctuation, then counted by get_top_n_strings.
    """
    lowered = join_strings(words_list, join_char=" ").lower()
    candidates = remove_stop_words(nlp(lowered), lemmas=lemmas)
    return get_top_n_strings(remove_punctuation(candidates), n=n)


#Start NLP

#Load language model
nlp = spacy.load('en_core_web_sm')

# Most frequent non-stop word of every section, stored as a one-item list of (word, count).
top_words = wiki_df.apply(lambda row : find_popular_words(nlp, row.words, n=1), axis=1)
wiki_df['top word'] = top_words
wiki_df.head()

Unnamed: 0_level_0,hyperlinks,paragraphs,words,hyperlinks_count,paragraphs_count,words_count,top word
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Introduction,"[https://en.wikipedia.org/wiki/Data, https://e...",[Statistics is a mathematical body of science ...,"[Statistics, is, a, mathematical, body, of, sc...",18,3,346,"[(data, 16)]"
Mathematical statistics,"[https://en.wikipedia.org/wiki/Mathematics, ht...",[Mathematical statistics is the application of...,"[Mathematical, statistics, is, the, applicatio...",3,1,27,"[(mathematical, 3)]"
History,[https://en.wikipedia.org/wiki/Mathematics_in_...,[The early writings on statistical interferenc...,"[The, early, writings, on, statistical, interf...",51,8,758,"[(statistics, 11)]"
Sampling,[https://en.wikipedia.org/wiki/Design_of_exper...,"[When full census data cannot be collected, st...","[When, full, census, data, cannot, be, collect...",10,3,242,"[(population, 7)]"
Experimental and observational studies,"[https://en.wikipedia.org/wiki/Causality, http...",[A common goal for a statistical research proj...,"[A, common, goal, for, a, statistical, researc...",10,1,201,"[(studies, 6)]"
