# Scraping data about mindfulness from Wikipedia

Emilio Lehoucq - 4/17/24

## Importing libraries

In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
from re import sub
from time import sleep
from random import randint

## Defining custom functions for this script

In [2]:
def remove_excess_line_breaks(input_string):
    '''
    Collapse long runs of line breaks into a single blank line.
    A run is three or more consecutive newlines, or three or more consecutive
    space-plus-newline pairs; every such run is replaced by two newlines.
    Arg: input_string (str)
    Return: cleaned string (str)
    Dependencies: sub from re
    '''
    # Either alternative marks an "excessive" vertical gap
    excess_breaks = r'(( \n){3,}|\n{3,})'
    collapsed = sub(excess_breaks, '\n\n', input_string)
    return collapsed

def remove_extra_spaces(string):
    '''
    Shrink every run of three or more spaces down to exactly two spaces.
    Runs of one or two spaces are left untouched.
    Arg: string (str)
    Return: cleaned string (str)
    Dependencies: sub from re
    '''
    long_space_run = r' {3,}'
    shortened = sub(long_space_run, '  ', string)
    return shortened

def remove_extra_tabs(string):
    '''
    Collapse long runs of tabs into a single tab.
    A run is three or more consecutive tabs, or three or more consecutive
    space-plus-tab pairs; every such run is replaced by one tab.
    Arg: string (str)
    Return: cleaned string (str)
    Dependencies: sub from re
    '''
    excess_tabs = r'(( \t){3,}|\t{3,})'
    collapsed = sub(excess_tabs, '\t', string)
    return collapsed

def extract_text(soup_object):
    '''
    Pull the visible text out of a parsed page and tidy its whitespace.
    Input: BeautifulSoup object
    Output: text (str) with non-breaking spaces normalised, outer whitespace
            stripped, and long runs of line breaks, spaces and tabs collapsed.
    Dependencies: BeautifulSoup from bs4, plus remove_excess_line_breaks,
                  remove_extra_spaces and remove_extra_tabs from this script
    '''
    # Text fragments are joined with a single space. strip=True is deliberately not
    # passed to get_text: it would strip every fragment, and some of that
    # whitespace is wanted to keep the page's structure.
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    raw_text = soup_object.get_text(" ")
    # Turn non-breaking spaces into regular ones, then trim both ends
    cleaned = raw_text.replace('\xa0', ' ').strip()
    # Apply the whitespace clean-ups in order: line breaks, spaces, tabs
    for tidy in (remove_excess_line_breaks, remove_extra_spaces, remove_extra_tabs):
        cleaned = tidy(cleaned)
    return cleaned

def _wikipedia_article_urls(soup):
    """
    Helper: list the absolute URLs of the Wikipedia articles linked from a parsed page.
    Duplicates are dropped while keeping the order of first appearance in the page, and
    URLs containing any undesired string (e.g., 'Special:', 'File:') are filtered out.
    Input: soup (BeautifulSoup object)
    Output: list of strings
    """
    # List of things that I don't want the final URLs to have
    undesired_in_urls = ['Main_Page', 'CiteSeer', '(identifier)', 'Special:', 'Wikipedia:', 'Help:', 'Talk:', 'File:', 'Category:', 'Template_talk:', 'Template:', 'Portal:']
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Only links to Wikipedia articles; a missing or empty href can't start with '/wiki/'
    candidates = ['https://en.wikipedia.org' + href for href in hrefs if href is not None and href.startswith('/wiki/')]
    # dict.fromkeys removes duplicates but, unlike set(), preserves page order,
    # so "the first ten percent" really means the links nearest the top of the article
    unique_urls = list(dict.fromkeys(candidates))
    return [candidate for candidate in unique_urls if not any(undesired in candidate for undesired in undesired_in_urls)]


def _sample_link_indices(n_urls):
    """
    Helper: draw (with replacement) 7 indices from the first ten percent of the links and 3 from the rest.
    Input: n_urls (int), number of candidate links
    Output: list of 10 ints in [0, n_urls - 1], or an empty list when there are no links
    """
    if n_urls == 0:
        return []
    last_index = n_urls - 1
    # First ten percent of the URLs
    first_ten_percent = round(n_urls * 0.1)
    rest_start = first_ten_percent + 1
    if rest_start > last_index:
        # Too few links for a separate "rest" (randint would raise ValueError on an empty range),
        # so draw the remaining three from the whole list instead
        rest_start = 0
    # randint is inclusive on both ends
    return [randint(0, first_ten_percent) for _ in range(7)] + [randint(rest_start, last_index) for _ in range(3)]


def scrape_url(url, data_frame):
    """
    Function to get some of the Wikipedia articles linked to in a Wikipedia article and add them to a data frame.
    On success, one row is appended to data_frame in place: the requested URL, the text of the page's first <h1>
    ('' if there is none), the raw HTML bytes, the cleaned text, and a list of sampled article URLs
    (10 entries, possibly with repeats; empty if the page links to no usable article).
    If the request raises or the status code is not 200, a message is printed and data_frame is left untouched.
    Inputs:
      url (string)
      data_frame (Pandas data frame with columns 'url', 'title', 'source_code', 'text', 'hyperlinks')
    Output: None
    Dependencies: get from requests, BeautifulSoup from bs4, randint from random, Pandas, and extract_text from this script
    """
    # Imported here so this function does not depend on a change to the top-of-file imports
    from requests import RequestException

    # Make request. The timeout keeps one stalled connection from hanging the whole crawl,
    # and network errors are reported the same way as a bad status code instead of aborting.
    try:
        response = get(url, timeout=30)
    except RequestException as error:
        print("Failed to retrieve data.")
        print("Error: ", error)
        print("URL: ", url)
        return
    # If request fails, print some info and return None
    if response.status_code != 200:
        print("Failed to retrieve data.")
        print("Status code: ", response.status_code)
        print("URL: ", url)
        return
    # Get HTML (kept as bytes) and parse it
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    # Get relevant URLs, in page order
    article_urls = _wikipedia_article_urls(soup)
    # Select some of the URLs
    # A person wouldn't click on all the links in a Wikipedia article. They'd click on some. Probably more at the beginning than at the end.
    sampled_urls = [article_urls[i] for i in _sample_link_indices(len(article_urls))]
    # Guard against pages without an <h1>
    heading = soup.find('h1')
    title = heading.text if heading is not None else ''
    # Add data to data frame
    data_frame.loc[len(data_frame)] = [url, title, html, extract_text(soup), sampled_urls]

def recursive_scrape(url, depth, combined_data_frame):
    """
    Recursively scrape Wikipedia articles and add to existing data frame.
    The page at url is scraped and appended; while depth > 0, each hyperlink sampled from that page
    is scraped in turn with depth - 1, sleeping 1-3 seconds before every follow-up request.
    If the page could not be scraped, nothing is appended and no links are followed.
    Input: url (string), depth (int), combined_data_frame (data frame)
    Output: data frame
    Dependencies: randint from random, sleep from time, Pandas, and scrape_url from this script
    """
    # Create a new data frame to store the scraped data
    new_data = pd.DataFrame(columns=['url', 'title', 'source_code', 'text', 'hyperlinks'])
    # Scrape the URL
    scrape_url(url, new_data)
    # scrape_url adds no row when the request fails: nothing to combine and nothing to follow
    if new_data.empty:
        return combined_data_frame
    # Combine the new data with the existing data
    combined_data_frame = pd.concat([combined_data_frame, new_data], ignore_index=True)
    # If not reached the maximum recursion depth, continue recursion
    if depth > 0:
        # Read the links from the row just scraped. Looking the URL up in the combined data
        # would return the links of an earlier visit whenever the same article is scraped twice.
        for new_url in new_data['hyperlinks'].iloc[0]:
            # Sleep for a bit
            sleep(randint(1, 3))
            # Recursively scrape the new URL
            combined_data_frame = recursive_scrape(new_url, depth - 1, combined_data_frame)
    # Return the combined data
    return combined_data_frame

def scrape_wikipedia(depth=0, start_url='https://en.wikipedia.org/wiki/Mindfulness'):
    """
    Recursively scrape Wikipedia articles up to a certain depth and return data.
    Input:
        depth (int): how many levels of links to follow from the starting article (0 = only the starting article)
        start_url (string): article to start from; defaults to the one on mindfulness
    Output: Data frame of scraped Wikipedia articles with columns 'url', 'title', 'source_code', 'text', 'hyperlinks'
    Dependencies: Pandas, and recursive_scrape from this script
    """
    # Empty data frame that the recursion fills in
    result_data = pd.DataFrame(columns=['url', 'title', 'source_code', 'text', 'hyperlinks'])
    # Run the recursive scraping process from the starting article and return the data
    return recursive_scrape(start_url, depth, result_data)

# Testing basic functionality
# test_data_frame = pd.DataFrame(columns=['url', 'title', 'source_code', 'text', 'hyperlinks'])
# scrape_url('https://en.wikipedia.org/wiki/Mindfulness', test_data_frame)
# assert len(test_data_frame['hyperlinks'].iloc[0]) == 10
# test_scrape_wikipedia = scrape_wikipedia(0)
# assert len(test_scrape_wikipedia['hyperlinks'].iloc[0]) == 10
# test_scrape_wikipedia_2 = scrape_wikipedia(1)
# assert test_scrape_wikipedia_2.shape[0] == 11
# assert len(test_scrape_wikipedia_2['hyperlinks'].explode().tolist()) == 10*10+10

## Collect data

In [3]:
# Follow sampled links four levels deep from the starting article, to simulate a reader
# clicking on one link, then another, ... (getting in a little rabbit hole haha)
df = scrape_wikipedia(depth=4)

## Taking a look at the data

In [5]:
# Print the (rows, columns) shape of the scraped data
print(f'Shape: {df.shape}')
# Preview up to the first 20 rows; as the cell's last expression, the notebook displays it
df.head(20)

Shape: (11111, 5)


Unnamed: 0,url,title,source_code,text,hyperlinks
0,https://en.wikipedia.org/wiki/Mindfulness,Mindfulness,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Mindfulness - Wikipedia\n\n Jump to content\n\...,"[https://en.wikipedia.org/wiki/Mudita, https:/..."
1,https://en.wikipedia.org/wiki/Mudita,Mudita,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Mudita - Wikipedia\n\n Jump to content\n\n Mai...,[https://en.wikipedia.org/wiki/Buddhism_in_Pak...
2,https://en.wikipedia.org/wiki/Buddhism_in_Paki...,Buddhism in Pakistan,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Buddhism in Pakistan - Wikipedia\n\n Jump to c...,[https://en.wikipedia.org/wiki/Greco-Buddhist_...
3,https://en.wikipedia.org/wiki/Greco-Buddhist_art,Greco-Buddhist art,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Greco-Buddhist art - Wikipedia\n\n Jump to con...,"[https://en.wikipedia.org/wiki/Roman_Empire, h..."
4,https://en.wikipedia.org/wiki/Roman_Empire,Roman Empire,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Roman Empire - Wikipedia\n\n Jump to content\n...,[https://en.wikipedia.org/wiki/Adoption_in_anc...
5,https://en.wikipedia.org/wiki/Aniconism_in_Bud...,Aniconism in Buddhism,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Aniconism in Buddhism - Wikipedia\n\n Jump to ...,[https://en.wikipedia.org/wiki/P%C4%81li_Canon...
6,https://en.wikipedia.org/wiki/Indonesia,Indonesia,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Indonesia - Wikipedia\n\n Jump to content\n\n ...,[https://en.wikipedia.org/wiki/Genetic_bottlen...
7,https://en.wikipedia.org/wiki/Indonesia,Indonesia,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Indonesia - Wikipedia\n\n Jump to content\n\n ...,"[https://en.wikipedia.org/wiki/Japan, https://..."
8,https://en.wikipedia.org/wiki/Seleucid_Empire,Seleucid Empire,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Seleucid Empire - Wikipedia\n\n Jump to conten...,[https://en.wikipedia.org/wiki/Twenty-second_D...
9,https://en.wikipedia.org/wiki/Dhy%C4%81na_in_B...,Dhyana in Buddhism,"b'<!DOCTYPE html>\n<html class=""client-nojs ve...",Dhyana in Buddhism - Wikipedia\n\n Jump to con...,[https://en.wikipedia.org/wiki/The_Eight_Great...


## Save data

In [4]:
# Write the scraped data to wikipedia_data.csv, leaving out the data frame's row index
df.to_csv('wikipedia_data.csv', index = False)