### Scraping to create a mock text dataset

Scraping https://archiveofourown.org/tags/Halo%20(Video%20Games)%20*a*%20Related%20Fandoms/works to create a mock dataset using BeautifulSoup and requests

1. Importing the required libraries

In [None]:
# Import all the necessary libraries

import os
import requests
import bs4
import scrapy
import time

2. Function that scrapes the entire body of text from the URL of a single work

In [None]:
def scrape_oneWork(work_url):
    """
    Scrape the text of every paragraph from one single work.

    The work is requested with ``?view_full_work=true`` so that the whole
    work is served on one page instead of chapter-wise (which is the default).

    input  : 'str' url of the work
    returns: list('str', 'str', 'str'...) text of each <p> element on the
             page, in document order, or None when the request fails or the
             server answers with a status code other than 200
    """
    url = work_url + '?view_full_work=true'

    try:
        # A timeout keeps one unresponsive connection from stalling the whole crawl.
        resp = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Transport failures follow the same "return None" path as a bad status.
        print(f'Request failed :( {err}')
        return None

    if resp.status_code != 200:              # anything but HTTP 200 is treated as a failure
        print('Bad response :(')
        return None

    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # NOTE: every <p> on the page is collected, not only those inside the work body.
    return [t.getText() for t in soup.select('p')]

3. Function that visits every work link on one listing page (one of the 142 pages)
> AO3 lists 20 works per page

In [None]:
def scrape_allWorks_in_1Pg(page_url, work_title_tag, output_path="ao3_halo.txt", delay=5):
    """
    Scrape every work linked from one listing page and append its text to a file.

    input  : page_url       'str' url of the current listing page
             work_title_tag 'str' CSS selector matching the heading anchors
                            (it matches both work links and author links)
             output_path    'str' file the scraped text is appended to
             delay          seconds to wait before requesting each work
    returns: None; works whose download fails are skipped
    """
    try:
        # A timeout keeps one unresponsive connection from stalling the whole crawl.
        response = requests.get(page_url, timeout=30)
    except requests.RequestException as err:
        print(f'Request failed :( {err}')
        return

    if response.status_code != 200:                      # anything but HTTP 200 is a failure
        print('Bad response :(')
        return

    soup = bs4.BeautifulSoup(response.text, 'lxml')      # parse the html

    # hrefs of every matched anchor - both works + authors
    all_links = [t.get("href") for t in soup.select(work_title_tag)]

    # keep only work links, filtered by the substring "/works";
    # anchors without an href give None and are dropped as well
    work_links = [
        "https://archiveofourown.org" + link
        for link in all_links
        if link and '/works' in link
    ]

    for link in work_links:
        time.sleep(delay)                                # wait between requests to the site

        paragraphs = scrape_oneWork(link)
        if paragraphs is None:                           # download failed; skip this work
            continue

        # Append so earlier works are kept; the trailing newline stops the
        # next work from running into the last paragraph of this one.
        with open(output_path, "a", encoding="utf-8") as f:
            f.write("\n".join(paragraphs) + "\n")

4. Parent function that navigates every page under our category of interest, beginning at page=1 from the home url it is called with

In [None]:
def scrape_all_pages(home_url, num_pages=142):
    """
    Walk through every listing page of a category and scrape the works on each.

    input  : home_url  'str' url of the category's works listing, without
                       the ``?page=`` query
             num_pages number of listing pages to visit, starting at page 1
                       (142 for the ao3 halo category's 2800~ works)
    returns: None
    """
    for page in range(1, num_pages + 1):
        
        # Report the same 1-based number that goes into the url below.
        print(f"Page {page} \n")
        
        page_url = home_url + f'?page={page}'   # completing the url by appending the current page number
        
        # Scrape all the works on this page
        scrape_allWorks_in_1Pg(page_url, "div.header.module h4.heading a")
        
        
# scrape_all_pages(home_url)    # test function call

In [None]:
# Works listing of the category to scrape; page numbers are appended to it.
home_url = "https://archiveofourown.org/tags/Halo%20(Video%20Games)%20*a*%20Related%20Fandoms/works"


if __name__ == "__main__":
    
    # Start the crawl from the category's listing url defined above.
    scrape_all_pages(home_url)