In [1]:
### This notebook scrapes character lists for novels in the LitBank corpus from the study websites CliffsNotes and SparkNotes.
### Character lists are saved as dicts in data/LitBank/characters

In [1]:
import sys
sys.path.append('../src')

from misc import save_dict, open_dict
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import glob
import pickle

In [2]:
# glob returns matches in a filesystem-dependent order; sort them so the loop
# indices printed below and the dict insertion order are reproducible.
# NOTE(review): this directory is spelled "Litbank" while the save cell writes
# to "LitBank/..." - confirm which spelling exists on case-sensitive systems.
filePaths = sorted(glob.glob("Litbank/texts/" + "*.txt"))

In [3]:
def get_characters_cliff_notes(html):
    """
    Extract candidate character names from a CliffsNotes character-list page.

    Parameters
    ----------
    html : markup in any form BeautifulSoup accepts (string, bytes or an open
        file-like object such as a urlopen response).

    Returns
    -------
    list
        One entry per ``<p class="litNoteText">`` paragraph, in document order.
        Each entry is the text of the paragraph's first ``<strong>`` (or,
        failing that, ``<b>``) tag, or None when no name could be extracted.

    Character names may need cleaning afterwards.
    """
    soup = BeautifulSoup(html, "html.parser")

    charNames = []

    for section in soup.find_all("p", class_="litNoteText"):
        strong = section.strong

        if strong is not None:
            if strong.string is not None:
                charNames.append(strong.string)
            elif strong.a is not None:
                # <strong> has several children; the name is inside a link.
                charNames.append(strong.a.string)
            else:
                # <strong> has several children and no link. Dereferencing
                # strong.a here used to raise AttributeError; use the
                # concatenated text instead (None when it is empty).
                charNames.append(strong.get_text() or None)

        elif section.b is not None:
            charNames.append(section.b.string)

        else:
            # Keep a placeholder so entries stay aligned with paragraphs;
            # the cleaning step removes None values.
            charNames.append(None)

    return charNames

In [4]:
def get_characters_spark_notes(html):
    """
    Pull character names out of a SparkNotes characters page.

    The text of every tag returned by ``find_all("h3", class_=None)`` is used
    when there is at least one. Otherwise the ``content`` of the
    ``<meta name="keywords">`` tag (the last one, if several) is split on
    ", " as a fallback. When neither source yields anything a message is
    printed and None is returned.

    Character names may need cleaning afterwards.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Preferred source: heading text.
    headingNames = [heading.get_text() for heading in soup.find_all("h3", class_ = None)]
    if headingNames:
        return headingNames

    # Fallback source: the keywords meta tag.
    keywords = None
    for metaTag in soup.find_all("meta"):
        if metaTag.get("name", None) == "keywords":
            keywords = metaTag.get("content", None)

    if keywords is None or keywords == []:
        print("couldn't find characters")
        return None

    return keywords.split(", ")

In [5]:
def get_id_and_title(filePath):
    '''
    Returns predicted Litbank story ID and story title, from the file path.

    The ID is the first run of digits in the file name. The title is the file
    name with the ".txt" suffix and *all* digits removed, surrounding
    underscores stripped, remaining underscores turned into hyphens and
    apostrophes dropped.
    Story title has format like: the-wind-in-the-willows

    Raises ValueError if the file name contains no digits.
    '''
    # Split on both separators so back-slashed (Windows-style) paths work too.
    storyName = re.split(r'[\\/]', filePath)[-1]
    storyName = storyName.split(".txt")[0]

    idMatch = re.search(r'\d+', storyName)
    if idMatch is None:
        raise ValueError("no story ID found in file name: " + repr(filePath))
    storyID = int(idMatch.group())

    storyName = re.sub(r'[0-9]+', '', storyName)
    storyName = storyName.strip("_")
    storyName = storyName.replace('_', '-')
    storyName = storyName.replace("'", '')

    return storyID, storyName

In [6]:
def clean_character_list_spark(charList):
    '''
    returns cleaned character list from SparkNotes

    charList is the one-element wrapper stored per story, i.e. [names], where
    names is a list of strings or None when scraping found nothing.

    Returns a new list: a first entry containing "characters" (a heading
    rather than a name) is dropped and every name is stripped of surrounding
    whitespace. The input is left untouched, so re-running the cleaning step
    gives the same result. Missing or empty input yields [].
    '''
    if not charList or not charList[0]:
        return []

    # Work on a copy; deleting from charList[0] would alter the caller's data.
    names = list(charList[0])

    if "characters" in names[0]:
        names = names[1:]

    return [name.strip() for name in names]

In [None]:
def clean_character_list_cliff(charList):
    '''
    returns cleaned character list from CliffsNotes

    charList is a list of per-page name lists. The pages are flattened in
    order; None entries and "Continued on next ..." markers (matched
    case-insensitively) are dropped. Whitespace in each remaining name is
    normalised: runs of whitespace, including non-breaking spaces, become a
    single space and the ends are trimmed. Non-breaking spaces used to be
    deleted outright, which glued adjacent words together.
    '''
    cleanCharList = []

    for pageNames in charList:
        for name in pageNames:
            if name is None:
                continue
            if "continued on next" in name.lower():
                continue
            # str.split() with no argument treats "\xa0" as whitespace.
            cleanCharList.append(" ".join(name.split()))

    return cleanCharList

In [7]:
### Run this cell to iterate through each text file name and:
# see if a character list is available on CliffsNotes
# if so, download the page (and any "-2", "-3", ... continuation pages)
# extract characters from each page
# save to dictionary: storyID -> list of per-page name lists

characterDictCliff = {}

# Upper bound on continuation pages, so a site that answers every page
# number successfully cannot keep the loop running forever.
MAX_CLIFF_PAGES = 50

for i, filePath in enumerate(filePaths):

    # get story name from the file path
    storyID, storyName = get_id_and_title(filePath)

    # Drop only a *leading* "the-". Replacing every occurrence would turn
    # "the-wind-in-the-willows" into the bogus slug "wind-in-willows".
    if storyName.startswith("the-"):
        nameNoArticle = storyName[len("the-"):]
    else:
        nameNoArticle = storyName

    if not nameNoArticle:
        # nothing left to build a URL from
        continue

    # CliffsNotes files books under the first letter of the title
    startLetter = nameNoArticle[0]

    # candidate slugs, in order of preference, without duplicates
    slugCandidates = dict.fromkeys([
        storyName,
        storyName.replace("-", ""),
        nameNoArticle,
        nameNoArticle.replace("-", ""),
    ])

    # construct cliff notes URL & get first page
    pageHtmls = []
    successfulURL = None

    for slug in slugCandidates:
        url = "https://www.cliffsnotes.com/literature/" + startLetter + "/" + slug + "/character-list"
        try:
            # read inside "with" so the HTTP response is always closed
            with urlopen(url) as response:
                pageHtmls.append(response.read())
        except Exception:
            # best effort: any failure means "try the next candidate";
            # unlike a bare except, KeyboardInterrupt still stops the run
            continue
        successfulURL = url
        break

    if successfulURL is None:
        continue

    # get multiple pages, if multiple pages exist:
    for pageNum in range(2, MAX_CLIFF_PAGES + 1):
        try:
            with urlopen(successfulURL + "-" + str(pageNum)) as response:
                pageHtmls.append(response.read())
        except Exception:
            # first missing page ends the sequence
            break

    # extract character names from each page and add to dict
    characterDictCliff[storyID] = [get_characters_cliff_notes(pageHtml) for pageHtml in pageHtmls]

    print(i, 'complete')

In [8]:
### Run this cell to do the same as above but for SparkNotes
# result: storyID -> [list of character names] (one-element wrapper list)

characterDictSpark = {}

for i, filePath in enumerate(filePaths):

    # get story name from the file path
    storyID, storyName = get_id_and_title(filePath)

    # Drop only a *leading* "the-". Replacing every occurrence would turn
    # "the-wind-in-the-willows" into the bogus slug "wind-in-willows".
    if storyName.startswith("the-"):
        nameNoArticle = storyName[len("the-"):]
    else:
        nameNoArticle = storyName

    # candidate slugs, in order of preference, without duplicates
    slugCandidates = dict.fromkeys([
        storyName,
        storyName.replace("-", ""),
        nameNoArticle,
        nameNoArticle.replace("-", ""),
    ])

    # construct SparkNotes URL & get page
    html = None

    for slug in slugCandidates:
        if not slug:
            continue
        url = "https://www.sparknotes.com/lit/" + slug + "/characters/"
        try:
            # read inside "with" so the HTTP response is always closed;
            # undecodable bytes are replaced instead of aborting the run
            with urlopen(url) as page:
                html = page.read().decode("utf-8", errors="replace")
        except Exception:
            # best effort: any failure means "try the next candidate";
            # unlike a bare except, KeyboardInterrupt still stops the run
            continue
        break

    if html is None:
        continue

    sparkCharacters = get_characters_spark_notes(html)
    if sparkCharacters is None:
        # page exists but has no character list; storing [None] would
        # break the cleaning step later on
        continue

    characterDictSpark[storyID] = [sparkCharacters]

    print(i, "complete")

In [9]:
##### map each LitBank story ID to a human-readable title to help with analysis
litBankDict = {}

for textPath in filePaths:
    bookID, slug = get_id_and_title(textPath)
    # titles are stored with spaces rather than the hyphens used in URLs
    litBankDict[bookID] = slug.replace("-", " ")

In [10]:
### build cleaned copies of both scraped dictionaries (same keys, cleaned values)
characterDictSparkClean = {
    storyKey: clean_character_list_spark(rawNames)
    for storyKey, rawNames in characterDictSpark.items()
}

characterDictCliffClean = {
    storyKey: clean_character_list_cliff(pageLists)
    for storyKey, pageLists in characterDictCliff.items()
}

In [None]:
### save the cleaned CliffsNotes and SparkNotes character dicts and the ID -> title dict
# NOTE(review): the texts are globbed from "Litbank/texts/" but these paths use
# "LitBank/..." - confirm the directory spelling on case-sensitive filesystems.
for dictToSave, outPath in (
    (characterDictCliffClean, "LitBank/characters/litbank_character_lists_from_cliff.p"),
    (characterDictSparkClean, "LitBank/characters/litbank_character_lists_from_spark.p"),
    (litBankDict, "LitBank/characters/litbank_ids_and_titles_dict.p"),
):
    save_dict(dictToSave, outPath)