Code based on:<br>
https://shravan-kuchkula.github.io/scrape_clean_normalize_gutenberg_text/#

In [1]:
##############################
#  Module: scrapeGutenberg.py
#  Author: Shravan Kuchkula
#  Date: 05/24/2019
##############################

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np


def getTitlesAndAuthors(title_and_authors):
    titles = []
    authors = []
    for ta in title_and_authors:
        titles.append(ta[0])
        authors.append(ta[1])
    return titles, authors

def getBookURLsFromBookShelf(bookshelf):
    
    # make a request and get a response object
    response = requests.get(bookshelf)
    
    # get the source from the response object
    source = response.text
    
    # construct the soup object
    soup = BeautifulSoup(source, 'html.parser')
    
    # get all the a tags
    tags = soup.find_all('a', attrs={'class': 'extiw'})
    
    # get all the urls
    urls = ["http:" + tag.attrs['href'] for tag in tags]
    
    # construct the soup
    soups = [BeautifulSoup(requests.get(url).text, 'html.parser') for url in urls]
    
    # get all the plain text files
    href_tags = [soup.find(href=True, text='Plain Text UTF-8') for soup in soups]

    # get all the book urls
    book_urls = ["http:" + tag.attrs['href'] for tag in href_tags]
    
    # get h1 tags for getting titles and authors
    h1_tags = [soup.find('h1').getText() for soup in soups]
    
    # construct titles and authors list
    title_and_authors = [re.split(r'by', tag) for tag in h1_tags]

    # some titles don't have authors, so add Unknown to author
    for ta in title_and_authors:
        if len(ta) == 1:
            ta.append("Unknown")
    
    # get the titles and authors into their own lists
    titles, authors = getTitlesAndAuthors(title_and_authors)
    
    return book_urls, titles, authors, soup

def getCategories(soup, books):
    # get all the tags
    tags = soup.find_all('a', attrs={'class': 'extiw'})

    # get all the titles
    title_id = [tag.attrs['title'] for tag in tags]

    # clean the title
    title_ids = [title.split(':')[1] for title in title_id]

    # create a new column
    books['title_id'] = title_ids

    # create a categories column
    books['category'] = ""

    # get the categories from h3 tags
    for h3 in soup.find_all('h3'):
        #print(h3.getText())
        category = h3.getText()
        h3_atags = h3.findNextSibling().find_all('a', attrs={'class': 'extiw'})
        for tag in h3_atags:
            #print(tag['title'].split(':')[1])
            book_id = tag['title'].split(':')[1]
            books['category'].iloc[np.where(books.title_id == book_id)] = category

    # get the categories from h2 tags
    for tag in soup.find_all('h2'):
        if len(tag.findChildren()) > 0:
            for t in tag.children:
                if t.getText() != 'Readers' and t.getText() != 'Uncategorized':
                    #print(t.getText())
                    category = t.getText()
                    h2_atags = tag.findNextSibling().find_all('a', attrs={'class': 'extiw'})
                    for atag in h2_atags:
                        book_id = atag['title'].split(':')[1]
                        books['category'].iloc[np.where(books.title_id == book_id)] = category

    # remaining links are uncategorized
    books['category'].iloc[np.where(books.category == '')] = 'Uncategorized'
    
    return books

In [2]:
########################################
#  Module: gutenbergPreprocessing.py
#  Author: Shravan Kuchkula
#  Date: 05/24/2019
########################################

import re
import nltk
import string
import requests
from bs4 import BeautifulSoup

def remove_gutenburg_headers(book_text):
    book_text = book_text.replace('\r', '')
    book_text = book_text.replace('\n', ' ')
    start_match = re.search(r'\*{3}\s?START.+?\*{3}', book_text)
    end_match = re.search(r'\*{3}\s?END.+?\*{3}', book_text)
    try:
        book_text = book_text[start_match.span()[1]:end_match.span()[0]]
    except AttributeError:
        print('No match found')    
    return book_text

def remove_gutenberg_footer(book_text):
    if book_text.find('End of the Project Gutenberg') != -1:
        book_text = book_text[:book_text.find('End of the Project Gutenberg')]
    elif book_text.find('End of Project Gutenberg') != -1:
        book_text = book_text[:book_text.find('End of Project Gutenberg')]
    return book_text

def getTextFromURLByRemovingHeaders(book_urls):
    book_texts = []
    for url in book_urls:
        book_text = requests.get(url).text
        book_text = remove_gutenburg_headers(book_text)
        book_texts.append(remove_gutenberg_footer(book_text))
    return book_texts

def searchPossibleStarts(pattern, book):
    match = re.search(pattern, book, flags=re.IGNORECASE)
    if match:
        return match.span()[0]
    return -1

def moveToStartOfTheBook(possible_starts, book):
    # construct start indexes
    start_indexes = [searchPossibleStarts(ps, book) for ps in possible_starts]
    
    # calculate the lowest index of the list of possible values. Use that as the start index.
    # TODO: this throws an exception when nothing is found
    min_index = min(list(filter(lambda x: x != -1, start_indexes)))
    
    if min_index > -1:
        return book[min_index:]
    else:
        print("Match not found in possible_starts, update your possible_starts")
    
    return book

In [3]:
########################################
#  Module: gutenbergTextNormalization.py
#  Author: Shravan Kuchkula
#  Date: 05/24/2019
########################################

import re
import pandas as pd
import numpy as np
import nltk
import string
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# tokenize text
def tokenize_text(book_text):
    TOKEN_PATTERN = r'\s+'
    regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=True)
    word_tokens = regex_wt.tokenize(book_text)
    return word_tokens

def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation))) 
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens]) 
    return filtered_tokens

def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

def apply_stemming_and_lemmatize(tokens, ls=LancasterStemmer(), wnl=WordNetLemmatizer()):
    return [wnl.lemmatize(ls.stem(token)) for token in tokens]

def cleanTextBooks(book_texts):
    clean_books = []
    for book in book_texts:
        book_i = tokenize_text(book)
        book_i = remove_characters_after_tokenization(book_i)
        book_i = convert_to_lowercase(book_i)
        book_i = remove_stopwords(book_i)
        book_i = apply_stemming_and_lemmatize(book_i)
        clean_books.append(book_i)
    return clean_books

def normalizedVocabularyScore(clean_books):
    v_size = [len(set(book)) for book in clean_books]
    max_v_size = np.max(v_size)
    v_raw_score = v_size/max_v_size
    v_sqrt_score = np.sqrt(v_raw_score)
    v_rank_score = pd.Series(v_size).rank()/len(v_size)
    v_final_score = (pd.Series(v_sqrt_score) + v_rank_score)/2
    
    return pd.DataFrame({'v_size': v_size,
                        'v_raw_score': v_raw_score,
                        'v_sqrt_score': v_sqrt_score,
                        'v_rank_score': v_rank_score,
                        'v_final_score': v_final_score})

def longWordVocabularySize(clean_book, minChar=10):
    V = set(clean_book)
    long_words = [w for w in V if len(w) > minChar]
    return len(long_words)

def normalizedLongWordVocabularyScore(clean_books):
    lw_v_size = [longWordVocabularySize(book) for book in clean_books]
    max_v_size = np.max(lw_v_size)
    v_raw_score = lw_v_size/max_v_size
    v_sqrt_score = np.sqrt(v_raw_score)
    v_rank_score = pd.Series(lw_v_size).rank()/len(lw_v_size)
    lw_v_final_score = (pd.Series(v_sqrt_score) + v_rank_score)/2
    
    return pd.DataFrame({'lw_v_size': lw_v_size,
                        'lw_v_final_score': lw_v_final_score})


def textDifficultyScore(clean_books):
    df_vocab_scores = normalizedVocabularyScore(clean_books)
    df_lw_vocab_scores = normalizedLongWordVocabularyScore(clean_books)
    lexical_diversity_scores = [len(set(book))/len(book) for book in clean_books]
    
    text_difficulty = (df_vocab_scores['v_final_score'] + \
                     df_lw_vocab_scores['lw_v_final_score'] + \
                     lexical_diversity_scores)/3
    
    return pd.DataFrame({'text_difficulty': text_difficulty})

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
#from scrapeGutenberg import *
#from gutenbergPreprocessing import *
#from gutenbergTextNormalization import *

In [5]:
# bookshelf url
bookshelf = 'http://www.gutenberg.org/wiki/Children%27s_Instructional_Books_(Bookshelf)'

# from the bookshelf get all the book_urls, titles, 
# authors and soup(for getting categories)
book_urls, titles, authors, soup = getBookURLsFromBookShelf(bookshelf)

# construct a books dataframe
books = pd.DataFrame({'url': book_urls, 'title': titles, 'author(s)': authors})

# get books df with categories
books = getCategories(soup, books)

# with categories
display(books.shape)
books.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


(0, 5)

Unnamed: 0,url,title,author(s),title_id,category
