# Import Important Libraries

In [2]:
from urllib.request import urlopen 
from urllib.error import HTTPError
import json 
import numpy as np
import csv
import re
import random
import pandas as pd

# Pull Author Meta-Data From Website

In [3]:
def pull_data(min_yr = 1500, max_yr = 2000, num_pgs = 20):
    # Creates a 2d numpy array, where the first column is the book id and 
    # the second column is the year that the author was born
    data = []

    for yr in range(min_yr, max_yr, 100):
        for pg in range(1, num_pgs + 1, 1):
            # Show Progress
            print(f"Processing Year {yr}, Page {pg}/{num_pgs}", end='\r')
            # Create a query that gets all books where the author is alive in the specified century at the specified page
            url = f"http://gutendex.com/books?author_year_start={yr}&author_year_start={yr + 99}&languages=en&page={pg}"
            # Pull resulting json file
            response = urlopen(url)
            data_json = json.loads(response.read()) 
            # Save book id and author birth year in data
            data_pg = np.array([(x['id'], x['authors'][0]['birth_year']) for x in data_json['results']])
            data.extend(data_pg)
            
    return np.array(data)

## Save Meta-Data if data isn't saved

In [4]:
try:
    # Check if date_data.csv exists
    date_data = pd.read_csv("data/date_data.csv").astype(int)
except FileNotFoundError:
    # If date_data.csv doesn't exist, pull neccessary data using Gutendex API
    date_data = pull_data()
    # Save resulting data in csv file
    data_csv = {'book_id': int(date_data[:,0]), 'birth_yr': int(date_data[:,1])}
    with open('data/date_data.csv', 'w') as f:
        w = csv.writer(f)
        w.writerow(data_csv.keys())
        w.writerows(zip(*data_csv.values()))
    # Pull data as pandas Dataframe for further use
    date_data = pd.read_csv("data/date_data.csv").astype(int)

In [5]:
date_data.head()

Unnamed: 0,book_id,birth_yr
0,84,1797
1,1513,1564
2,1342,1775
3,25344,1804
4,345,1847


# Get Book Text Samples

In [6]:
def get_text(book_id):
    url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
    text = urlopen(url).read()
    return text

def get_text_samples(text, num_samples = 3):
    # Get rid of Guttenberg Header and footer
    book_text = [x.strip() for x in text.decode("utf-8").split('***')][2]
    # Remove '\r' symbol
    book_text = re.sub(r"[\r]+", "", book_text)
    # split by paragraph breaks
    book_text = re.split(r"\n{2,}", book_text)
    # remove paragraphs that are less 8 average length sentences long
    book_text = list(filter(lambda x: len(x) >= (50 * 8), np.array(book_text)))
    # Randomly sample remaining paragraphs
    paragraphs = random.sample(book_text, min(num_samples, len(book_text)))
    # Replace \n with ' ' and return paragraphs
    return np.array([re.sub(r"\n", " ", p) for p in paragraphs])

In [7]:
def create_excerpt_data(data):
    # Creates a 2d numpy array of the book id and randomly sampled
    # paragraphs within the book
    book_ids = data[:,0].astype(int)
    book_samples = []
    invalid_ids = []

    for i in range(book_ids.shape[0]):
        # For each book try to access the text file
        try:
            text = get_text(book_ids[i])
        except HTTPError as err:
            # If unable to access the text file, display the error code 
            # and save the book_id in invalid_ids for logging purposes
            print(f"HTTP {err.code} Error: book_id = {book_ids[i]}")
            invalid_ids.append(book_ids[i])
            
        # Clean and randomly sample text samples
        text_samples = get_text_samples(text)
        # Combine text samples with associated book_id
        ids = np.full(len(text_samples), book_ids[i])
        # Save samples and book id into book_samples
        samples = np.array(list(zip(ids, text_samples)))
        book_samples.extend(samples)
        
        # Show Progress
        print(f"Progress: {i/book_ids.shape[0]}", end='\r')

    return np.array(book_samples), invalid_ids

## Save Excerpt Data if Data isn't Saved

In [8]:
try:
    # Check if excerpts.csv exists
    excerpt_data = pd.read_csv("data/excerpts.csv")
except FileNotFoundError:
    # If excerpts.csv doesn't exist, create necessary data
    book_samples, invalid_ids = create_excerpt_data(date_data.to_numpy())
    # Save resulting excerpts in csv file
    book_data_csv = {'book_id': book_samples[:,0], 'text': book_samples[:,1]}
    with open('data/excerpts.csv', 'w') as f:
        w = csv.writer(f)
        w.writerow(book_data_csv.keys())
        w.writerows(zip(*book_data_csv.values()))
    # Pull data as Pandas DataFrame fro further use
    excerpt_data = pd.read_csv("data/excerpts.csv")

# Combine Data into Unified Dataframe

In [9]:
data = excerpt_data.merge(date_data, how='inner').drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,book_id,text,birth_yr
0,84,"“And now, with the world before me, whither sh...",1797
1,84,"“You have been ill, very ill, and even the con...",1797
2,84,“I intended to reason. This passion is detrime...,1797
3,84,"“How is this? I must not be trifled with, and ...",1797
4,84,"“A few days after, the Turk entered his daught...",1797


# Clean Data

In [10]:
# remove entries where text is less than 50 characters long
data = data[data['text'].apply(lambda x: len(re.findall(r"\w+", x)) > 50)]
data.head()

Unnamed: 0,book_id,text,birth_yr
0,84,"“And now, with the world before me, whither sh...",1797
1,84,"“You have been ill, very ill, and even the con...",1797
2,84,“I intended to reason. This passion is detrime...,1797
3,84,"“How is this? I must not be trifled with, and ...",1797
4,84,"“A few days after, the Turk entered his daught...",1797


# Save Final Dataset

In [11]:
data.to_csv("data/data.csv", index=False)