In [1]:
import requests
import pandas as pd
import time
import re

In [2]:
SOURCE_URL = "https://gutendex.com/books"

# Seconds to wait for the paginated catalogue endpoint and for each text download.
LISTING_TIMEOUT = 30
TEXT_TIMEOUT = 10

# Column order of the frame returned by grab_books (kept even when no rows are fetched).
BOOK_COLUMNS = ["gutenberg_id", "title", "authors", "download_count", "text"]


def _find_plain_text_url(formats):
    """Return a plain-text download URL from a ``formats`` mapping, or None.

    The two keys the notebook originally looked up are tried first, in the
    same order; after that any other ``text/plain...`` key (for example a
    different charset suffix) is accepted.
    """
    preferred = (
        formats.get("text/plain; charset=utf-8")
        or formats.get("text/plain")
    )
    if preferred:
        return preferred
    for mime_type, link in formats.items():
        if mime_type.startswith("text/plain") and link:
            return link
    return None


def _download_text(text_url, timeout=TEXT_TIMEOUT):
    """Download one book's text; return None when the request fails.

    Failures are swallowed on purpose so a single bad link does not abort the
    whole crawl; rows with ``text=None`` can be filtered out later.
    """
    try:
        response = requests.get(text_url, timeout=timeout)
        # Without this check an HTTP error page would be stored as the book text.
        response.raise_for_status()
    except requests.RequestException:
        return None
    # Drop a leading byte-order mark so the text starts with real content.
    return response.text.lstrip("\ufeff")


def grab_books(max_books=50, language="en", sleep=0.2):
    """Fetch book metadata and plain text from the paginated SOURCE_URL catalogue.

    Parameters
    ----------
    max_books : int
        Upper bound on the number of rows returned. Fewer rows come back when
        the catalogue runs out of pages first.
    language : str
        Value sent as the ``languages`` query parameter on the first request.
    sleep : float
        Pause in seconds after each book, to be polite to the servers.

    Returns
    -------
    pandas.DataFrame
        One row per book with the columns in ``BOOK_COLUMNS``. ``text`` is
        missing when the book has no plain-text format or its download failed.

    Raises
    ------
    requests.RequestException
        If a catalogue page cannot be fetched (including HTTP error statuses
        and timeouts). Text-download failures do not raise.
    """
    books_data = []
    url = SOURCE_URL
    params = {"languages": language}

    # Stop when enough books are collected OR when there is no further page.
    while url and len(books_data) < max_books:
        response = requests.get(url, params=params, timeout=LISTING_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        # Only the first request needs the filter; the "next" link is followed
        # verbatim (presumably it already carries the query string - confirm
        # against the API docs), which avoids appending duplicates.
        params = None

        for book in data["results"]:
            if len(books_data) >= max_books:
                break

            text_url = _find_plain_text_url(book["formats"])
            books_data.append({
                "gutenberg_id": book["id"],
                "title": book["title"],
                "authors": [author["name"] for author in book["authors"]],
                "download_count": book["download_count"],
                "text": _download_text(text_url) if text_url else None,
            })

            time.sleep(sleep)

        # None on the last page, which ends the loop above.
        url = data.get("next")

    return pd.DataFrame(books_data, columns=BOOK_COLUMNS)

# Fetch the working corpus. Timing note from an earlier run: 50 books took
# roughly 1 minute, so this larger pull presumably takes longer - TODO confirm.
N_BOOKS_TO_FETCH = 200
books_df = grab_books(max_books=N_BOOKS_TO_FETCH)

In [3]:
# Eyeball the first five fetched rows to confirm the columns look populated.
books_df.head(n=5)

Unnamed: 0,gutenberg_id,title,authors,download_count,text
0,84,"Frankenstein; Or, The Modern Prometheus","[Shelley, Mary Wollstonecraft]",147950,﻿The Project Gutenberg eBook of Frankenstein; ...
1,2701,"Moby Dick; Or, The Whale","[Melville, Herman]",116573,"﻿The Project Gutenberg eBook of Moby Dick; Or,..."
2,1342,Pride and Prejudice,"[Austen, Jane]",86747,﻿The Project Gutenberg eBook of Pride and Prej...
3,1513,Romeo and Juliet,"[Shakespeare, William]",69731,﻿The Project Gutenberg eBook of Romeo and Juli...
4,26184,Simple Sabotage Field Manual,[United States. Office of Strategic Services],59276,﻿The Project Gutenberg eBook of Simple Sabotag...


In [4]:
# Project Gutenberg wraps the book body between "*** START OF THE/THIS PROJECT
# GUTENBERG EBOOK <title> ***" and the matching "*** END OF ..." line. Both the
# THE and THIS spellings are accepted. DOTALL lets a title that wraps onto the
# next line still reach the closing asterisks.
_START_MARKER_RE = re.compile(
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*", re.DOTALL
)
_END_MARKER_RE = re.compile(
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*", re.DOTALL
)


def _strip_gutenberg_boilerplate(text):
    """Return ``text`` without the Gutenberg header/footer, whitespace-trimmed.

    Everything up to and including the START marker is dropped, then
    everything from the END marker onward. A text missing either marker is
    left intact on that side.
    """
    start_match = _START_MARKER_RE.search(text)
    if start_match:
        text = text[start_match.end():]

    end_match = _END_MARKER_RE.search(text)
    if end_match:
        text = text[:end_match.start()]

    return text.strip()


def preprocess_data(df, min_chars=10000):
    """Return a cleaned copy of ``df`` with title, authors, download_count, text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``authors``, ``download_count``
        and ``text``. It is not modified.
    min_chars : int
        Rows whose raw ``text`` is missing, not a string, or shorter than this
        many characters are dropped.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept) with the Gutenberg
        header and footer removed from ``text``.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    # NOTE(review): the length is measured on the RAW download, i.e. including
    # header/licence boilerplate, exactly as before. If the intent is "at least
    # min_chars of actual book text", this filter should run after cleaning.
    long_enough = (
        df["text"]
        .map(lambda value: isinstance(value, str) and len(value) >= min_chars)
        .astype(bool)
    )

    # Copy AFTER filtering so the assignment below writes to an independent
    # frame instead of a slice of another one (SettingWithCopyWarning pattern).
    cleaned = df.loc[long_enough, ["title", "authors", "download_count", "text"]].copy()
    cleaned["text"] = cleaned["text"].map(_strip_gutenberg_boilerplate)
    return cleaned

# Clean the fetched frame, then preview it (head() shows five rows by default).
books_df_cleaned = preprocess_data(books_df)
books_df_cleaned.head()

Unnamed: 0,title,authors,download_count,text
0,"Frankenstein; Or, The Modern Prometheus","[Shelley, Mary Wollstonecraft]",147950,"Frankenstein;\r\n\r\nor, the Modern Prometheus..."
1,"Moby Dick; Or, The Whale","[Melville, Herman]",116573,"MOBY-DICK;\r\n\r\nor, THE WHALE.\r\n\r\nBy Her..."
2,Pride and Prejudice,"[Austen, Jane]",86747,[Illustration:\r\n\r\n ...
3,Romeo and Juliet,"[Shakespeare, William]",69731,THE TRAGEDY OF ROMEO AND JULIET\r\n\r\nby Will...
4,Simple Sabotage Field Manual,[United States. Office of Strategic Services],59276,[Illustration]\r\n\r\n\r\n\r\n\r\nSimple Sabot...


In [None]:
# Double-check the text cleaning (uncomment to print the first 1,000 characters of each previewed book):

# with pd.option_context('display.max_colwidth', None):
#     for i, t in enumerate(books_df_cleaned['text'].head(5)):
#         print(f"{i}: {t[:1000]}\n")

0: Frankenstein;

or, the Modern Prometheus

by Mary Wollstonecraft (Godwin) Shelley


 CONTENTS

 Letter 1
 Letter 2
 Letter 3
 Letter 4
 Chapter 1
 Chapter 2
 Chapter 3
 Chapter 4
 Chapter 5
 Chapter 6
 Chapter 7
 Chapter 8
 Chapter 9
 Chapter 10
 Chapter 11
 Chapter 12
 Chapter 13
 Chapter 14
 Chapter 15
 Chapter 16
 Chapter 17
 Chapter 18
 Chapter 19
 Chapter 20
 Chapter 21
 Chapter 22
 Chapter 23
 Chapter 24




Letter 1

_To Mrs. Saville, England._


St. Petersburgh, Dec. 11th, 17—.


You will rejoice to hear that no disaster has accompanied the
commencement of an enterprise which you have regarded with such evil
forebodings. I arrived here yesterday, and my first task is to assure
my dear sister of my welfare and increasing confidence in the success
of my undertaking.

I am already far north of London, and as I walk in the streets of
Petersburgh, I feel a cold northern breeze play upon my cheeks, which
braces my nerves and fi

1: MOBY-DICK;

or, THE WHALE.

By Herman Melville




In [5]:
# Persist the cleaned frame as Parquet, leaving the index out of the file.
OUTPUT_PATH = "book_popularity.parquet"
books_df_cleaned.to_parquet(OUTPUT_PATH, index=False)