In [13]:
import os
import requests
import pandas as pd
import numpy as np
import math
import random
from datetime import datetime as dt

In [2]:
dataset_directory       = 'pgc_dataset'
random_categories       = 10              # max 234
books_per_categories    = 10           # check for max per category?

In [3]:
dfo = pd.read_csv("https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv",sep=",",dtype=str)

In [4]:
dfo.to_csv(os.path.join(dataset_directory,'pg_catalog_original.csv'),index=False)

In [5]:
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74004 entries, 0 to 74003
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text#        74004 non-null  object
 1   Type         74004 non-null  object
 2   Issued       74004 non-null  object
 3   Title        74004 non-null  object
 4   Language     74004 non-null  object
 5   Authors      73834 non-null  object
 6   Subjects     73951 non-null  object
 7   LoCC         73749 non-null  object
 8   Bookshelves  17168 non-null  object
dtypes: object(9)
memory usage: 5.1+ MB


In [6]:
dfo.columns

Index(['Text#', 'Type', 'Issued', 'Title', 'Language', 'Authors', 'Subjects',
       'LoCC', 'Bookshelves'],
      dtype='object')

In [7]:
# filter language and category present. The main category is used to follow.
def filternancat(row):
    if type(row['category'])==float and math.isnan(row['category']):
        return False
    return True
df = dfo[(dfo['Language']=="en")]
df = df[['Text#','Title','Language','Bookshelves']]
df.rename({'Text#':'bookid','Title':'title','Language':'language','Bookshelves':'category'},axis=1,inplace=True)
df = df[df.apply(filternancat, axis=1)]
df['category'] = [catg.split(';')[0] for catg in df['category']] # get first category in Bookshelves(category) as the main category

In [8]:
df.head()

Unnamed: 0,bookid,title,language,category
0,1,The Declaration of Independence of the United ...,en,Politics
1,2,The United States Bill of Rights\r\nThe Ten Or...,en,Politics
3,4,Lincoln's Gettysburg Address\r\nGiven November...,en,US Civil War
4,5,The United States Constitution,en,United States
5,6,Give Me Liberty or Give Me Death,en,American Revolutionary War


In [11]:
def download(book_id, category, directory):
    try:
        url = f"https://www.gutenberg.org/ebooks/{book_id}.txt.utf-8"
        response = requests.get(url)
        response.raise_for_status()
        text = response.text
        os.makedirs(os.path.join(directory, category), exist_ok=True)
        filename = os.path.join(directory, category, f"book_{book_id}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Sucess!: Category {category} - book_{book_id}.txt")
    except Exception as e:
        print(f"Fail!: Category {category} - book_{book_id}.txt")

In [14]:
date_dataset = dt.now().strftime("%Y%m%d%H%M%S")
datedir = os.path.join(dataset_directory,date_dataset)
random_categories = random.choices(list(set(df['category'])), k=random_categories) 
for category in random_categories:
    book_ids = random.choices(list(df[df['category']==category]['bookid']), k=books_per_categories)
    for book_id in book_ids:
        download(book_id,category,datedir)

Sucess!: Category Architecture - book_20239.txt
Sucess!: Category Architecture - book_23668.txt
Sucess!: Category Architecture - book_19511.txt
Sucess!: Category Architecture - book_20191.txt
Sucess!: Category Architecture - book_20967.txt
Sucess!: Category Architecture - book_21511.txt
Sucess!: Category Architecture - book_19424.txt
Sucess!: Category Architecture - book_19737.txt
Sucess!: Category Architecture - book_19494.txt
Sucess!: Category Architecture - book_22990.txt
Sucess!: Category Microbiology - book_27713.txt
Sucess!: Category Microbiology - book_27713.txt
Sucess!: Category Microbiology - book_4962.txt
Sucess!: Category Microbiology - book_2938.txt
Sucess!: Category Microbiology - book_2938.txt
Sucess!: Category Microbiology - book_27778.txt
Sucess!: Category Microbiology - book_27778.txt
Sucess!: Category Microbiology - book_2938.txt
Sucess!: Category Microbiology - book_4962.txt
Sucess!: Category Microbiology - book_2938.txt
Sucess!: Category Physics - book_15207.txt
Fai