In [21]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re

In [28]:
# Scraping books from Project Gutenberg
def scrape_books():
    # Only 2 pages so I'd keep this part simple and manual
    # First link is https://www.gutenberg.org/ebooks/author/779
    # Second link is 
    url = 'https://www.gutenberg.org/ebooks/author/779?sort_order=title&start_index=26'
    response = requests.get(url)
    response.raise_for_status()


    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    base_url = 'https://www.gutenberg.org/'

    for link in links:
        href=link['href']
        print(f"href is {href}")
        if href.startswith('/ebooks/'):
            book_id = href.split('/')[-1]
            download_url = f"{base_url}/cache/epub/{book_id}/pg{book_id}.txt"

            try:
                book_response = requests.get(download_url)
                book_response.raise_for_status()
                file_path = f"Nietzsche_{book_id}.txt"
                with open(file_path, 'wb') as file:
                    file.write(book_response.content)
                print(f"Downloaded book with ID {book_id}")
            
            except requests.exceptions.HTTPError as e:
                print(f"Failed to download book with ID {book_id}")
                print(e)

**Note:**
I ended up doing a bit of manual work for 3 things: one, modifying the links to download books; two, transferring books to a different folder and three, removing books in non-English languages. All of this work did not take more than 5 minutes.

In [16]:
scrape_books()

href is /ebooks/author/779?sort_order=title
Downloaded book with ID 779?sort_order=title
href is /ebooks/author/779?sort_order=title
Downloaded book with ID 779?sort_order=title
href is /ebooks/58025
Downloaded book with ID 58025
href is /ebooks/7204
Downloaded book with ID 7204
href is /ebooks/52881
Downloaded book with ID 52881
href is /ebooks/52124
Downloaded book with ID 52124
href is /ebooks/7207
Downloaded book with ID 7207
href is /ebooks/53397
Downloaded book with ID 53397
href is /ebooks/28146
Downloaded book with ID 28146
href is /ebooks/51580
Downloaded book with ID 51580
href is /ebooks/5652
Downloaded book with ID 5652
href is /ebooks/51710
Downloaded book with ID 51710
href is /ebooks/38226
Downloaded book with ID 38226
href is /ebooks/1998
Downloaded book with ID 1998
href is /ebooks/19793
Failed to download book with ID 19793
404 Client Error: Not Found for url: https://www.gutenberg.org//cache/epub/19793/pg19793.txt
href is /ebooks/52263
Downloaded book with ID 52263
h

In [25]:
# Extracting text and other relevant information from the txt files of books
def extract_from_books(text):
    title_search = re.compile(r'Title: (.*)\n')
    author_search = re.compile(r'Author: (.*)\n')
    translator_search = re.compile(r'Translator: (.*)\n')
    release_date_search = re.compile(r'Release date: (.*) \[')
    language_search = re.compile(r'Language: (.*)\n')

    title = re.search(title_search, text).group(1) if re.search(title_search, text) else 'Not Available'
    author = re.search(author_search, text).group(1) if re.search(author_search, text) else 'Not Available'
    translator = re.search(translator_search, text).group(1) if re.search(translator_search, text) else 'Not Available'
    release_date = re.search(release_date_search, text).group(1) if re.search(release_date_search, text) else 'Not Available'
    language = re.search(language_search, text).group(1) if re.search(language_search, text) else 'Not Available'

    # Find the start of the main text
    start_of_text = text.find('*** START OF THE PROJECT GUTENBERG EBOOK')
    main_text = text[start_of_text + 45:] if start_of_text != -1 else 'Text not available'  # Added +45 to move past the title marker
    
    return title, author, translator, release_date, language, main_text

In [27]:
# Initialize DataFrame
columns = ["Title", "Author", "Translator", "Release Date", "Language", "Main Text"]
df = pd.DataFrame(columns=columns)

# Directory containing all the text files
directory = 'books'

# Process each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            title, author, translator, release_date, language, main_text = extract_from_books(text)
            temp_df = pd.DataFrame([{"Title": title, "Author": author, "Translator": translator, "Release Date": release_date, "Language": language, "Main Text": main_text}])
            df = pd.concat([df, temp_df], ignore_index=True)

# Display the DataFrame
print(df.head())

                                               Title  \
0                     Thoughts out of Season, Part I   
1  The Twilight of the Idols; or, How to Philosop...   
2                               Beyond Good and Evil   
3  On the Future of our Educational Institutions;...   
4              The Joyful Wisdom ("La Gaya Scienza")   

                        Author           Translator     Release Date Language  \
0  Friedrich Wilhelm Nietzsche  Anthony M. Ludovici      May 1, 2004  English   
1  Friedrich Wilhelm Nietzsche  Anthony M. Ludovici     June 7, 2016  English   
2  Friedrich Wilhelm Nietzsche        Helen Zimmern   August 1, 2003  English   
3  Friedrich Wilhelm Nietzsche        J. M. Kennedy   March 28, 2016  English   
4  Friedrich Wilhelm Nietzsche         Paul V. Cohn  August 23, 2016  English   

                                           Main Text  
0  GHTS OUT OF SEASON, PART I ***\n\n\n\n\n      ...  
1  TWILIGHT OF THE IDOLS; OR, HOW TO PHILOSOPHIZE...  
2  ND GOOD 