In [29]:
import re

# Specify the input and output file names
input_file = 'ebook_numbers.txt'  # Change this to your input file name
output_file = 'five_digit_numbers.txt'

# Read the content of the input file
with open(input_file, 'r') as file:
    content = file.read()

# Use regular expression to find all 5-digit numbers
five_digit_numbers = re.findall(r'\b\d{5}\b', content)

# Write the found numbers to the output file
with open(output_file, 'w') as file:
    for number in five_digit_numbers:
        file.write(number + '\n')

print(f"Found {len(five_digit_numbers)} five-digit numbers and saved to {output_file}.")

Found 64365 five-digit numbers and saved to five_digit_numbers.txt.


In [114]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Specify the input file name containing the five-digit numbers
input_file = 'five_digit_numbers.txt'  # Change this to your input file name

# Read the content of the input file
with open(input_file, 'r') as file:
    content = file.read()

# Use regular expression to find all 5-digit numbers
five_digit_numbers = re.findall(r'\b\d{5}\b', content)
five_digit_numbers = five_digit_numbers[:500]  # Limit to the first 10 numbers
# Initialize a list to store the extracted data
all_data = []

# Base URL for Project Gutenberg
base_url = 'https://www.gutenberg.org/ebooks/'

# Function to extract information from a given URL
def extract_info(soup):
    # Extract text
    text = soup.get_text().replace('\n', ' ').strip()

    # Define a helper function to extract specific information
    def extract_detail(start_keyword, end_keyword_1, end_keyword_2=None):
        start_index = text.find(start_keyword)
        if start_index != -1:
            start_index += len(start_keyword)
            end_index_1 = text.find(end_keyword_1, start_index)
            end_index_2 = text.find(end_keyword_2, start_index) if end_keyword_2 else -1
            
            if end_index_1 == -1 and end_index_2 == -1:
                return text[start_index:].strip().replace('\n', ' ')
            elif end_index_1 != -1 and (end_index_2 == -1 or end_index_1 < end_index_2):
                return text[start_index:end_index_1].strip().replace('\n', ' ')
            elif end_index_2 != -1 and (end_index_1 == -1 or end_index_2 < end_index_1):
                return text[start_index:end_index_2].strip().replace('\n', ' ')
        return None

    # Extract relevant details
    author = extract_detail("Author", "Title")
    title = extract_detail("Title", "Credits")
    credits = extract_detail("Credits", "Summary")
    summary = extract_detail("Summary", "Language")
    language = extract_detail("Language", "LoC Class")
    category = extract_detail("Category", "EBook-No")
    release_date = extract_detail("Release Date", "Most Recently Updated", "Copyright Status")

    # Extract subjects using "Subject" as delimiter
    subjects_text = extract_detail("Subject", "Category")
    subjects = [subject.strip() for subject in subjects_text.split('Subject')[1:]] if subjects_text else []
    subjects = subjects[:10]  # Limit to the first 10 subjects

    # Extract the plain text link
    plain_text_link = None
    link_tag = soup.find("a", string=lambda text: text and "Plain Text" in text)
    if link_tag:
        plain_text_link = "https://www.gutenberg.org" + link_tag['href']

    # Create a dictionary for the current ebook
    ebook_data = {
        'Author': author,
        'Title': title,
        'Credits': credits,
        'Summary': summary,
        'Language': language,
        'Category': category,
        'Release Date': release_date,
        'Plain Text Link': plain_text_link
    }

    # Add subjects to the dictionary, ensuring a maximum of 10
    for i in range(1, 11):  # 1 to 10 for subjects
        ebook_data[f'Subject{i}'] = subjects[i - 1] if i - 1 < len(subjects) else None

    # Filter out unwanted text patterns
    unwanted_patterns = [
        "Copyright Status",
        "Downloads",
        "Project Gutenberg",
        "Privacy policy",
        "Terms of Use",
        "Contact Information",
        "Get Help"
    ]

    # Check for unwanted patterns in the data before adding to the list
    if not any(pattern in str(ebook_data.values()) for pattern in unwanted_patterns):
        return ebook_data
    return None

# Loop through each five-digit number to construct the URL and scrape data
for number in five_digit_numbers:
    url = f"{base_url}{number}"
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        ebook_info = extract_info(soup)
        if ebook_info:  # Only append if ebook_info is not None
            all_data.append(ebook_info)
    else:
        print(f"Failed to retrieve: {url}")

# Create a DataFrame from the extracted data
df = pd.DataFrame(all_data)

# Step 6: Save to CSV
df.to_csv('extracted_ebooks_info.csv', index=False, sep=',')
print("Data saved successfully to 'extracted_ebooks_info.csv'.")


In [113]:
print(df.head())

                                              Author  \
0                       Marryat, Florence, 1837-1899   
1  Mead, Lucia True Ames, 1856-1936   LoC No.  07...   
2      Wharton, Edith, 1862-1937   LoC No.  25008793   
3                             Coppard, William Isaac   
4  Nicholson, Meredith, 1866-1947   LoC No.  2101...   

                                               Title  \
0  A moment of madness, and other stories (vol. 1...   
1  Memoirs of a millionaire   Original Publicatio...   
2  The mother's recompense   Original Publication...   
3  Cottage scenes during the cholerabeing extract...   
4  The man in the street: Papers on American topi...   

                                             Credits  \
0  Emmanuel Ackerman and the Online Distributed P...   
1  Richard Tonsing and the Online Distributed Pro...   
2  Emmanuel Ackerman, David E. Brown, and the Onl...   
3  Transcribed from the 1848 F. & J. Rivington ed...   
4  D A Alexander, David E. Brown, and the Onli