In [2]:
import json
import random
import time
import warnings
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display, HTML


In [3]:
# ⚙️ Notebook-wide settings
pd.options.display.max_columns = None  # no column limit: show every column of wide frames
warnings.simplefilter('ignore')  # suppress every warning category from here on

In [4]:


def get_books_by_subject(subject, limit=500, batch_size=10, delay=2):
    """Download up to ``limit`` works for a subject from the Open Library subjects endpoint.

    The subject is lower-cased and its spaces are replaced by underscores to
    build ``https://openlibrary.org/subjects/<subject>.json``. Results are
    requested page by page using the ``limit`` / ``offset`` query parameters.

    A page that fails (non-200 status or any exception while requesting or
    decoding) is retried at the *same* offset after a back-off, so a transient
    error no longer leaves a silent gap in the result. Fetching stops when
    ``limit`` works have been requested, when a page comes back shorter than
    requested, or after 5 consecutive failures.

    Args:
        subject: Subject name, e.g. ``"science fiction"``.
        limit: Maximum number of works to request in total.
        batch_size: Number of works requested per HTTP call; must be >= 1.
        delay: Base pause in seconds between successful requests. Failed
            requests wait ``delay * 2`` (HTTP error) or ``delay * 3``
            (exception) before retrying.

    Returns:
        list: The entries of each response's ``"works"`` array, in the order
        received. May be shorter than ``limit`` (or empty) when the subject
        runs out of works or the failure budget is exhausted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_books = []
    failures = 0
    max_failures = 5

    # The endpoint depends only on the subject, so build it once.
    formatted_subject = subject.replace(" ", "_").lower()
    endpoint = f"https://openlibrary.org/subjects/{formatted_subject}.json"

    offset = 0
    while offset < limit:
        # The last page may be smaller than batch_size.
        current_batch_size = min(batch_size, limit - offset)

        print(f"Fetching books {offset+1}-{offset+current_batch_size} of {limit}...")

        params = {
            "limit": current_batch_size,
            "offset": offset
        }

        # backoff stays None on success; otherwise it holds the retry pause.
        backoff = None
        books = []
        try:
            response = requests.get(endpoint, params=params, timeout=30)

            if response.status_code == 200:
                books = response.json().get("works", [])
            else:
                failures += 1
                print(f"Error: {response.status_code}. Attempt {failures} of {max_failures}")
                backoff = delay * 2
        except Exception as e:
            # Deliberately broad: this is a best-effort scraper and any
            # network/decoding problem is treated as a retryable failure.
            failures += 1
            print(f"Exception: {type(e).__name__}: {str(e)}. Attempt {failures} of {max_failures}")
            backoff = delay * 3

        if backoff is not None:
            if failures >= max_failures:
                print("Too many consecutive failures. Stopping.")
                break

            # Wait longer than usual, then retry the same offset.
            time.sleep(backoff)
            continue

        all_books.extend(books)
        print(f"Retrieved {len(books)} books.")

        # A success resets the consecutive-failure counter.
        failures = 0

        # A short page means the subject has no more works to offer.
        if len(books) < current_batch_size:
            print(f"Only {len(books)} books available. Stopping.")
            break

        offset += current_batch_size

        # Respect rate limits, but do not wait after the final page.
        if offset < limit:
            print(f"Waiting {delay} second(s) before next request...")
            time.sleep(delay + random.uniform(0, 1))  # Add small random delay

    print(f"Total books retrieved: {len(all_books)}")
    return all_books

# Field names found on each work record returned by the subjects endpoint.
list_of_columns = [
    "key",
    "title",
    "edition_count",
    "cover_id",
    "cover_edition_key",
    "subject",
    "ia_collection",
    "printdisabled",
    "lending_edition",
    "lending_identifier",
    "authors",
    "first_publish_year",
    "ia",
    "public_scan",
    "has_fulltext",
    "availability",
]


In [5]:
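# NOTE: this call is disabled, presumably because the full pull is slow
# (10000 / 10 = 1000 requests, each followed by a pause). Later cells read
# `fiction_books`, so on a fresh kernel this line must be run once before them.
# fiction_books = get_books_by_subject("fiction", limit=10000, batch_size=10, delay=2)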

In [6]:
# Request 100 works for the "poetry" subject, ten per call.
poetry_books = get_books_by_subject(
    subject="poetry",
    limit=100,
    batch_size=10,
    delay=2,
)

Fetching books 1-10 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 11-20 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 21-30 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 31-40 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 41-50 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 51-60 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 61-70 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 71-80 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 81-90 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Fetching books 91-100 of 100...
Retrieved 10 books.
Waiting 2 second(s) before next request...
Total books retrieved: 100


In [5]:
# Show the subject tags attached to the second work in the fiction pull.
print(fiction_books[1]["subject"])

['Alice (fictitious character : carroll), fiction', 'British and irish fiction (fictional works by one author)', 'Fiction, fantasy, general', 'JUVENILE FICTION', 'classics', 'Fantasy & Magic', 'Imagination & Play', 'adventure and adventurers', 'adventure and adventurers, fiction', 'adventure stories', 'adventure travel', 'animals', 'anthropomorphism', "artists' illustrated books", 'books and reading', 'child and youth fiction', 'children', "children's fiction", "children's literature", "children's literature, english", "children's stories", "children's stories, english", 'classic literature', 'coloring books', 'croquet', 'cuentos infantiles ingleses', 'curiosidad', 'curiosidad en los niños', 'curiosity', 'curiosity in children', 'english', 'english adventure stories', 'english fantastic fiction', 'english fantasy fiction', 'english fantasy literature', 'english language', 'english literature', 'english nonsense verses', 'fairy tales', 'fantasy', 'fantasy fiction', 'fantasy in fiction',

In [6]:
# One row per work; the dictionary keys of each record become the columns.
fiction_df = pd.DataFrame(data=fiction_books)


In [9]:
# Preview the first rows before persisting the raw pull.
display(fiction_df.head(20))

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists; otherwise the write fails on a fresh checkout.
fiction_csv_path = Path("../data/raw/openlibrary_fiction_10k.csv")
fiction_csv_path.parent.mkdir(parents=True, exist_ok=True)
fiction_df.to_csv(fiction_csv_path, index=False)

Unnamed: 0,key,title,edition_count,cover_id,cover_edition_key,subject,ia_collection,printdisabled,lending_edition,lending_identifier,authors,first_publish_year,ia,public_scan,has_fulltext,availability
0,/works/OL66554W,Pride and Prejudice,4035,14348537.0,OL47044678M,"[Fiction, Romance, Historical, Regency, Britis...","[365-Books-by-Women-Authors, additional_collec...",True,OL50444320M,bwb_KS-179-237,"[{'key': '/authors/OL21594A', 'name': 'Jane Au...",1813,bwb_KS-179-237,True,True,"{'status': 'open', 'available_to_browse': Fals..."
1,/works/OL138052W,Alice's Adventures in Wonderland,3546,10527843.0,OL31754751M,"[Alice (fictitious character : carroll), ficti...","[Boston_College_Library, additional_collection...",True,OL45637056M,alicesadventures0000unse_v7d2,"[{'key': '/authors/OL22098A', 'name': 'Lewis C...",1865,alicesadventures0000unse_v7d2,True,True,"{'status': 'open', 'available_to_browse': Fals..."
2,/works/OL21177W,Wuthering Heights,2850,12818862.0,OL38586477M,[British and irish fiction (fictional works by...,"[365-Books-by-Women-Authors, JaiGyan, Servants...",True,OL57648863M,wutheringheights0000kesh,"[{'key': '/authors/OL24529A', 'name': 'Emily B...",1846,wutheringheights0000kesh,True,True,"{'status': 'open', 'available_to_browse': Fals..."
3,/works/OL8193497W,A Christmas Carol,2727,13299222.0,,"[Ghost stories, Readers, Ebenzer Scrooge (Fict...","[additional_collections, americana, aozorabunk...",True,OL50494370M,christmascarol00dick_7,"[{'key': '/authors/OL24638A', 'name': 'Charles...",1843,christmascarol00dick_7,True,True,"{'status': 'open', 'available_to_browse': Fals..."
4,/works/OL53908W,Adventures of Huckleberry Finn,2552,8157718.0,OL7062714M,"[adventure and adventurers, Adventure stories,...","[additional_collections, album_recordings, ame...",True,OL48061167M,adventuresofhuck0000mark_f4w4,"[{'key': '/authors/OL18319A', 'name': 'Mark Tw...",1876,adventuresofhuck0000mark_f4w4,True,True,"{'status': 'open', 'available_to_browse': Fals..."
5,/works/OL8193416W,The Picture of Dorian Gray,2297,14314858.0,,[British and irish fiction (fictional works by...,"[additional_collections, americana, americanun...",True,OL25669538M,temunatdoryangri008800,"[{'key': '/authors/OL20646A', 'name': 'Oscar W...",1890,temunatdoryangri008800,True,True,"{'status': 'open', 'available_to_browse': Fals..."
6,/works/OL66513W,Emma,2260,9278312.0,OL13573615M,"[Social life and customs, Mate selection, Fict...","[California-State-Suggested-Reading, additiona...",True,OL50293425M,isbn_9781905716890,"[{'key': '/authors/OL21594A', 'name': 'Jane Au...",1815,isbn_9781905716890,True,True,"{'status': 'open', 'available_to_browse': Fals..."
7,/works/OL8193478W,Oliver Twist,2210,13300802.0,,"[Bildungsromans, Boys, Brigands and robbers, B...","[Princeton, americana, audio_bookspoetry, beth...",True,OL46835116M,olivertwist0000char_v6j2,"[{'key': '/authors/OL24638A', 'name': 'Charles...",1822,olivertwist0000char_v6j2,True,True,"{'status': 'open', 'available_to_browse': Fals..."
8,/works/OL450063W,Frankenstein or The Modern Prometheus,2183,12356249.0,OL35649409M,"[Frankenstein (Fictitious character), Frankens...","[additional_collections, americana, americanun...",True,OL26683337M,frankensteinormo00shel_8,"[{'key': '/authors/OL25342A', 'name': 'Mary Sh...",1818,frankensteinormo00shel_8,True,True,"{'status': 'open', 'available_to_browse': Fals..."
9,/works/OL8193465W,A Tale of Two Cities,2059,13301713.0,,"[British, British and irish fiction (fictional...","[ColumbiaUniversityLibraries, JaiGyan, america...",True,OL25709295M,a-tale-of-two-cities,"[{'key': '/authors/OL24638A', 'name': 'Charles...",1800,a-tale-of-two-cities,True,True,"{'status': 'open', 'available_to_browse': Fals..."
