In [15]:

import requests, time, pandas as pd
from pprint import pprint

BASE = "https://openlibrary.org"
HEADERS = {"User-Agent": "BookRecsEduBot/1.0 (+https://example.edu)"}
session = requests.Session(); session.headers.update(HEADERS)


In [16]:

# Start test
subjects = [
    "fiction",
    "fantasy",
    "mystery",
    "history",
    "science_fiction",
]
subjects


['fiction', 'fantasy', 'mystery', 'history', 'science_fiction']

## 2) Fetch ONE subject page (peek at the structure)

In [17]:
# Fetch ONE subject page

subject = subjects[0] 
offset = 0
limit = 20

url = f"{BASE}/subjects/{subject}.json?limit={limit}&offset={offset}"
print(url)
resp = session.get(url, timeout=30)
resp.raise_for_status()
data = resp.json()

print("Keys at top level:", list(data.keys()))
print("How many works returned:", len(data.get("works", [])))
pprint(data.get("works", [])[0])  

https://openlibrary.org/subjects/fiction.json?limit=20&offset=0
Keys at top level: ['key', 'name', 'subject_type', 'work_count', 'works']
How many works returned: 20
{'authors': [{'key': '/authors/OL21594A', 'name': 'Jane Austen'}],
 'availability': {'__src__': 'core.models.lending.get_availability',
                  'available_to_borrow': False,
                  'available_to_browse': False,
                  'available_to_waitlist': False,
                  'identifier': 'bwb_KS-179-237',
                  'is_browseable': False,
                  'is_lendable': False,
                  'is_previewable': True,
                  'is_printdisabled': False,
                  'is_readable': True,
                  'is_restricted': False,
                  'isbn': None,
                  'last_loan_date': None,
                  'last_waitlist_date': None,
                  'num_waitlist': None,
                  'oclc': None,
                  'openlibrary_edition': 'OL50444320M',
    

In [18]:
# Normalize ONE work into a friendly Python dict
def normalize_work(work, subject_name):
    # Title & key
    title = work.get("title") or ""
    key = work.get("key") or ""

    # Authors list (names)
    authors = []
    for a in (work.get("authors") or []):
        name = a.get("name")
        if name:
            authors.append(name)
    authors_str = ", ".join(authors) if authors else None

    # Subjects/genres
    subjects_list = work.get("subject") or work.get("subjects") or []
    # Also include the subject we queried as a tag
    if subject_name.replace("_", " ") not in [str(s).lower() for s in subjects_list]:
        subjects_list = list(subjects_list) + [subject_name.replace("_", " ")]
    subjects_str = ", ".join(map(str, subjects_list[:20])) if subjects_list else None

    # first publish year & cover
    first_year = work.get("first_publish_year")
    cover_id = work.get("cover_id")
    cover_url = f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg" if isinstance(cover_id, int) else None

    return {
        "work_key": key,
        "title": title,
        "authors": authors_str,
        "subjects": subjects_str,
        "first_publish_year": first_year,
        "cover_url": cover_url
    }

sample = normalize_work(data["works"][0], subject)
pprint(sample)


{'authors': 'Jane Austen',
 'cover_url': 'https://covers.openlibrary.org/b/id/14348537-L.jpg',
 'first_publish_year': 1813,
 'subjects': 'Fiction, Romance, Historical, Regency, British and Irish fiction '
             '(fictional works by one author), Brothers and sisters, '
             'Courtship, Drama, English fiction, English literature, Families, '
             'Family, Family life, Family relations, Fiction Classics, '
             'History, Interpersonal relations, Literary Fiction, Love '
             'stories, manners, Manners and customs, marriage, Sisters',
 'title': 'Pride and Prejudice',
 'work_key': '/works/OL66554W'}


In [19]:
# Turn one subject page into a DF
rows = [normalize_work(w, subject) for w in data.get("works", [])]
df = pd.DataFrame(rows)
df.head()


Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url
0,/works/OL66554W,Pride and Prejudice,Jane Austen,"Fiction, Romance, Historical, Regency, British...",1813,https://covers.openlibrary.org/b/id/14348537-L...
1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,"Alice (fictitious character : carroll), fictio...",1865,https://covers.openlibrary.org/b/id/10527843-L...
2,/works/OL8193416W,The Picture of Dorian Gray,Oscar Wilde,British and irish fiction (fictional works by ...,1890,https://covers.openlibrary.org/b/id/14314858-L...
3,/works/OL21177W,Wuthering Heights,Emily Brontë,British and irish fiction (fictional works by ...,1846,https://covers.openlibrary.org/b/id/12818862-L...
4,/works/OL8193497W,A Christmas Carol,Charles Dickens,"Ghost stories, Readers, Ebenzer Scrooge (Ficti...",1843,https://covers.openlibrary.org/b/id/13299222-L...


In [20]:
 # collect a few pages
def fetch_subject(subject, target=100, per_page=50, sleep=0.2):
    rows = []
    seen = set()
    offset = 0
    while len(rows) < target:
        url = f"{BASE}/subjects/{subject}.json?limit={per_page}&offset={offset}"
        r = session.get(url, timeout=30)
        r.raise_for_status()
        js = r.json()
        works = js.get("works", [])
        if not works:
            break
        for w in works:
            rec = normalize_work(w, subject)
            if rec["work_key"] and rec["work_key"] not in seen:
                seen.add(rec["work_key"])
                rows.append(rec)
                if len(rows) >= target:
                    break
        offset += per_page
        time.sleep(sleep)
    return pd.DataFrame(rows)

demo_df = fetch_subject("fiction", target=150, per_page=50)
len(demo_df), demo_df.head(3)


(150,
             work_key                             title        authors  \
 0    /works/OL66554W               Pride and Prejudice    Jane Austen   
 1   /works/OL138052W  Alice's Adventures in Wonderland  Lewis Carroll   
 2  /works/OL8193416W        The Picture of Dorian Gray    Oscar Wilde   
 
                                             subjects  first_publish_year  \
 0  Fiction, Romance, Historical, Regency, British...                1813   
 1  Alice (fictitious character : carroll), fictio...                1865   
 2  British and irish fiction (fictional works by ...                1890   
 
                                            cover_url  
 0  https://covers.openlibrary.org/b/id/14348537-L...  
 1  https://covers.openlibrary.org/b/id/10527843-L...  
 2  https://covers.openlibrary.org/b/id/14314858-L...  )

In [21]:
 # Combine subjects 
def collect(subjects, target_total=300, per_page=50, sleep=0.2):
    all_rows = []
    seen = set()
    for subj in subjects:
        if len(all_rows) >= target_total: break
        part = fetch_subject(subj, target=max(50, target_total//len(subjects)), per_page=per_page, sleep=sleep)
        for _, row in part.iterrows():
            if row["work_key"] in seen:
                continue
            seen.add(row["work_key"])
            all_rows.append(row.to_dict())
    df_all = pd.DataFrame(all_rows)
    return df_all

small_df = collect(subjects, target_total=300, per_page=50)
len(small_df), small_df.head(3)


(240,
             work_key                             title        authors  \
 0    /works/OL66554W               Pride and Prejudice    Jane Austen   
 1   /works/OL138052W  Alice's Adventures in Wonderland  Lewis Carroll   
 2  /works/OL8193416W        The Picture of Dorian Gray    Oscar Wilde   
 
                                             subjects  first_publish_year  \
 0  Fiction, Romance, Historical, Regency, British...                1813   
 1  Alice (fictitious character : carroll), fictio...                1865   
 2  British and irish fiction (fictional works by ...                1890   
 
                                            cover_url  
 0  https://covers.openlibrary.org/b/id/14348537-L...  
 1  https://covers.openlibrary.org/b/id/10527843-L...  
 2  https://covers.openlibrary.org/b/id/14314858-L...  )

In [22]:
# Save a small 249 rows CSV 
small_df.to_csv("openlibrary_small_checkpoint.csv", index=False)
print("Saved", len(small_df), "rows -> openlibrary_small_checkpoint.csv")


Saved 240 rows -> openlibrary_small_checkpoint.csv


In [24]:
# Scale up to 2k rows
bigger_subjects = subjects + ["romance", "young_adult", "biography", "horror", "philosophy", "science"]
big_df = collect(bigger_subjects, target_total=2000, per_page=100, sleep=0.2)
print("Total rows collected:", len(big_df))
big_df.head(3)


Total rows collected: 1658


Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url
0,/works/OL66554W,Pride and Prejudice,Jane Austen,"Fiction, Romance, Historical, Regency, British...",1813,https://covers.openlibrary.org/b/id/14348537-L...
1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,"Alice (fictitious character : carroll), fictio...",1865,https://covers.openlibrary.org/b/id/10527843-L...
2,/works/OL8193416W,The Picture of Dorian Gray,Oscar Wilde,British and irish fiction (fictional works by ...,1890,https://covers.openlibrary.org/b/id/14314858-L...


In [26]:
# Drop exact duplicates by work key
big_df = big_df.drop_duplicates(subset=["work_key"]).reset_index(drop=True)

# Keep essential columns
cols = ["work_key","title","authors","subjects","first_publish_year","cover_url"]
for c in cols:
    if c not in big_df.columns:
        big_df[c] = None
big_df = big_df[cols]

print("After dedupe:", len(big_df))
big_df.sample(5)


After dedupe: 1658


Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url
257,/works/OL7989179W,Vikram and the Vampire,"Richard Francis Burton, Ernest Henry Griset, I...","Fantasy, Fiction, Historical Fiction, Classic ...",1870,https://covers.openlibrary.org/b/id/1758137-L.jpg
1480,/works/OL100196W,"The ""genius""",Theodore Dreiser,"American fiction, Translations into Russian, R...",1915,https://covers.openlibrary.org/b/id/8369941-L.jpg
150,/works/OL81180W,Lady Chatterley's Lover,D. H. Lawrence,"Sexual behavior, Married women, Adultery, Nobi...",1900,https://covers.openlibrary.org/b/id/12983362-L...
1615,/works/OL21331191W,Handbook of Flowering,Abraham H. Halevy,"Cultivated Plants, Handbooks, manuals, Floweri...",2017,https://covers.openlibrary.org/b/id/13300590-L...
1559,/works/OL17095301W,What If?,Randall Munroe,"Mathematics, Statistics, Miscellanea, Science,...",1980,https://covers.openlibrary.org/b/id/9083979-L.jpg


In [27]:
# save big csv
big_df.to_csv("openlibrary_books.csv", index=False)
print("Saved -> openlibrary_books.csv with", len(big_df), "rows")


Saved -> openlibrary_books.csv with 1658 rows


In [28]:

# Top subjects (split the comma-separated list)
from collections import Counter
subj_counter = Counter()
for s in big_df["subjects"].dropna():
    for tok in map(str.strip, s.split(",")):
        if tok:
            subj_counter[tok.lower()] += 1

pd.DataFrame(subj_counter.most_common(20), columns=["subject","count"])


Unnamed: 0,subject,count
0,fiction,3818
1,general,693
2,history,349
3,science fiction,346
4,fantasy,344
5,romance,313
6,juvenile fiction,278
7,mystery & detective,268
8,classic literature,255
9,children's fiction,254
