In [2]:
import pandas as pd

# Load the books CSV into a DataFrame (adjust the relative path if needed),
# then show its (rows, columns) dimensions.
df = pd.read_csv(filepath_or_buffer="../data/openlibrary_books.csv")
df.shape

(1658, 6)

In [3]:
# List the column labels of the loaded frame
df.columns

Index(['work_key', 'title', 'authors', 'subjects', 'first_publish_year',
       'cover_url'],
      dtype='object')

In [4]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url
0,/works/OL66554W,Pride and Prejudice,Jane Austen,"Fiction, Romance, Historical, Regency, British...",1813,https://covers.openlibrary.org/b/id/14348537-L...
1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,"Alice (fictitious character : carroll), fictio...",1865,https://covers.openlibrary.org/b/id/10527843-L...
2,/works/OL8193416W,The Picture of Dorian Gray,Oscar Wilde,British and irish fiction (fictional works by ...,1890,https://covers.openlibrary.org/b/id/14314858-L...
3,/works/OL21177W,Wuthering Heights,Emily Brontë,British and irish fiction (fictional works by ...,1846,https://covers.openlibrary.org/b/id/12818862-L...
4,/works/OL8193497W,A Christmas Carol,Charles Dickens,"Ghost stories, Readers, Ebenzer Scrooge (Ficti...",1843,https://covers.openlibrary.org/b/id/13299222-L...


In [5]:
# Keep only the first row for each work_key, then renumber the index from 0
df = (
    df.drop_duplicates(subset=["work_key"], keep="first")
    .reset_index(drop=True)
)

In [6]:
# Simplify authors: keep only the text before the first comma
first_author = df["authors"].str.split(",", n=1).str.get(0)
df["main_author"] = first_author

In [7]:
# normalize genres
def clean_subjects(s, max_subjects=3):
    """Normalize a comma-separated subject string.

    Each comma-separated piece is stripped of surrounding whitespace and
    lower-cased. Empty pieces (e.g. from ``"a, , b"`` or a trailing comma)
    are discarded, duplicates are removed keeping the first occurrence, and
    the first ``max_subjects`` remaining entries are joined with ``", "``.

    Parameters
    ----------
    s : str or missing value
        Raw subject string. Anything ``pd.isna`` treats as missing
        (``None``, ``NaN``) yields ``None``.
    max_subjects : int, default 3
        Maximum number of distinct subjects to keep.

    Returns
    -------
    str or None
        The normalized subject string, or ``None`` for missing input.
    """
    if pd.isna(s):
        return None
    tokens = (piece.strip().lower() for piece in s.split(","))
    # dict.fromkeys de-duplicates in O(n) while preserving first-seen order;
    # the `if tok` filter drops empty tokens so they never count as a subject.
    unique = dict.fromkeys(tok for tok in tokens if tok)
    return ", ".join(list(unique)[:max_subjects])

# Build the normalized subjects column by running clean_subjects on each raw value
df["clean_subjects"] = df["subjects"].map(clean_subjects)


In [13]:
# Coerce publish years to numeric; values that cannot be parsed become NaN
df["first_publish_year"] = pd.to_numeric(
    arg=df["first_publish_year"],
    errors="coerce",
)

In [14]:
# Preview the first five rows, now including the derived columns
df.head(n=5)

Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url,main_author,clean_subjects
0,/works/OL66554W,Pride and Prejudice,Jane Austen,"Fiction, Romance, Historical, Regency, British...",1813,https://covers.openlibrary.org/b/id/14348537-L...,Jane Austen,"fiction, romance, historical"
1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,"Alice (fictitious character : carroll), fictio...",1865,https://covers.openlibrary.org/b/id/10527843-L...,Lewis Carroll,"alice (fictitious character : carroll), fictio..."
2,/works/OL8193416W,The Picture of Dorian Gray,Oscar Wilde,British and irish fiction (fictional works by ...,1890,https://covers.openlibrary.org/b/id/14314858-L...,Oscar Wilde,british and irish fiction (fictional works by ...
3,/works/OL21177W,Wuthering Heights,Emily Brontë,British and irish fiction (fictional works by ...,1846,https://covers.openlibrary.org/b/id/12818862-L...,Emily Brontë,british and irish fiction (fictional works by ...
4,/works/OL8193497W,A Christmas Carol,Charles Dickens,"Ghost stories, Readers, Ebenzer Scrooge (Ficti...",1843,https://covers.openlibrary.org/b/id/13299222-L...,Charles Dickens,"ghost stories, readers, ebenzer scrooge (ficti..."


In [15]:
# Show the (rows, columns) dimensions of the frame after the cleaning steps
df.shape

(1658, 8)

In [16]:
# Save the cleaned dataset as a CSV in the current working directory.
# The path is held in one variable so the confirmation message always names
# the file that was actually written (the old message printed a different name).
output_path = "clean_openlibrary_books.csv"
df.to_csv(output_path, index=False)  # index=False: do not write the row index
print(f"Saved -> {output_path} with {len(df)} rows")

Saved -> openlibrary_books.csv with 1658 rows
