Remember to get right in this project:
- Folder structure
- Docstrings
- PEP8
- Exception handling

In [1359]:
# !python3 -m venv venv

In [1360]:
# !pip install numpy
# !pip install pandas
#!pip install matplotlib
# !pip install sqlalchemy
# !pip install ipython-sql
# !pip install python-dotenv
# !pip install psycopg2



In [1361]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv


load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [1362]:
%sql postgresql://postgres:password@localhost/books

In [1363]:
import regex as re
from pandas.tseries.offsets import *
from datetime import date
import ast

In [1364]:
import pandas as pd
df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv')

In [1365]:
# df[df['description'].str[:4]== 'ISBN']

This is a function to create summary statistics for *string fields* to show before and after a transform:

Dropping duplicates:

In [1371]:
print(df.shape)
print(df[df.duplicated()].shape)

(52478, 25)
(50, 25)


In [1368]:
df = df.drop_duplicates()


In [1372]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            52478 non-null  object 
 1   title             52478 non-null  object 
 2   series            23470 non-null  object 
 3   author            52478 non-null  object 
 4   rating            52478 non-null  float64
 5   description       51140 non-null  object 
 6   language          48672 non-null  object 
 7   isbn              52478 non-null  object 
 8   genres            52478 non-null  object 
 9   characters        52478 non-null  object 
 10  bookFormat        51005 non-null  object 
 11  edition           4955 non-null   object 
 12  pages             50131 non-null  object 
 13  publisher         48782 non-null  object 
 14  publishDate       51598 non-null  object 
 15  firstPublishDate  31152 non-null  object 
 16  awards            52478 non-null  object

In [1370]:
df.describe(include='all')

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
count,52478,52478,23470,52478,52478.0,51140,48672,52478.0,52478,52478,...,31152,52478,52478.0,52478,51856.0,52478,51873,52478.0,52478.0,38113.0
unique,52424,49927,22802,28227,,50888,81,48072.0,44154,12448,...,8024,9215,,49908,,4651,51819,,,3776.0
top,650584.Shadowdale,Legacy,Star Wars Legends,Nora Roberts (Goodreads Author),,"هذه هي طبعة ""دار الفكر - بيروت"" وهي آخر طبعة ع...",English,9999999999999.0,[],[],...,01/01/12,[],,[],,[],https://i.gr-assets.com/images/S/compressed.ph...,,,5.2
freq,2,14,15,86,,37,42661,4354.0,4623,38712,...,225,41864,,1423,,40900,2,,,626.0
mean,,,,,4.021878,,,,,,...,,,17878.65,,92.231545,,,1984.023,22.529003,
std,,,,,0.367146,,,,,,...,,,103944.8,,5.990689,,,35153.14,369.158541,
min,,,,,0.0,,,,,,...,,,0.0,,0.0,,,0.0,-4.0,
25%,,,,,3.82,,,,,,...,,,341.0,,90.0,,,84.0,1.0,
50%,,,,,4.03,,,,,,...,,,2307.0,,94.0,,,97.0,1.0,
75%,,,,,4.23,,,,,,...,,,9380.5,,96.0,,,187.0,2.0,


Get rows where ISBN is duplicated and is not '999999999999' placeholder

In [1254]:
# duplicated_isbns = df[(df['isbn']!= '9999999999999') & (df.duplicated(subset='isbn', keep=False))]
# duplicated_isbns

Dropping rows with all nulls:

In [1255]:
df = df.dropna(how='all')

Flagging rows with duplicate isbns that are not 9999999999999 placeholder values:

In [1256]:
def flag_duplicate_isbns(df):
    """
    Flag duplicate ISBNs in the DataFrame, excluding None.

    Adds a new column 'is_duplicate_isbn' to the DataFrame. The column is set
    to True for rows with duplicate ISBNs (excluding None) and False
    otherwise.

    :param df: DataFrame with a column named 'isbn'
    :return: DataFrame with 'is_duplicate_isbn' column added
    :raises ValueError: If 'isbn' column is missing in the DataFrame
    """
    try:
        if 'isbn' not in df.columns:
            raise ValueError("'isbn' column is missing in the DataFrame.")

        df['is_duplicate_isbn'] = False
        mask = (df['isbn'] != None) & df.duplicated(subset='isbn', keep=False)
        df.loc[mask, 'is_duplicate_isbn'] = True
        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return df

# Usage example:
try:
    df = flag_duplicate_isbns(df)
except ValueError as e:
    print(e)

In [1257]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn'],
      dtype='object')

In [1258]:
def rm_duplicates(df):
    """
    Process a DataFrame by preserving rows with ISBN None 
    and removing duplicates in ISBN for other rows.

    :param df: Input DataFrame containing a column 'isbn'
    :return: Processed DataFrame with specified rows and no duplicates in ISBN
    :raises ValueError: If 'isbn' column is missing
    """
    try:
        if 'isbn' not in df.columns:
            raise ValueError("'isbn' column is missing in the DataFrame.")
        result_df = df[(df['isbn'] == None) | ~df.duplicated(subset='isbn', keep=False)]
        return result_df

    except Exception as e:
        print(f"No changes made. An error occurred: {e}.")
        return df

#Usage:
df = rm_duplicates(df)

Removing all new lines (\n) and leading and trailing whitespace from all rows

In [1259]:
df = df.applymap(lambda r: r.strip() if type(r) == str else r)
df = df.applymap(lambda r: r.replace('\n', ' ') if type(r) == str else r)

This will get all the rows in ISBN that start with a letter:

In [1260]:
# df[df['isbn'].str[0].str.isalpha()]


In [1261]:
# df[df['description'].str[:4]== 'ISBN']

In [1262]:
df.head(20)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09,False
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38,False
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1,False
5,19063.The_Book_Thief,The Book Thief,,Markus Zusak (Goodreads Author),4.37,Librarian's note: An alternate cover edition c...,English,9780375831003,"['Historical Fiction', 'Fiction', 'Young Adult...","['Liesel Meminger', 'Hans Hubermann', 'Rudy St...",...,['National Jewish Book Award for Children’s an...,1834276,"['1048230', '524674', '186297', '48864', '26211']",96.0,"['Molching (Germany)', 'Germany']",https://i.gr-assets.com/images/S/compressed.ph...,1372809,14168,3.8,False
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['Prometheus Hall of Fame Award (2011)', 'Retr...",2740713,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False
8,30.J_R_R_Tolkien_4_Book_Boxed_Set,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,The Lord of the Rings #0-3,J.R.R. Tolkien,4.6,"This four-volume, boxed set contains J.R.R. To...",English,9780345538376,"['Fantasy', 'Fiction', 'Classics', 'Adventure'...","['Frodo Baggins', 'Gandalf', 'Bilbo Baggins', ...",...,[],110146,"['78217', '22857', '6628', '1477', '967']",98.0,['Middle-earth'],https://i.gr-assets.com/images/S/compressed.ph...,1159802,12111,21.15,False
9,18405.Gone_with_the_Wind,Gone with the Wind,,Margaret Mitchell,4.3,"Scarlett O'Hara, the beautiful, spoiled daught...",English,9780446675536,"['Classics', 'Historical Fiction', 'Fiction', ...","[""Scarlett O'Hara"", 'Rhett Butler', 'Ashley Wi...",...,"['Pulitzer Prize for Novel (1937)', 'National ...",1074620,"['602138', '275517', '133535', '39008', '24422']",94.0,"['Atlanta, Georgia (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,1087732,11211,5.58,False
12,370493.The_Giving_Tree,The Giving Tree,,Shel Silverstein,4.37,"""Once there was a tree...and she loved a littl...",English,9780060256654,"['Childrens', 'Picture Books', 'Classics', 'Fi...",[],...,[],905731,"['556142', '204347', '94819', '27950', '22473']",94.0,[],https://i.gr-assets.com/images/S/compressed.ph...,1021534,10594,4.87,False
13,6185.Wuthering_Heights,Wuthering Heights,,"Emily Brontë, Richard J. Dunn (Editor), David ...",3.86,You can find the redesigned cover of this edit...,English,9780393978896,"['Classics', 'Fiction', 'Romance', 'Gothic', '...","['Heathcliff', 'Catherine Earnshaw', 'Edgar Li...",...,[],1342664,"['483222', '413051', '279702', '107437', '59252']",88.0,"['Yorkshire Dales, England']",https://i.gr-assets.com/images/S/compressed.ph...,981502,10275,2.73,False
15,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,,Arthur Golden,4.12,"A literary sensation and runaway bestseller, t...",English,9781400096893,"['Fiction', 'Historical Fiction', 'Romance', '...","['Sayuri Nitta', 'Mr. Bekku', 'Ichiro Tanaka',...",...,[],1717312,"['712950', '615702', '295184', '66518', '26958']",95.0,"['Kyoto (Japan)', 'Japan']",https://i.gr-assets.com/images/S/compressed.ph...,862748,8993,2.76,False


Filling ISBN field using ISBN from description field, if the ISBN field is 9999999999 or null:

Eventually rewrite this with error handling that will catch if wrong data type (non-str) is passed to the function

In [1263]:
def extract_isbn(df):
    """
    Extract and move ISBN from the description to the ISBN field if the ISBN is '9999999999999' or null.

    The function applies a regex pattern to identify ISBNs from the description field and 
    moves them to the ISBN field.

    :param df: A DataFrame containing 'isbn' and 'description' columns
    :return: The modified DataFrame
    :raises ValueError: If 'isbn' or 'description' columns are missing
    """

    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")
    if 'description' not in df.columns:
        raise ValueError("'description' column is missing in the DataFrame.")

    # First regex pattern matches 13 digits optionally preceded and followed by a non-digit character
    # Second regex pattern matches 10 digits optionally preceded and followed by a non-digit character
    # Third regex pattern matches "B" and a specific pattern of 4 digits, 3 uppercase letters, 1 digit, and 1 uppercase letter: B1234XYZ7A
    # Fourth regex matches the pattern "978-", followed by one digit, and then a sequence of either digits or dashes that is at least 9 and at most 13 characters long: 978-3-16-148410-0

    isbn_pattern = (r'((?:\D)?(\d{13})(?:\D)?|(?:\D)?(\d{10})(?:\D)?|'
                    r'B\d{4}[A-Z]{3}\d{1}[A-Z]|978-\d[-\d]{9,13})')

    mask = (df['isbn'] == '9999999999999') | pd.isnull(df['isbn'])
    descriptions = df.loc[mask, 'description'].astype(str)

    extracted_isbns = descriptions.str.extract(isbn_pattern)[0].str.replace('-', '')  # Extract ISBNs and remove dashes
    mask_first_char = ~extracted_isbns.str[0].str.isdigit()
    extracted_isbns = extracted_isbns.str[1:].where(mask_first_char, extracted_isbns)

    mask_last_char = ~extracted_isbns.str[-1].str.isdigit()  # Remove non-digit last character if exists
    extracted_isbns = extracted_isbns.str[:-1].where(mask_last_char, extracted_isbns)

    df.loc[mask, 'isbn'] = extracted_isbns

    return df

df = extract_isbn(df)

In [1265]:
filtered_df = df[(df['isbn'] != '9999999999999') & ~df['isbn'].isnull()]
filtered_df.shape

(48064, 26)

Splitting out the int after the title in series into a separate series_num column: Adds to new series_num column, removes number from series column
We're not making series_num an int, bc some cases have a range of nums, e.g. 1-3

In [1267]:
# adding the series_num column: 
df['series_num'] = None

In [1271]:
def extract_series_num(df):
    """
    Extract the series number from the 'series' column in a DataFrame.

    The function looks for the pattern following a '#' character in the 'series' column,
    extracts the series number into a new column named 'series_num', and then removes
    the matched pattern from the 'series' column, along with any leading and trailing
    whitespace.

    Parameters:
    df (pd.DataFrame): DataFrame containing the 'series' column to extract from.

    Returns:
    pd.DataFrame: DataFrame with the added 'series_num' column and modified 'series' column.

    Raises:
    ValueError: If 'series' column is not found in the DataFrame or an error occurs while extracting the series number.
    """

    if 'series' not in df.columns:
        raise ValueError("'series' column missing from the DataFrame.")

    # This regex matches the character that comes after the '#' in the series column
    series_int_pattern = r'(#.*)'
    try:
        serieses = df['series'].astype(str)
        extracted_series_num = serieses.str.extract(series_int_pattern)[0]
        extracted_series_num = extracted_series_num.str.replace('#', '')
        df['series_num'] = extracted_series_num
        df['series'] = df['series'].str.replace(series_int_pattern, '', regex=True).str.strip()
    except Exception as e:
        raise ValueError("An error occurred while extracting series number.") from e

    return df

df = extract_series_num(df)


In [1272]:
df[['series', 'series_num']].head(20)

Unnamed: 0,series,series_num
0,The Hunger Games,1
1,Harry Potter,5
4,The Twilight Saga,1
5,,
6,,
8,The Lord of the Rings,0-3
9,,
12,,
13,,
15,,


In [1273]:
print(df.bookId.shape)
df.bookId.unique().shape

(48064,)


(48064,)

In [1274]:
# df['bookFormat'].value_counts()

Validation for dates - checking for cases where publish date is earlier than first publish date. 

First you need to figure out what the actual format is. Then remove all non-dates. Then set the data type. Then check for cases where publish date is earlier than first publish date.

In [1275]:

df[['publishDate', 'firstPublishDate']] = df[['publishDate', 'firstPublishDate']].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y', errors='coerce')).apply(lambda x: x.dt.normalize())

In [1276]:
def fix_dates(df):
    """
    Correct the 'firstPublishDate' and 'publishDate' in the DataFrame by:
    - Subtracting 100 years from 'firstPublishDate' if greater than today's date.
    - Subtracting 100 years from 'publishDate' if greater than today's date.
    - Subtracting 100 years from 'firstPublishDate' if greater than 'publishDate'.

    :param df: DataFrame containing the 'firstPublishDate' and 'publishDate' columns.
    :type df: pd.DataFrame
    :return: None
    :raises ValueError: If 'firstPublishDate' or 'publishDate' columns are not in DataFrame.
    """
    if 'firstPublishDate' not in df.columns or 'publishDate' not in df.columns:
        raise ValueError("'firstPublishDate' or 'publishDate' columns missing from the DataFrame.")
        
    # Identify rows where firstPublishDate is greater than today and subtract 100 years
    mask_first_publish_date = df['firstPublishDate'] > pd.Timestamp.today()
    df.loc[mask_first_publish_date, 'firstPublishDate'] -= DateOffset(years=100)

    # Identify rows where publishDate is greater than today and subtract 100 years
    mask_publish_date = df['publishDate'] > pd.Timestamp.today()
    df.loc[mask_publish_date, 'publishDate'] -= DateOffset(years=100)

    # Identify rows where firstPublishDate is greater than publishDate and subtract 100 years
    mask_first_publish_vs_publish = df['firstPublishDate'] > df['publishDate']
    df.loc[mask_first_publish_vs_publish, 'firstPublishDate'] -= DateOffset(years=100)

fix_dates(df)


In [1277]:
df[['publishDate', 'firstPublishDate']].head(10)

Unnamed: 0,publishDate,firstPublishDate
0,2008-09-14,NaT
1,2004-09-28,2003-06-21
4,2006-09-06,2005-10-05
5,2006-03-14,2005-09-01
6,1996-04-28,1945-08-17
8,2012-09-25,1955-10-20
9,1999-04-01,1936-06-30
12,1964-10-07,1864-10-28
13,2002-10-28,1947-12-28
15,2005-11-22,1997-09-23


In [1278]:
# df['publishDate'].dtypes

In [1279]:
# df.shape

In [1280]:
# rows_with_nulls = df[df.isnull().sum(axis=1) >= 13]
# print(rows_with_nulls)

Strip whitespace one more time:

In [1281]:

df = df.applymap(lambda r: r.strip() if type(r) == str else r)

Adding new edition_id column to df, bc we will drop bookId column:

In [1282]:
df['edition_id'] = df.index

In [1283]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn', 'series_num',
       'edition_id'],
      dtype='object')

In [1284]:
df[df['title'] == 'Animal Farm']

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn,series_num,edition_id
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False,,6
47085,253305.Animal_Farm,Animal Farm,,"Ian Wooldridge (Adapted by), George Orwell",4.07,George Orwell’s 1945 satire on the perils of S...,English,9781854597892,"['Fiction', 'Classics', 'Dystopia', 'Classic L...",[],...,"['207', '167', '75', '23', '17']",92.0,[],https://i.gr-assets.com/images/S/compressed.ph...,58,1,2.49,False,,47085


In [1374]:
duplicate_rows = df[df.duplicated('title', keep=False)]
duplicate_rows.shape

(4218, 25)

Creating edition df:

In [1286]:
edition_df = df[['isbn', 'rating', 'numRatings', 'likedPercent', 'bbeScore', 'bbeVotes', 'bookFormat', 'edition', 'publishDate', 'description', 'series_num', 'language', 'pages', 'publisher', 'coverImg', 'price', 'edition_id', 'is_duplicate_isbn']].copy()


In [1287]:
edition_df.head()

Unnamed: 0,isbn,rating,numRatings,likedPercent,bbeScore,bbeVotes,bookFormat,edition,publishDate,description,series_num,language,pages,publisher,coverImg,price,edition_id,is_duplicate_isbn
0,9780439023481,4.33,6376780,96.0,2993816,30516,Hardcover,First Edition,2008-09-14,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,1.0,English,374,Scholastic Press,https://i.gr-assets.com/images/S/compressed.ph...,5.09,0,False
1,9780439358071,4.5,2507623,98.0,2632233,26923,Paperback,US Edition,2004-09-28,There is a door at the end of a silent corrido...,5.0,English,870,Scholastic Inc.,https://i.gr-assets.com/images/S/compressed.ph...,7.38,1,False
4,9780316015844,3.6,4964519,78.0,1459448,14874,Paperback,,2006-09-06,About three things I was absolutely positive.\...,1.0,English,501,"Little, Brown and Company",https://i.gr-assets.com/images/S/compressed.ph...,2.1,4,False
5,9780375831003,4.37,1834276,96.0,1372809,14168,Hardcover,First American Edition,2006-03-14,Librarian's note: An alternate cover edition c...,,English,552,Alfred A. Knopf,https://i.gr-assets.com/images/S/compressed.ph...,3.8,5,False
6,9780451526342,3.95,2740713,91.0,1276599,13264,Mass Market Paperback,,1996-04-28,Librarian's note: There is an Alternate Cover ...,,English,141,Signet Classics,https://i.gr-assets.com/images/S/compressed.ph...,4.42,6,False


Changing all fields in df that contain a string that looks like a list literals into actual lists:

In [1288]:
cols = ['genres', 'characters', 'awards', 'ratingsByStars', 'setting']
for col in cols:
    df[col] = df[col].apply(ast.literal_eval)


In [1289]:
df["genres"].head()

0    [Young Adult, Fiction, Dystopia, Fantasy, Scie...
1    [Fantasy, Young Adult, Fiction, Magic, Childre...
4    [Young Adult, Fantasy, Romance, Vampires, Fict...
5    [Historical Fiction, Fiction, Young Adult, His...
6    [Classics, Fiction, Dystopia, Fantasy, Literat...
Name: genres, dtype: object

Creating genre df, stripping whitespace, then filling columns

In [1290]:
def create_genre_df(df):
    genre_df = df[['edition_id', 'genres']].copy()
    genre_df['genres'] = genre_df['genres'].apply(lambda genres: [genre.strip() for genre in genres])
    genre_df = genre_df.explode('genres')
    genre_df.rename(columns={'genres': 'genre'}, inplace=True)
    return genre_df

genre_df = create_genre_df(df)

In [1291]:
genre_df.head(20)

Unnamed: 0,edition_id,genre
0,0,Young Adult
0,0,Fiction
0,0,Dystopia
0,0,Fantasy
0,0,Science Fiction
0,0,Romance
0,0,Adventure
0,0,Teen
0,0,Post Apocalyptic
0,0,Action


Creating character df and stripping whitespace:

In [1292]:
def create_char_df(df):
    char_df = df[['edition_id', 'characters']].copy()
    char_df['characters'] = char_df['characters'].apply(lambda characters: [character.strip() for character in characters])
    char_df = char_df.explode('characters')
    char_df.rename(columns={'characters': 'char_name'}, inplace=True)
    return char_df
    
char_df = create_char_df(df)

In [1293]:
char_df.head(20)

Unnamed: 0,edition_id,char_name
0,0,Katniss Everdeen
0,0,Peeta Mellark
0,0,Cato (Hunger Games)
0,0,Primrose Everdeen
0,0,Gale Hawthorne
0,0,Effie Trinket
0,0,Haymitch Abernathy
0,0,Cinna
0,0,President Coriolanus Snow
0,0,Rue


Creating setting df and stripping whitespace:

In [1294]:
def create_setting_df(df):
    setting_df = df[['edition_id', 'setting']].copy()
    setting_df['setting'] = setting_df['setting'].apply(lambda setting: [setting.strip() for setting in setting])
    setting_df = setting_df.explode('setting')
    return setting_df

setting_df = create_setting_df(df)

In [1295]:
setting_df.head(20)

Unnamed: 0,edition_id,setting
0,0,"District 12, Panem"
0,0,"Capitol, Panem"
0,0,Panem (United States)
1,1,Hogwarts School of Witchcraft and Wizardry (Un...
1,1,"London, England"
4,4,"Forks, Washington (United States)"
4,4,"Phoenix, Arizona (United States)"
4,4,Washington (state) (United States)
5,5,Molching (Germany)
5,5,Germany


Creating star rating df and stripping whitespace: 
In this df, I have sliced off data where we are missing 1 or more fields from ratingsByStars, because I see no way to know which star rating the missing data belongs to.

In [1296]:
def create_star_rating_df(df):
    # Ensure the 'ratingsByStars' column is a list of lists
    if df['ratingsByStars'].apply(type).eq(str).all():
        df['ratingsByStars'] = df['ratingsByStars'].apply(eval)

    # Create a mask where 'ratingsByStars' length is 5
    mask = df['ratingsByStars'].apply(len) == 5

    # Extract the 'ratingsByStars' values where the mask is True
    ratings_data = pd.DataFrame(df.loc[mask, 'ratingsByStars'].tolist(),
                                columns=['five_star', 'four_star', 'three_star', 'two_star', 'one_star'])

    # Include the 'edition_id' column
    star_rating_df = ratings_data.assign(edition_id=df.loc[mask, 'edition_id'].values).reset_index(drop=True)

    # Optionally, strip any white spaces from string columns
    star_rating_df = star_rating_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)

    return star_rating_df

star_rating_df = create_star_rating_df(df)

In [1297]:
star_rating_df.head(10) # option 2  .5 sec

Unnamed: 0,five_star,four_star,three_star,two_star,one_star,edition_id
0,3444695,1921313,745221,171994,93557,0
1,1593642,637516,222366,39573,14526,1
2,1751460,1113682,1008686,542017,548674,4
3,1048230,524674,186297,48864,26211,5
4,986764,958699,545475,165093,84682,6
5,78217,22857,6628,1477,967,8
6,602138,275517,133535,39008,24422,9
7,556142,204347,94819,27950,22473,12
8,483222,413051,279702,107437,59252,13
9,712950,615702,295184,66518,26958,15


In [1298]:
# # Duplicated star ratings:
# duplicated_star_ratings = star_rating_df[star_rating_df.duplicated(keep=False)]
# duplicated_star_ratings


Creating award df:

In [1299]:
def create_award_df(df):
    award_df = df[['edition_id', 'awards']].copy()
    award_df = award_df.explode('awards')
    award_df = award_df.rename(columns={'awards': 'award'})
    return award_df

award_df = create_award_df(df)

In [None]:
award_df.head()

Splitting out year from award field and remove from award field and stripping whitespace::

In [1301]:
def split_year(award_df):
    year_pattern = r'\((\d{4})\)'
    award_df['year'] = award_df['award'].str.extract(year_pattern)
    award_df['award'] = award_df['award'].str.replace(year_pattern, '', regex=True)
    award_df = award_df.applymap(lambda r: r.strip() if type(r) == str else r)
    return award_df

award_df = split_year(award_df)


In [1302]:
award_df.head(10)

Unnamed: 0,edition_id,award,year
0,0,Locus Award Nominee for Best Young Adult Book,2009
0,0,Georgia Peach Book Award,2009
0,0,Buxtehuder Bulle,2009
0,0,Golden Duck Award for Young Adult (Hal Clement...,2009
0,0,Grand Prix de l'Imaginaire Nominee for Roman j...,2010
0,0,Books I Loved Best Yearly (BILBY) Awards for O...,2012
0,0,West Australian Young Readers' Book Award (WAY...,2010
0,0,Red House Children's Book Award for Older Read...,2010
0,0,South Carolina Book Award for Junior and Young...,2011
0,0,Charlotte Award,2010


Creating creator df:

In [1303]:
def create_creator_df(df):
    creator_df = df[['edition_id', 'author']].copy()
    creator_df = creator_df.explode('author')
    creator_df = creator_df.rename(columns={'author': 'creator'})
    return creator_df

creator_df = create_creator_df(df)

In [1304]:
creator_df.head()

Unnamed: 0,edition_id,creator
0,0,Suzanne Collins
1,1,"J.K. Rowling, Mary GrandPré (Illustrator)"
4,4,Stephenie Meyer
5,5,Markus Zusak (Goodreads Author)
6,6,"George Orwell, Russell Baker (Preface), C.M. W..."


Splitting out role information into new column and stripping whitespace:

In [1305]:
def split_role(creator_df):
    role_pattern = r'\((.*?)\)'
    creator_df['role'] = creator_df['creator'].str.extract(role_pattern)
    creator_df['creator'] = creator_df['creator'].str.replace(role_pattern, '', regex=True)
    creator_df = creator_df.applymap(lambda r: r.strip() if type(r) == str else r)
    return creator_df

creator_df = split_role(creator_df)

In [1306]:
creator_df.head(1)

Unnamed: 0,edition_id,creator,role
0,0,Suzanne Collins,


In [1307]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn', 'series_num',
       'edition_id'],
      dtype='object')

(If I were doing this again, I'd go back and create this name map at the start and immediately change the names. But I've used these variables in too many other places.) 

In [1308]:
name_map = {
    'numRatings': 'num_ratings', 
    'likedPercent': 'liked_percent', 
    'bbeScore': 'bbe_score',
    'bbeVotes': 'bbe_votes',
    'bookFormat': 'format',
    'publishDate': 'publish_date',
    'coverImg': 'cover_url'
    }

Creating book_df:

In [1309]:
def create_book_df(df):
    book_df = df[['edition_id', 'title', 'firstPublishDate', 'series', 'series_num']].copy()
    book_df = book_df.rename(columns=name_map)
    return book_df

book_df = create_book_df(df)

In [1310]:
book_df.head()


Unnamed: 0,edition_id,title,firstPublishDate,series,series_num
0,0,The Hunger Games,NaT,The Hunger Games,1.0
1,1,Harry Potter and the Order of the Phoenix,2003-06-21,Harry Potter,5.0
4,4,Twilight,2005-10-05,The Twilight Saga,1.0
5,5,The Book Thief,2005-09-01,,
6,6,Animal Farm,1945-08-17,,


In [1311]:
%%sql

UsageError: %%sql is a cell magic, but the cell body is empty. Did you mean the line magic %sql (single %)?
