Things to get right in this project:
- Folder structure
- Docstrings
- PEP8
- Exception handling

In [708]:
# !python3 -m venv venv

In [709]:
# !pip install numpy
# !pip install pandas
# !pip install sqlalchemy
# !pip install ipython-sql
# !pip install python-dotenv
# !pip install psycopg2



In [710]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv


load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [711]:
%sql postgresql://postgres:password@localhost/books

In [712]:
import regex as re
from pandas.tseries.offsets import *
from datetime import date
import ast

In [713]:
import pandas as pd
# Location of the source CSV. Set BOOKS_CSV_PATH (e.g. in .env) to run the
# notebook on another machine; the original absolute path remains the default.
BOOKS_CSV_PATH = os.getenv(
    'BOOKS_CSV_PATH',
    '/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv',
)
df = pd.read_csv(BOOKS_CSV_PATH)

In [714]:
# df[df['description'].str[:4]== 'ISBN']

Dropping duplicates:

In [715]:
df = df.drop_duplicates()

Get rows where the ISBN is duplicated and is not the '9999999999999' placeholder:

In [716]:
# duplicated_isbns = df[(df['isbn']!= '9999999999999') & (df.duplicated(subset='isbn', keep=False))]
# duplicated_isbns

Dropping rows with all nulls:

In [717]:
df = df.dropna(how='all')

Flagging rows with duplicate isbns that are not 9999999999999 placeholder values:

In [718]:
def flag_duplicate_isbns(df):
    """
    Flag rows whose ISBN occurs more than once, ignoring missing ISBNs.

    Adds (or overwrites) a boolean column 'is_duplicate_isbn' on the given
    DataFrame. Every row of a group sharing the same non-null ISBN is set to
    True; rows with a null ISBN (None/NaN) and rows with a unique ISBN are
    False. The DataFrame is modified in place and also returned.

    Note: a placeholder such as '9999999999999' is an ordinary string here, so
    rows sharing it are flagged like any other duplicate.

    :param df: DataFrame with a column named 'isbn'
    :return: The same DataFrame with 'is_duplicate_isbn' column added
    :raises ValueError: If 'isbn' column is missing in the DataFrame
    """
    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")

    # `df['isbn'] != None` evaluates to True for every element in pandas, so it
    # never excluded nulls; notna() is the element-wise null test.
    has_isbn = df['isbn'].notna()
    # keep=False marks every member of a duplicated group, not only the repeats.
    repeated = df.duplicated(subset='isbn', keep=False)
    df['is_duplicate_isbn'] = (has_isbn & repeated).to_numpy()
    return df

# Usage example:
try:
    df = flag_duplicate_isbns(df)
except ValueError as e:
    print(e)

In [719]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn'],
      dtype='object')

In [720]:
def rm_duplicates(df):
    """
    Drop every row whose ISBN is shared with another row, keeping null ISBNs.

    Rows with a null ISBN (None/NaN) are always preserved. For all other rows,
    if an ISBN appears more than once, *all* rows carrying it are removed
    (no representative is kept). The input DataFrame is not modified.

    Note: a placeholder such as '9999999999999' is an ordinary string here, so
    rows sharing it are removed like any other duplicate.

    :param df: Input DataFrame containing a column 'isbn'
    :return: Filtered DataFrame with null-ISBN rows and unique-ISBN rows
    :raises ValueError: If 'isbn' column is missing
    """
    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")

    # `df['isbn'] == None` is False for every element in pandas, which made the
    # "preserve nulls" branch dead; isna() is the element-wise null test.
    missing_isbn = df['isbn'].isna()
    # keep=False marks every member of a duplicated group.
    unique_isbn = ~df.duplicated(subset='isbn', keep=False)
    return df[missing_isbn | unique_isbn]

#Usage:
df = rm_duplicates(df)

Stripping leading and trailing whitespace from every string cell, then replacing embedded newlines (\n) with a space:

In [721]:
def _tidy_text(value):
    """Strip surrounding whitespace, then turn embedded newlines into spaces (strings only)."""
    if isinstance(value, str):
        return value.strip().replace('\n', ' ')
    return value

# One pass per column with Series.map instead of two DataFrame.applymap passes
# (applymap is deprecated since pandas 2.1). Non-string cells are left as they are.
df = df.apply(lambda column: column.map(_tidy_text))

This will get all the rows whose ISBN starts with a letter:

In [722]:
# df[df['isbn'].str[0].str.isalpha()]


In [723]:
# df[df['description'].str[:4]== 'ISBN']

In [724]:
df.head(20)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09,False
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38,False
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1,False
5,19063.The_Book_Thief,The Book Thief,,Markus Zusak (Goodreads Author),4.37,Librarian's note: An alternate cover edition c...,English,9780375831003,"['Historical Fiction', 'Fiction', 'Young Adult...","['Liesel Meminger', 'Hans Hubermann', 'Rudy St...",...,['National Jewish Book Award for Children’s an...,1834276,"['1048230', '524674', '186297', '48864', '26211']",96.0,"['Molching (Germany)', 'Germany']",https://i.gr-assets.com/images/S/compressed.ph...,1372809,14168,3.8,False
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['Prometheus Hall of Fame Award (2011)', 'Retr...",2740713,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False
8,30.J_R_R_Tolkien_4_Book_Boxed_Set,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,The Lord of the Rings #0-3,J.R.R. Tolkien,4.6,"This four-volume, boxed set contains J.R.R. To...",English,9780345538376,"['Fantasy', 'Fiction', 'Classics', 'Adventure'...","['Frodo Baggins', 'Gandalf', 'Bilbo Baggins', ...",...,[],110146,"['78217', '22857', '6628', '1477', '967']",98.0,['Middle-earth'],https://i.gr-assets.com/images/S/compressed.ph...,1159802,12111,21.15,False
9,18405.Gone_with_the_Wind,Gone with the Wind,,Margaret Mitchell,4.3,"Scarlett O'Hara, the beautiful, spoiled daught...",English,9780446675536,"['Classics', 'Historical Fiction', 'Fiction', ...","[""Scarlett O'Hara"", 'Rhett Butler', 'Ashley Wi...",...,"['Pulitzer Prize for Novel (1937)', 'National ...",1074620,"['602138', '275517', '133535', '39008', '24422']",94.0,"['Atlanta, Georgia (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,1087732,11211,5.58,False
12,370493.The_Giving_Tree,The Giving Tree,,Shel Silverstein,4.37,"""Once there was a tree...and she loved a littl...",English,9780060256654,"['Childrens', 'Picture Books', 'Classics', 'Fi...",[],...,[],905731,"['556142', '204347', '94819', '27950', '22473']",94.0,[],https://i.gr-assets.com/images/S/compressed.ph...,1021534,10594,4.87,False
13,6185.Wuthering_Heights,Wuthering Heights,,"Emily Brontë, Richard J. Dunn (Editor), David ...",3.86,You can find the redesigned cover of this edit...,English,9780393978896,"['Classics', 'Fiction', 'Romance', 'Gothic', '...","['Heathcliff', 'Catherine Earnshaw', 'Edgar Li...",...,[],1342664,"['483222', '413051', '279702', '107437', '59252']",88.0,"['Yorkshire Dales, England']",https://i.gr-assets.com/images/S/compressed.ph...,981502,10275,2.73,False
15,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,,Arthur Golden,4.12,"A literary sensation and runaway bestseller, t...",English,9781400096893,"['Fiction', 'Historical Fiction', 'Romance', '...","['Sayuri Nitta', 'Mr. Bekku', 'Ichiro Tanaka',...",...,[],1717312,"['712950', '615702', '295184', '66518', '26958']",95.0,"['Kyoto (Japan)', 'Japan']",https://i.gr-assets.com/images/S/compressed.ph...,862748,8993,2.76,False


Filling the ISBN field with an ISBN found in the description field, if the ISBN field is the 9999999999999 placeholder or null:

Eventually rewrite this with error handling that will catch if wrong data type (non-str) is passed to the function

In [725]:
def extract_isbn(df, placeholder='9999999999999'):
    """
    Fill missing ISBNs with an identifier found in the description text.

    Rows whose 'isbn' is null or equal to ``placeholder`` are searched with a
    regex, and the first identifier found in 'description' replaces the 'isbn'
    value. Rows in that subset whose description holds no identifier end up
    with a null 'isbn'. All other rows are left untouched. The DataFrame is
    modified in place and also returned.

    Recognised forms (alternatives are tried in this order at each position):
    - 13 consecutive digits
    - 10 consecutive digits
    - "B" + 4 digits + 3 uppercase letters + 1 digit + 1 uppercase letter,
      e.g. B1234XYZ7A (kept verbatim)
    - "978-" + one digit + 9 to 13 digits/dashes, e.g. 978-3-16-148410-0
      (dashes are removed)

    :param df: A DataFrame containing 'isbn' and 'description' columns
    :param placeholder: ISBN value that stands for "unknown"
    :return: The modified DataFrame
    :raises ValueError: If 'isbn' or 'description' columns are missing
    """

    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")
    if 'description' not in df.columns:
        raise ValueError("'description' column is missing in the DataFrame.")

    # Group 1 = whole match (for the digit forms it may include one adjacent
    # non-digit character), group 2 = the 13 digits, group 3 = the 10 digits.
    isbn_pattern = (r'((?:\D)?(\d{13})(?:\D)?|(?:\D)?(\d{10})(?:\D)?|'
                    r'B\d{4}[A-Z]{3}\d{1}[A-Z]|978-\d[-\d]{9,13})')

    mask = (df['isbn'] == placeholder) | df['isbn'].isna()
    if not mask.any():
        return df

    # astype(str) turns a missing description into the text 'nan', which
    # contains no identifier and therefore simply yields no match.
    descriptions = df.loc[mask, 'description'].astype(str)
    matches = descriptions.str.extract(isbn_pattern)

    # Prefer the pure-digit inner groups; only the "B..." and dashed forms fall
    # back to the whole match. Trimming the first/last character of the whole
    # match instead would cut the leading "B" and trailing letter off the
    # "B..." form, and negating a mask that contains NaN (no match) raises.
    whole_match = matches[0].str.replace('-', '', regex=False)
    extracted_isbns = matches[1].fillna(matches[2]).fillna(whole_match)

    df.loc[mask, 'isbn'] = extracted_isbns

    return df

df = extract_isbn(df)

In [726]:
filtered_df = df[(df['isbn'] != '9999999999999') & ~df['isbn'].isnull()]
filtered_df.shape            #first func

(48064, 26)

In [727]:
filtered_df = df[(df['isbn'] != '9999999999999') & ~df['isbn'].isnull()]
filtered_df.shape            #opt  func

(48064, 26)

In [728]:
df.head(20) # optimized

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09,False
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38,False
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1,False
5,19063.The_Book_Thief,The Book Thief,,Markus Zusak (Goodreads Author),4.37,Librarian's note: An alternate cover edition c...,English,9780375831003,"['Historical Fiction', 'Fiction', 'Young Adult...","['Liesel Meminger', 'Hans Hubermann', 'Rudy St...",...,['National Jewish Book Award for Children’s an...,1834276,"['1048230', '524674', '186297', '48864', '26211']",96.0,"['Molching (Germany)', 'Germany']",https://i.gr-assets.com/images/S/compressed.ph...,1372809,14168,3.8,False
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['Prometheus Hall of Fame Award (2011)', 'Retr...",2740713,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False
8,30.J_R_R_Tolkien_4_Book_Boxed_Set,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,The Lord of the Rings #0-3,J.R.R. Tolkien,4.6,"This four-volume, boxed set contains J.R.R. To...",English,9780345538376,"['Fantasy', 'Fiction', 'Classics', 'Adventure'...","['Frodo Baggins', 'Gandalf', 'Bilbo Baggins', ...",...,[],110146,"['78217', '22857', '6628', '1477', '967']",98.0,['Middle-earth'],https://i.gr-assets.com/images/S/compressed.ph...,1159802,12111,21.15,False
9,18405.Gone_with_the_Wind,Gone with the Wind,,Margaret Mitchell,4.3,"Scarlett O'Hara, the beautiful, spoiled daught...",English,9780446675536,"['Classics', 'Historical Fiction', 'Fiction', ...","[""Scarlett O'Hara"", 'Rhett Butler', 'Ashley Wi...",...,"['Pulitzer Prize for Novel (1937)', 'National ...",1074620,"['602138', '275517', '133535', '39008', '24422']",94.0,"['Atlanta, Georgia (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,1087732,11211,5.58,False
12,370493.The_Giving_Tree,The Giving Tree,,Shel Silverstein,4.37,"""Once there was a tree...and she loved a littl...",English,9780060256654,"['Childrens', 'Picture Books', 'Classics', 'Fi...",[],...,[],905731,"['556142', '204347', '94819', '27950', '22473']",94.0,[],https://i.gr-assets.com/images/S/compressed.ph...,1021534,10594,4.87,False
13,6185.Wuthering_Heights,Wuthering Heights,,"Emily Brontë, Richard J. Dunn (Editor), David ...",3.86,You can find the redesigned cover of this edit...,English,9780393978896,"['Classics', 'Fiction', 'Romance', 'Gothic', '...","['Heathcliff', 'Catherine Earnshaw', 'Edgar Li...",...,[],1342664,"['483222', '413051', '279702', '107437', '59252']",88.0,"['Yorkshire Dales, England']",https://i.gr-assets.com/images/S/compressed.ph...,981502,10275,2.73,False
15,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,,Arthur Golden,4.12,"A literary sensation and runaway bestseller, t...",English,9781400096893,"['Fiction', 'Historical Fiction', 'Romance', '...","['Sayuri Nitta', 'Mr. Bekku', 'Ichiro Tanaka',...",...,[],1717312,"['712950', '615702', '295184', '66518', '26958']",95.0,"['Kyoto (Japan)', 'Japan']",https://i.gr-assets.com/images/S/compressed.ph...,862748,8993,2.76,False


Splitting the number that follows '#' in the series field out into a separate series_num column. The value is added to the new series_num column; TODO: the number is not yet removed from the series column by the code below.
We're not making series_num an int, because some cases have a range of numbers, e.g. 1-3.

In [729]:
# adding the series_num column: 
df['series_num'] = None

In [730]:
def extract_series_num(df):
    """
    Copy the part of 'series' that follows '#' into a 'series_num' column.

    Everything from the first '#' to the end of the series text is captured,
    and every '#' character is then removed from the captured text (so
    "Name #0-3" gives "0-3"). Rows without a '#' get a null 'series_num'.
    The 'series' column itself is not changed. The DataFrame is modified in
    place and also returned.

    Parameters:
    df (pd.DataFrame): DataFrame containing the 'series' column to extract from.

    Returns:
    pd.DataFrame: The same DataFrame with the 'series_num' column set.

    Raises:
    ValueError: If 'series' column is not found in the DataFrame, or if the
        extraction itself fails (the original error is chained).
    """
    if 'series' not in df.columns:
        raise ValueError("'series' column missing from the DataFrame.")

    try:
        # expand=False returns the single capture group as a Series.
        tail_from_hash = df['series'].astype(str).str.extract(r'(#.*)', expand=False)
        df['series_num'] = tail_from_hash.str.replace('#', '')
    except Exception as e:
        raise ValueError("An error occurred while extracting series number.") from e

    return df

df = extract_series_num(df)



In [731]:
df['series_num'].head(20)

0       1
1       5
4       1
5     NaN
6     NaN
8     0-3
9     NaN
12    NaN
13    NaN
15    NaN
17    1-2
18    NaN
20    NaN
21      1
22    NaN
23    NaN
24    NaN
25    NaN
28      1
29      1
Name: series_num, dtype: object

In [732]:
print(df.bookId.shape)
df.bookId.unique().shape

(48064,)


(48064,)

In [733]:
# df['bookFormat'].value_counts()

Validation for dates - checking for cases where publish date is earlier than first publish date. 

First you need to figure out what the actual format is. Then remove all non-dates. Then set the data type. Then check for cases where publish date is earlier than first publish date.

In [734]:

# Parse both date columns as mm/dd/yy; values that do not fit that format
# become NaT, and the time-of-day component is reset to midnight.
for date_col in ['publishDate', 'firstPublishDate']:
    parsed_dates = pd.to_datetime(df[date_col], format='%m/%d/%y', errors='coerce')
    df[date_col] = parsed_dates.dt.normalize()

In [735]:
def fix_dates(df):
    """
    Shift implausible publication dates back by a century, in place.

    Three corrections are applied in this order:
    1. 'firstPublishDate' values later than the current time lose 100 years.
    2. 'publishDate' values later than the current time lose 100 years.
    3. 'firstPublishDate' values that are (still) later than the row's
       'publishDate' lose 100 years.

    :param df: DataFrame containing the 'firstPublishDate' and 'publishDate' columns.
    :type df: pd.DataFrame
    :return: None
    :raises ValueError: If 'firstPublishDate' or 'publishDate' columns are not in DataFrame.
    """
    date_cols = ('firstPublishDate', 'publishDate')
    if any(col not in df.columns for col in date_cols):
        raise ValueError("'firstPublishDate' or 'publishDate' columns missing from the DataFrame.")

    century = pd.DateOffset(years=100)

    # Steps 1 and 2: dates in the future are pulled back by 100 years.
    for col in date_cols:
        in_future = df[col] > pd.Timestamp.today()
        df.loc[in_future, col] -= century

    # Step 3: a first publication cannot come after the edition's own date.
    first_after_publish = df['firstPublishDate'] > df['publishDate']
    df.loc[first_after_publish, 'firstPublishDate'] -= century

fix_dates(df)


In [736]:
df[['publishDate', 'firstPublishDate']].head(10)

Unnamed: 0,publishDate,firstPublishDate
0,2008-09-14,NaT
1,2004-09-28,2003-06-21
4,2006-09-06,2005-10-05
5,2006-03-14,2005-09-01
6,1996-04-28,1945-08-17
8,2012-09-25,1955-10-20
9,1999-04-01,1936-06-30
12,1964-10-07,1864-10-28
13,2002-10-28,1947-12-28
15,2005-11-22,1997-09-23


In [737]:
# df['publishDate'].dtypes

In [738]:
# df.shape

In [739]:
# rows_with_nulls = df[df.isnull().sum(axis=1) >= 13]
# print(rows_with_nulls)

Strip whitespace one more time:

In [740]:

# Strip surrounding whitespace from every string cell; other values pass through.
# Column-wise Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
df = df.apply(lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell))

Adding new edition_id column to df, bc we will drop bookId column:

In [741]:
df['edition_id'] = df.index

In [742]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn', 'series_num',
       'edition_id'],
      dtype='object')

In [743]:
df[df['title'] == 'Animal Farm']

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn,series_num,edition_id
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False,,6
47085,253305.Animal_Farm,Animal Farm,,"Ian Wooldridge (Adapted by), George Orwell",4.07,George Orwell’s 1945 satire on the perils of S...,English,9781854597892,"['Fiction', 'Classics', 'Dystopia', 'Classic L...",[],...,"['207', '167', '75', '23', '17']",92.0,[],https://i.gr-assets.com/images/S/compressed.ph...,58,1,2.49,False,,47085


In [744]:
duplicate_rows = df[df.duplicated('title', keep=False)]
duplicate_rows

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn,series_num,edition_id
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.60,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1,False,1,4
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",...,"['986764', '958699', '545475', '165093', '84682']",91.0,"['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,1276599,13264,4.42,False,,6
18,10210.Jane_Eyre,Jane Eyre,,"Charlotte Brontë, Michael Mason (Editor), Barn...",4.13,"Orphaned as a child, Jane has felt an outcast ...",English,9780142437209,"['Classics', 'Fiction', 'Romance', 'Historical...","['Jane Eyre', 'Bertha Mason', 'Grace Poole', '...",...,"['746070', '501478', '256972', '76970', '43192']",93.0,"['Thornfield', 'England']",https://i.gr-assets.com/images/S/compressed.ph...,826759,8740,5.46,False,,18
24,18144590-the-alchemist,The Alchemist,,"Paulo Coelho (Goodreads Author), Alan R. Clark...",3.88,Paulo Coelho's enchanting novel has inspired a...,English,9780062315007,"['Fiction', 'Classics', 'Fantasy', 'Philosophy...","['Santiago', 'Alchemist', 'Melchizedek']",...,"['805221', '613564', '414410', '172160', '1023...",87.0,"['Egypt', 'Sahara Desert', 'Tangier (Morocco)'...",https://i.gr-assets.com/images/S/compressed.ph...,765587,8008,13.22,False,,24
28,256683.City_of_Bones,City of Bones,The Mortal Instruments #1,Cassandra Clare (Goodreads Author),4.10,When fifteen-year-old Clary Fray heads out to ...,English,9781416914280,"['Fantasy', 'Young Adult', 'Paranormal', 'Roma...","['Alexander ""Alec"" Lightwood', 'Jonathan ""Jace...",...,"['745774', '465337', '271968', '87611', '46775']",92.0,"['New York City, New York (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,721821,7601,6.29,False,1,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52471,12464361-elemental,Elemental,Soul Guardians #2,Kim Richardson (Goodreads Author),4.07,When seventeen-year-old Kara Nightingale is su...,English,B005IGOSCC,"['Fantasy', 'Young Adult', 'Angels', 'Romance'...",[],...,"['801', '636', '391', '84', '35']",94.0,[],https://i.gr-assets.com/images/S/compressed.ph...,0,1,,False,2,52471
52472,12395883-unbelievable,Unbelievable,Port Fare #2,Sherry Gammon (Goodreads Author),4.16,Lilah Lopez Dreser's in town to take care of u...,English,9781477594247,"['Romance', 'Young Adult', 'Contemporary', 'Co...",[],...,"['442', '384', '142', '48', '12']",94.0,"['Port Fare, New York (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,0,1,19.18,False,2,52472
52473,11492014-fractured,Fractured,Fateful #2,Cheri Schmidt (Goodreads Author),4.00,The Fateful Trilogy continues with Fractured. ...,English,2940012616562,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",[],...,"['311', '310', '197', '42', '11']",94.0,[],https://i.gr-assets.com/images/S/compressed.ph...,0,1,,False,2,52473
52475,10815662-marked,Marked,Soul Guardians #1,Kim Richardson (Goodreads Author),3.70,--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,English,9781461017097,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",[],...,"['2109', '1868', '1660', '647', '390']",84.0,[],https://i.gr-assets.com/images/S/compressed.ph...,0,1,7.37,False,1,52475


Creating edition df:

In [745]:
edition_df = df[['title', 'isbn', 'rating', 'numRatings', 'likedPercent', 'bbeScore', 'bbeVotes', 'bookFormat', 'edition', 'publishDate', 'firstPublishDate', 'description', 'series', 'series_num', 'language', 'pages', 'publisher', 'coverImg', 'price', 'edition_id', 'is_duplicate_isbn']].copy()

In [746]:
# edition_df.head()

Converting every field in df whose value is a string that looks like a list literal into an actual list:

In [747]:
def _parse_list_literal(value):
    """Turn a string such as "['a', 'b']" into a list; leave non-strings unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value

cols = ['genres', 'characters', 'awards', 'ratingsByStars', 'setting']
for col in cols:
    # Only strings are parsed, so re-running this cell on already-converted
    # lists is a no-op instead of an error from ast.literal_eval.
    df[col] = df[col].apply(_parse_list_literal)


In [748]:
df["genres"].head()

0    [Young Adult, Fiction, Dystopia, Fantasy, Scie...
1    [Fantasy, Young Adult, Fiction, Magic, Childre...
4    [Young Adult, Fantasy, Romance, Vampires, Fict...
5    [Historical Fiction, Fiction, Young Adult, His...
6    [Classics, Fiction, Dystopia, Fantasy, Literat...
Name: genres, dtype: object

Creating genre df, stripping whitespace, then filling columns

In [749]:
def create_genre_df(df):
    """
    Build a long table with one row per (edition, genre) pair.

    Each genre name is stripped of surrounding whitespace. The result keeps
    the index labels of the source rows (repeated once per genre); an edition
    with an empty genre list yields a single row with a null genre. The input
    DataFrame is not modified.

    :param df: DataFrame with an 'edition_id' column and a 'genres' column of lists of strings
    :return: DataFrame with columns 'edition_id' and 'genre'
    """
    stripped_genres = df['genres'].map(lambda names: [name.strip() for name in names])
    return (
        df[['edition_id']]
        .assign(genre=stripped_genres)
        .explode('genre')
    )

genre_df = create_genre_df(df)

In [750]:
genre_df.head(20)

Unnamed: 0,edition_id,genre
0,0,Young Adult
0,0,Fiction
0,0,Dystopia
0,0,Fantasy
0,0,Science Fiction
0,0,Romance
0,0,Adventure
0,0,Teen
0,0,Post Apocalyptic
0,0,Action


Creating character df and stripping whitespace:

In [751]:
def create_char_df(df):
    """
    Build a long table with one row per (edition, character) pair.

    Each character name is stripped of surrounding whitespace. The result
    keeps the index labels of the source rows (repeated once per character);
    an edition with an empty character list yields a single row with a null
    name. The input DataFrame is not modified.

    :param df: DataFrame with an 'edition_id' column and a 'characters' column of lists of strings
    :return: DataFrame with columns 'edition_id' and 'char_name'
    """
    def _strip_each(names):
        return [name.strip() for name in names]

    char_df = df[['edition_id', 'characters']].rename(columns={'characters': 'char_name'})
    char_df['char_name'] = char_df['char_name'].map(_strip_each)
    return char_df.explode('char_name')
    
char_df = create_char_df(df)

In [752]:
char_df.head(20)

Unnamed: 0,edition_id,char_name
0,0,Katniss Everdeen
0,0,Peeta Mellark
0,0,Cato (Hunger Games)
0,0,Primrose Everdeen
0,0,Gale Hawthorne
0,0,Effie Trinket
0,0,Haymitch Abernathy
0,0,Cinna
0,0,President Coriolanus Snow
0,0,Rue


Creating setting df and stripping whitespace:

In [None]:
def create_setting_df(df):
    """
    Build a long table with one row per (edition, setting) pair.

    Editions with an empty setting list contribute no rows. Setting names are
    stripped of surrounding whitespace and the result has a fresh 0..n-1
    index. When no edition has a setting, an empty DataFrame with the two
    columns is returned. The input DataFrame is not modified.

    :param df: DataFrame with an 'edition_id' column and a 'setting' column of lists of strings
    :return: DataFrame with columns 'edition_id' and 'setting'
    """
    # Drop empty lists first so explode() does not emit a null placeholder row.
    has_setting = df['setting'].map(len) > 0
    setting_df = (
        df.loc[has_setting, ['edition_id', 'setting']]
        .explode('setting')
        .reset_index(drop=True)
    )
    setting_df['setting'] = setting_df['setting'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    return setting_df

setting_df = create_setting_df(df)

In [None]:
setting_df.head(20)

Creating star rating df and stripping whitespace: 
In this df, I have sliced off data where we are missing 1 or more fields from ratingsByStars, because I see no way to know which star rating the missing data belongs to.

In [None]:
def create_star_rating_df(df):
    """
    Spread the 'ratingsByStars' list into one column per star level.

    Only editions whose list has exactly five entries are kept; the entries
    are assigned, in list order, to 'five_star' ... 'one_star'. String values
    are stripped of surrounding whitespace and otherwise kept as they are (no
    numeric conversion). The result has a fresh 0..n-1 index; when no edition
    qualifies, an empty DataFrame with the six columns is returned. The input
    DataFrame is not modified.

    :param df: DataFrame with 'edition_id' and 'ratingsByStars' (list) columns
    :return: DataFrame with columns 'five_star', 'four_star', 'three_star',
             'two_star', 'one_star', 'edition_id'
    """
    star_cols = ['five_star', 'four_star', 'three_star', 'two_star', 'one_star']

    # Incomplete lists cannot be mapped to star levels, so those rows are skipped.
    is_complete = df['ratingsByStars'].map(len) == 5
    complete = df.loc[is_complete, ['edition_id', 'ratingsByStars']]

    star_rating_df = pd.DataFrame(complete['ratingsByStars'].tolist(), columns=star_cols)
    for col in star_cols:
        star_rating_df[col] = star_rating_df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    # Positional assignment: the new frame has its own 0..n-1 index.
    star_rating_df['edition_id'] = complete['edition_id'].to_numpy()
    return star_rating_df

star_rating_df = create_star_rating_df(df)

In [None]:
star_rating_df.head(10)

In [None]:
# # Duplicated star ratings:
# duplicated_star_ratings = star_rating_df[star_rating_df.duplicated(keep=False)]
# duplicated_star_ratings


Creating award df:

In [None]:
def create_award_df(df):
    """
    Build a long table with one row per (award, edition) pair.

    Editions with an empty awards list contribute no rows. Award text is kept
    exactly as found. The result has a fresh 0..n-1 index; when no edition has
    an award, an empty DataFrame with the two columns is returned. The input
    DataFrame is not modified.

    :param df: DataFrame with an 'edition_id' column and an 'awards' column of lists of strings
    :return: DataFrame with columns 'award' and 'edition_id'
    """
    # Drop empty lists first so explode() does not emit a null placeholder row.
    has_award = df['awards'].map(len) > 0
    return (
        df.loc[has_award, ['awards', 'edition_id']]
        .explode('awards')
        .rename(columns={'awards': 'award'})
        .reset_index(drop=True)
    )

award_df = create_award_df(df)

Splitting the year out of the award field into its own column, removing it from the award field, and stripping whitespace:

In [None]:
def split_year(award_df):
    """
    Move a "(YYYY)" year out of the award text into a separate 'year' column.

    The first four-digit number in parentheses becomes 'year' (as a string;
    null when absent). Every "(YYYY)" occurrence is removed from 'award', and
    all string cells are stripped of surrounding whitespace. Works on a copy,
    so the DataFrame passed in is left unchanged.

    :param award_df: DataFrame with an 'award' column of strings
    :return: New DataFrame with the cleaned 'award' column and an added 'year' column
    """
    year_pattern = r'\((\d{4})\)'

    result = award_df.copy()
    # expand=False returns the single capture group as a Series.
    result['year'] = result['award'].str.extract(year_pattern, expand=False)
    result['award'] = result['award'].str.replace(year_pattern, '', regex=True)

    # Column-wise Series.map instead of DataFrame.applymap (deprecated since pandas 2.1).
    for col in result.columns:
        result[col] = result[col].map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    return result

award_df = split_year(award_df)


In [None]:
award_df.head(10)

Creating creator df:

In [None]:
def create_creator_df(df):
    """
    Build a long table with one row per (creator, edition) pair.

    The 'author' text is split on ", " and each piece becomes one row, with
    any role suffix such as "(Illustrator)" still attached. Editions with a
    missing author contribute no rows. The result has a fresh 0..n-1 index;
    when there is nothing to split, an empty DataFrame with the two columns
    is returned. The input DataFrame is not modified.

    :param df: DataFrame with an 'edition_id' column and an 'author' column of strings
    :return: DataFrame with columns 'creator' and 'edition_id'
    """
    # Missing authors cannot be split; skip them instead of failing on .split().
    credited = df.loc[df['author'].notna(), ['author', 'edition_id']]
    name_lists = credited['author'].map(lambda names: names.split(', '))
    return (
        credited.assign(author=name_lists)
        .explode('author')
        .rename(columns={'author': 'creator'})
        .reset_index(drop=True)
    )

creator_df = create_creator_df(df)

Splitting out role information into new column and stripping whitespace:

In [None]:
def split_role(creator_df):
    """
    Move a parenthesised role out of the creator text into a 'role' column.

    The content of the first "(...)" group becomes 'role' (null when absent).
    Every "(...)" group is removed from 'creator', and all string cells are
    stripped of surrounding whitespace. Works on a copy, so the DataFrame
    passed in is left unchanged.

    :param creator_df: DataFrame with a 'creator' column of strings
    :return: New DataFrame with the cleaned 'creator' column and an added 'role' column
    """
    role_pattern = r'\((.*?)\)'

    result = creator_df.copy()
    # expand=False returns the single capture group as a Series.
    result['role'] = result['creator'].str.extract(role_pattern, expand=False)
    result['creator'] = result['creator'].str.replace(role_pattern, '', regex=True)

    # Column-wise Series.map instead of DataFrame.applymap (deprecated since pandas 2.1).
    for col in result.columns:
        result[col] = result[col].map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    return result

creator_df = split_role(creator_df)

In [None]:
creator_df.head(20)

In [None]:
%%sql