# Data Preparation

In [600]:
import os
import re
from datetime import date
import ast

import numpy as np
from scipy import stats
import pandas as pd
from pandas.tseries.offsets import *


In [601]:
df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv')

Dropping duplicates:

In [602]:
print(df.shape)
print(df[df.duplicated()].shape)

(52478, 25)
(50, 25)


In [603]:
df = df.drop_duplicates()

In [604]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52428 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            52428 non-null  object 
 1   title             52428 non-null  object 
 2   series            23445 non-null  object 
 3   author            52428 non-null  object 
 4   rating            52428 non-null  float64
 5   description       51092 non-null  object 
 6   language          48627 non-null  object 
 7   isbn              52428 non-null  object 
 8   genres            52428 non-null  object 
 9   characters        52428 non-null  object 
 10  bookFormat        50955 non-null  object 
 11  edition           4949 non-null   object 
 12  pages             50085 non-null  object 
 13  publisher         48736 non-null  object 
 14  publishDate       51549 non-null  object 
 15  firstPublishDate  31125 non-null  object 
 16  awards            52428 non-null  object 
 17

In [605]:
df.describe(include='all')

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
count,52428,52428,23445,52428,52428.0,51092,48627,52428.0,52428,52428,...,31125,52428,52428.0,52428,51807.0,52428,51823,52428.0,52428.0,38084.0
unique,52424,49927,22802,28227,,50888,81,48072.0,44154,12448,...,8024,9215,,49908,,4651,51819,,,3776.0
top,24903989-widz-ci,Broken,Star Wars Legends,Nora Roberts (Goodreads Author),,"هذه هي طبعة ""دار الفكر - بيروت"" وهي آخر طبعة ع...",English,9999999999999.0,[],[],...,01/01/12,[],,[],,[],https://i.gr-assets.com/images/S/compressed.ph...,,,5.2
freq,2,14,15,86,,37,42635,4350.0,4620,38668,...,224,41823,,1422,,40858,2,,,625.0
mean,,,,,4.021865,,,,,,...,,,17892.6,,92.231938,,,1985.832,22.549535,
std,,,,,0.367116,,,,,,...,,,103993.2,,5.988683,,,35169.85,369.333934,
min,,,,,0.0,,,,,,...,,,0.0,,0.0,,,0.0,-4.0,
25%,,,,,3.82,,,,,,...,,,341.0,,90.0,,,84.0,1.0,
50%,,,,,4.03,,,,,,...,,,2309.5,,94.0,,,97.0,1.0,
75%,,,,,4.23,,,,,,...,,,9389.0,,96.0,,,187.0,2.0,


Dropping rows with all nulls:

In [606]:
df = df.dropna(how='all')

Flagging rows with duplicate isbns that are not 9999999999999 placeholder values:

In [607]:
def flag_duplicate_isbns(df):
    """
    Flag duplicate ISBNs in the provided DataFrame, excluding None.

    The function looks for duplicate ISBNs in the provided DataFrame and flags
    them by adding a new column 'is_duplicate_isbn'. This column is set to True
    for rows with duplicate ISBNs (excluding None) and False otherwise.
    The function modifies the input DataFrame in place.

    Parameters:
    df (pd.DataFrame): The DataFrame containing an 'isbn' column.

    Returns:
    pd.DataFrame: Modified DataFrame with 'is_duplicate_isbn' column added.

    Raises:
    Prints a message: If 'isbn' column is missing in the DataFrame.
    """
    if 'isbn' not in df.columns:
        print("'isbn' column is missing in the DataFrame.")
    else:
        df['is_duplicate_isbn'] = False
        mask = (df['isbn'].notnull()) & df.duplicated(subset='isbn', keep=False)
        df.loc[mask, 'is_duplicate_isbn'] = True

    return df

df = flag_duplicate_isbns(df)


In [608]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn'],
      dtype='object')

Removing all new lines (\n) and leading and trailing whitespace from all rows

In [609]:
df = df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
df = df.applymap(lambda r: r.replace('\n', ' ') if isinstance(r, str) else r)

In [610]:
df.head(5)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,is_duplicate_isbn
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09,False
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38,False
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",...,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,2269402,23328,,True
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"['Classics', 'Fiction', 'Romance', 'Historical...","['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",...,[],2998241,"['1617567', '816659', '373311', '113934', '767...",94.0,"['United Kingdom', 'Derbyshire, England (Unite...",https://i.gr-assets.com/images/S/compressed.ph...,1983116,20452,,True
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1,False


Filling ISBN field using ISBN from description field, if the ISBN field is 9999999999 or null:

Eventually rewrite this with error handling that will catch if wrong data type (non-str) is passed to the function

In [611]:
def extract_isbn(df):
    """
    Extract and move ISBN from the description to the ISBN field if the ISBN is '9999999999999' or null.

    The function identifies valid ISBNs within the 'description' column using a regex pattern. If the ISBN
    in the 'isbn' column is '9999999999999' or null, the extracted ISBN from the 'description' column is 
    used to update the 'isbn' column. Multiple patterns of ISBNs are identified:

    Parameters:
    df (pd.DataFrame): A DataFrame containing 'isbn' and 'description' columns.

    Returns:
    pd.DataFrame: The modified DataFrame with ISBNs extracted from the 'description' and updated in the 'isbn' column.

    Raises:
    Prints a message: If either 'isbn' or 'description' columns are missing in the DataFrame.
    """
    
    if 'isbn' not in df.columns:
        print("'isbn' column is missing in the DataFrame.")
    if 'description' not in df.columns:
        print("'description' column is missing in the DataFrame.")
    else:
        # First regex pattern matches 13 digits optionally preceded and followed by a non-digit character
        # Second regex pattern matches 10 digits optionally preceded and followed by a non-digit character
        # Third regex pattern matches "B" and a specific pattern of 4 digits, 3 uppercase letters, 1 digit, and 1 uppercase letter: B1234XYZ7A
        # Fourth regex matches the pattern "978-", followed by one digit, and then a sequence of either digits or dashes that is at least 9 and at most 13 characters long: 978-3-16-148410-0

        isbn_pattern = (r'((?:\D)?(\d{13})(?:\D)?|(?:\D)?(\d{10})(?:\D)?|'
                        r'B\d{4}[A-Z]{3}\d{1}[A-Z]|978-\d[-\d]{9,13})')

        mask = (df['isbn'] == '9999999999999') | pd.isnull(df['isbn'])
        descriptions = df.loc[mask, 'description'].astype(str)

        extracted_isbns = descriptions.str.extract(isbn_pattern)[0].str.replace('-', '')  # Extract ISBNs and remove dashes
        mask_first_char = extracted_isbns.str[0].str.isdigit() == False
        extracted_isbns = extracted_isbns.str[1:].where(mask_first_char, extracted_isbns)

        mask_last_char = extracted_isbns.str[-1].str.isdigit() == False # Remove non-digit last character if exists
        extracted_isbns = extracted_isbns.str[:-1].where(mask_last_char, extracted_isbns)

        df.loc[mask, 'isbn'] = extracted_isbns

        return df

df = extract_isbn(df)

In [612]:
filtered_df = df[(df['isbn'] != '9999999999999') & ~df['isbn'].isnull()]
filtered_df.shape

(48221, 26)

Splitting out the int after the title in series into a separate series_num column: Adds to new series_num column, removes number from series column
We're not making series_num an int, bc some cases have a range of nums, e.g. 1-3

In [613]:
# adding the series_num column: 
df['series_num'] = None

In [614]:
def extract_series_num(df):
    """
    Extracts the series number from the 'series' column of a DataFrame.

    This function searches the 'series' column for numbers following the '#' character 
    to determine the series number. The extracted number is then stored in a new 'series_num' 
    column. Subsequently, the matched pattern is removed from the 'series' column, leaving 
    behind a cleaned-up series name without the series number.

    Parameters:
    df (pd.DataFrame): The DataFrame with a 'series' column to process.

    Returns:
    pd.DataFrame: A modified version of the input DataFrame, now containing a 'series_num' 
                  column and an updated 'series' column with series numbers removed.

    Raises:
    Prints a message: If 'series' column is missing from the DataFrame.
    """
    
    if 'series' not in df.columns:
        print("'series' column missing from the DataFrame.")
    else:
        # This regex matches the character that comes after the '#' in the series column and captures the following digits
        series_int_pattern = r'(#\d+)'
        serieses = df['series'].astype(str)
        extracted_series_num = serieses.str.extract(series_int_pattern)[0]
        extracted_series_num = extracted_series_num.str.replace('#', '')
        df['series_num'] = extracted_series_num
        df['series'] = df['series'].str.replace(series_int_pattern, '', regex=True).str.strip()

        return df

df = extract_series_num(df)

In [615]:
df[['series', 'series_num']].head(20)

Unnamed: 0,series,series_num
0,The Hunger Games,1.0
1,Harry Potter,5.0
2,To Kill a Mockingbird,
3,,
4,The Twilight Saga,1.0
5,,
6,,
7,The Chronicles of Narnia (Publication Order) –7,1.0
8,The Lord of the Rings -3,0.0
9,,


In [616]:
print(df.bookId.shape)
df.bookId.unique().shape

(52428,)


(52424,)

Validation for dates - checking for cases where publish date is earlier than first publish date. 

First you need to figure out what the actual format is. Then remove all non-dates. Then set the data type. Then check for cases where publish date is earlier than first publish date.

In [617]:

df[['publishDate', 'firstPublishDate']] = df[['publishDate', 'firstPublishDate']].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y', errors='coerce')).apply(lambda x: x.dt.normalize())

In [618]:
def fix_dates(df):
    """
    Correct the 'firstPublishDate' and 'publishDate' in the DataFrame.
    
    The function performs the following corrections:
    - Subtracting 100 years from 'firstPublishDate' if greater than today's date.
    - Subtracting 100 years from 'publishDate' if greater than today's date.
    - Subtracting 100 years from 'firstPublishDate' if greater than 'publishDate'.

    Parameters:
    df (pd.DataFrame): DataFrame containing the 'firstPublishDate' and 'publishDate' columns.

    Returns:
    None

    Raises:
    Prints message: If 'firstPublishDate' or 'publishDate' columns are not in the DataFrame.
    """
    

    if 'firstPublishDate' not in df.columns:
        print("'firstPublishDate' column missing from the DataFrame.")
    
    if 'publishDate' not in df.columns:
        print("'publishDate' column missing from the DataFrame.")
    
    else:
        # Identify rows where firstPublishDate is greater than today and subtract 100 years
        mask_first_publish_date = df['firstPublishDate'] > pd.Timestamp.today()
        df.loc[mask_first_publish_date, 'firstPublishDate'] -= DateOffset(years=100)

        # Identify rows where publishDate is greater than today and subtract 100 years
        mask_publish_date = df['publishDate'] > pd.Timestamp.today()
        df.loc[mask_publish_date, 'publishDate'] -= DateOffset(years=100)

        # Identify rows where firstPublishDate is greater than publishDate and subtract 100 years
        mask_first_publish_vs_publish = df['firstPublishDate'] > df['publishDate']
        df.loc[mask_first_publish_vs_publish, 'firstPublishDate'] -= DateOffset(years=100)
    
fix_dates(df)

In [619]:
df[['publishDate', 'firstPublishDate']].head(10)

Unnamed: 0,publishDate,firstPublishDate
0,2008-09-14,NaT
1,2004-09-28,2003-06-21
2,2006-05-23,1960-07-11
3,2000-10-10,1913-01-28
4,2006-09-06,2005-10-05
5,2006-03-14,2005-09-01
6,1996-04-28,1945-08-17
7,2002-09-16,1956-10-28
8,2012-09-25,1955-10-20
9,1999-04-01,1936-06-30


Strip whitespace one more time:

In [620]:

df = df.applymap(lambda r: r.strip() if isinstance(r, str) else r)

Adding new edition_id column to df, bc we will drop bookId column:

In [621]:
df['edition_id'] = df.index

In [622]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn', 'series_num',
       'edition_id'],
      dtype='object')

## Creating dfs to be loaded to db tables

Creating edition df, and changing column names:

In [623]:
name_map = {
    'numRatings': 'num_ratings', 
    'likedPercent': 'liked_percent', 
    'bbeScore': 'bbe_score',
    'bbeVotes': 'bbe_votes',
    'bookFormat': 'format',
    'publishDate': 'publish_date',
    'coverImg': 'cover_url',
    'characters': 'char_name',
    'firstPublishDate': 'first_publish_date'
    }

In [624]:
edition_df = df[['title', 'isbn', 'rating', 'numRatings', 'likedPercent', 'bbeScore', 'bbeVotes', 'bookFormat', 'edition', 'series', 'series_num', 'publishDate', 'firstPublishDate', 'description', 'language', 'pages', 'publisher', 'coverImg', 'price', 'edition_id', 'is_duplicate_isbn']].copy()
edition_df.rename(columns=name_map, inplace=True)

In [625]:
edition_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52428 entries, 0 to 52477
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   title               52428 non-null  object        
 1   isbn                48221 non-null  object        
 2   rating              52428 non-null  float64       
 3   num_ratings         52428 non-null  int64         
 4   liked_percent       51807 non-null  float64       
 5   bbe_score           52428 non-null  int64         
 6   bbe_votes           52428 non-null  int64         
 7   format              50955 non-null  object        
 8   edition             4949 non-null   object        
 9   series              23445 non-null  object        
 10  series_num          22384 non-null  object        
 11  publish_date        818 non-null    datetime64[ns]
 12  first_publish_date  28753 non-null  datetime64[ns]
 13  description         51092 non-null  object        


In [626]:
edition_df.head()

Unnamed: 0,title,isbn,rating,num_ratings,liked_percent,bbe_score,bbe_votes,format,edition,series,...,publish_date,first_publish_date,description,language,pages,publisher,cover_url,price,edition_id,is_duplicate_isbn
0,The Hunger Games,9780439023481.0,4.33,6376780,96.0,2993816,30516,Hardcover,First Edition,The Hunger Games,...,2008-09-14,NaT,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,374,Scholastic Press,https://i.gr-assets.com/images/S/compressed.ph...,5.09,0,False
1,Harry Potter and the Order of the Phoenix,9780439358071.0,4.5,2507623,98.0,2632233,26923,Paperback,US Edition,Harry Potter,...,2004-09-28,2003-06-21,There is a door at the end of a silent corrido...,English,870,Scholastic Inc.,https://i.gr-assets.com/images/S/compressed.ph...,7.38,1,False
2,To Kill a Mockingbird,,4.28,4501075,95.0,2269402,23328,Paperback,,To Kill a Mockingbird,...,2006-05-23,1960-07-11,The unforgettable novel of a childhood in a sl...,English,324,Harper Perennial Modern Classics,https://i.gr-assets.com/images/S/compressed.ph...,,2,True
3,Pride and Prejudice,9780679783268.0,4.26,2998241,94.0,1983116,20452,Paperback,"Modern Library Classics, USA / CAN",,...,2000-10-10,1913-01-28,Alternate cover edition of ISBN 9780679783268S...,English,279,Modern Library,https://i.gr-assets.com/images/S/compressed.ph...,,3,True
4,Twilight,9780316015844.0,3.6,4964519,78.0,1459448,14874,Paperback,,The Twilight Saga,...,2006-09-06,2005-10-05,About three things I was absolutely positive.\...,English,501,"Little, Brown and Company",https://i.gr-assets.com/images/S/compressed.ph...,2.1,4,False


Changing all fields in df that contain a string that looks like a list literals into actual lists:

In [627]:
def apply_literal_eval(df, cols):
    """
    Apply literal evaluation to specified columns of a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to apply the transformation to.
    cols (list): A list of columns on which to apply ast.literal_eval.

    Returns:
    pd.DataFrame: DataFrame with the transformed columns.

    Raises:
    Prints message: If any column does not exist in the DataFrame or an error occurs while applying ast.literal_eval.
    """

    for col in cols:
        if col not in df.columns:
            print(f"Column '{col}' not found in DataFrame.")
        else:
            df[col] = df[col].apply(ast.literal_eval)

    return df

cols = ['genres', 'characters', 'awards', 'ratingsByStars', 'setting']

df = apply_literal_eval(df, cols)


Getting only integers in pages col:

In [628]:
def rm_text_from_pages(edition_df):
    """
    Remove non-digit characters from the 'pages' column of a DataFrame.

    The function looks for non-digit characters in the 'pages' column of the given DataFrame and replaces them with zeros. The original DataFrame is modified.

    Parameters:
    edition_df (pd.DataFrame): DataFrame containing the 'pages' column.

    Returns:
    pd.DataFrame: DataFrame with non-digit characters removed from the 'pages' column.

    Raises:
    Prints message: If edition_df does not contain the 'pages' column.
    """

    if 'pages' not in edition_df.columns:
        print("'pages' column must be present in the DataFrame")
    else:
        # Use regular expression to replace non-digit characters with NaN, and then fill NaN with a default value
        edition_df['pages'] = edition_df['pages'].astype(str).replace(r'\D', '', regex=True).replace('', np.nan).fillna(0).astype('int')

        return edition_df

edition_df = rm_text_from_pages(edition_df)

Formatting price column:

In [629]:
def reformat_price(edition_df):
    """
    Remove all occurrences of the period (.) in the 'price' column except for the last one.

    The function looks for occurrences of the period character in the 'price' column and removes all but the last occurrence.

    Parameters:
    edition_df (pd.DataFrame): DataFrame containing the 'price' column.

    Returns:
    pd.DataFrame: DataFrame with all but the last occurrence of the period removed from the 'price' column.

    Raises:
    Prints message: If edition_df does not contain the 'price' column.
    """

    if 'price' not in edition_df.columns:
        print("'price' column must be present in the DataFrame")
    else:
        # Define a function to remove all but the last occurrence of the period
        def remove_all_except_last(price):
            if pd.isnull(price):
                return price
            parts = str(price).split('.')
            new_price = f"{''.join(parts[:-1])}.{parts[-1]}" if len(parts) > 1 else price
            return new_price

        # Apply the transformation to the 'price' column
        edition_df['price'] = edition_df['price'].apply(remove_all_except_last)

        return edition_df

edition_df = reformat_price(edition_df)


In [630]:
edition_df['price']

0        5.09
1        7.38
2         NaN
3         NaN
4         2.1
         ... 
52473     NaN
52474     NaN
52475    7.37
52476    2.86
52477    5.20
Name: price, Length: 52428, dtype: object

In [631]:
df["genres"].head()

0    [Young Adult, Fiction, Dystopia, Fantasy, Scie...
1    [Fantasy, Young Adult, Fiction, Magic, Childre...
2    [Classics, Fiction, Historical Fiction, School...
3    [Classics, Fiction, Romance, Historical Fictio...
4    [Young Adult, Fantasy, Romance, Vampires, Fict...
Name: genres, dtype: object

Creating genre df, stripping whitespace, then filling columns

In [632]:
def create_genre_df(df):
    """
    Extract the individual genres from the 'genres' column of a given DataFrame.

    The function looks for the 'edition_id' and 'genres' columns in the given DataFrame. The 'genres' column is expected to contain lists of genres, and this function separates each genre into its own row, linking it with the corresponding 'edition_id'.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'genres' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'genre', where each genre is in its own row.

    Raises:
    Prints message: If the required columns are not found in the DataFrame.
    """

    if 'edition_id' not in df.columns or 'genres' not in df.columns:
        print("'edition_id' and 'genres' columns must be present in the DataFrame.")
    else:
        genre_df = df[['edition_id', 'genres']].copy()
        genre_df['genres'] = genre_df['genres'].apply(lambda genres: [genre.strip() for genre in genres])
        genre_df = genre_df.explode('genres')
        genre_df.dropna(subset=['genres'], inplace=True)
        genre_df.drop_duplicates(inplace=True)
        genre_df.rename(columns={'genres': 'genre'}, inplace=True)

        return genre_df


genre_df = create_genre_df(df)


In [633]:
genre_df.head(20)

Unnamed: 0,edition_id,genre
0,0,Young Adult
0,0,Fiction
0,0,Dystopia
0,0,Fantasy
0,0,Science Fiction
0,0,Romance
0,0,Adventure
0,0,Teen
0,0,Post Apocalyptic
0,0,Action


In [634]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn', 'series_num',
       'edition_id'],
      dtype='object')

In [635]:
df['characters'].head()

0    [Katniss Everdeen, Peeta Mellark, Cato (Hunger...
1    [Sirius Black, Draco Malfoy, Ron Weasley, Petu...
2    [Scout Finch, Atticus Finch, Jem Finch, Arthur...
3    [Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabe...
4    [Edward Cullen, Jacob Black, Laurent, Renee, B...
Name: characters, dtype: object

In [636]:
print(df['characters'].apply(type).value_counts())


characters
<class 'list'>    52428
Name: count, dtype: int64


Creating character df and stripping whitespace:

In [637]:
def create_char_df(df):
    """
    Extract the individual character names from the 'characters' column of a given DataFrame.

    The function looks for the 'edition_id' and 'characters' columns in the given DataFrame. The 'characters' column is expected to contain lists of character names, and this function separates each character into its own row, linking it with the corresponding 'edition_id'.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'characters' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'char_name', where each character name is in its own row.

    Raises:
    Prints message: If the required columns are not found in the DataFrame.
    """

    if 'characters' not in df.columns:
        print("'characters' column must be present in the DataFrame.")
    if 'edition_id' not in df.columns:
        print("'edition_id' column must be present in the DataFrame.")
    else:
        char_df = df[['edition_id', 'characters']].copy()
        char_df['characters'] = char_df['characters'].apply(lambda characters: [character.strip() for character in characters])
        char_df = char_df.explode('characters')
        char_df.dropna(subset=['characters'], inplace=True)
        char_df.drop_duplicates(inplace=True)
        char_df.rename(columns={'characters': 'char_name'}, inplace=True)
        return char_df

char_df = create_char_df(df)

In [638]:
char_df.head(20)

Unnamed: 0,edition_id,char_name
0,0,Katniss Everdeen
0,0,Peeta Mellark
0,0,Cato (Hunger Games)
0,0,Primrose Everdeen
0,0,Gale Hawthorne
0,0,Effie Trinket
0,0,Haymitch Abernathy
0,0,Cinna
0,0,President Coriolanus Snow
0,0,Rue


Creating setting df and stripping whitespace:

In [639]:
def create_setting_df(df):
    """
    Extract the individual settings from the 'setting' column of a given DataFrame.

    The function looks for the 'edition_id' and 'setting' columns in the given DataFrame. The 'setting' column is expected to contain lists of settings, and this function separates each setting into its own row, linking it with the corresponding 'edition_id'.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'setting' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'setting', where each setting is in its own row.

    Raises:
    Prints message: If the required columns 'edition_id' and 'setting' are not found in the DataFrame.
    """

    if 'edition_id' not in df.columns:
        print("'edition_id' column must be present in the DataFrame.")
    if 'setting' not in df.columns:
        print("'setting' column must be present in the DataFrame.")
    else:
        setting_df = df[['edition_id', 'setting']].copy()
        setting_df['setting'] = setting_df['setting'].apply(lambda settings: [setting.strip() for setting in settings])
        setting_df = setting_df.explode('setting')
        setting_df.dropna(subset=['setting'], inplace=True)
        setting_df.drop_duplicates(inplace=True)

        return setting_df

setting_df = create_setting_df(df)

In [640]:
setting_df.head(20)

Unnamed: 0,edition_id,setting
0,0,"District 12, Panem"
0,0,"Capitol, Panem"
0,0,Panem (United States)
1,1,Hogwarts School of Witchcraft and Wizardry (Un...
1,1,"London, England"
2,2,"Maycomb, Alabama (United States)"
3,3,United Kingdom
3,3,"Derbyshire, England (United Kingdom)"
3,3,England
3,3,"Hertfordshire, England (United Kingdom)"


Creating star rating df and stripping whitespace: 
In this df, I have sliced off data where we are missing 1 or more fields from ratingsByStars, because I see no way to know which star rating the missing data belongs to.

In [641]:
def create_star_rating_df(df):
    """
    Create a DataFrame by transforming the 'ratingsByStars' column of the given DataFrame.
    The 'ratingsByStars' column is assumed to contain lists of star ratings, and this function
    extracts these ratings into individual columns.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'ratingsByStars' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id', 'five_star', 'four_star', 'three_star', 'two_star', and 'one_star'.

    Raises:
    Prints message: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'ratingsByStars' not in df.columns:
        print("'ratingsByStars' column must be present in the DataFrame.")
    if 'edition_id' not in df.columns:
        print("'edition_id' column must be present in the DataFrame.")
    else:
        if df['ratingsByStars'].apply(type).eq(str).all():
            df['ratingsByStars'] = df['ratingsByStars'].apply(lambda r: ast.literal_eval(r))

        mask = df['ratingsByStars'].apply(len) == 5
        ratings_data = pd.DataFrame(df.loc[mask, 'ratingsByStars'].tolist(),
                                    columns=['five_star', 'four_star', 'three_star', 'two_star', 'one_star'])
        star_rating_df = ratings_data.assign(edition_id=df.loc[mask, 'edition_id'].values).reset_index(drop=True)
        star_rating_df = star_rating_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
        star_rating_df.drop_duplicates(inplace=True)

        return star_rating_df

star_rating_df = create_star_rating_df(df)


In [642]:
star_rating_df.head(10) 

Unnamed: 0,five_star,four_star,three_star,two_star,one_star,edition_id
0,3444695,1921313,745221,171994,93557,0
1,1593642,637516,222366,39573,14526,1
2,2363896,1333153,573280,149952,80794,2
3,1617567,816659,373311,113934,76770,3
4,1751460,1113682,1008686,542017,548674,4
5,1048230,524674,186297,48864,26211,5
6,986764,958699,545475,165093,84682,6
7,254964,167572,74362,15423,5419,7
8,78217,22857,6628,1477,967,8
9,602138,275517,133535,39008,24422,9


Creating award df:

In [643]:
def create_award_df(df):
    """
    Create a DataFrame by transforming the 'awards' column of the given DataFrame.
    The 'awards' column is assumed to contain lists of awards, and this function
    explodes these lists into individual rows.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'awards' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'award'.

    Raises:
    Prints message: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'awards' not in df.columns:
        print("'awards' column must be present in the DataFrame.")
    if 'edition_id' not in df.columns:
        print("'edition_id' column must be present in the DataFrame.")
    else:
        award_df = df[['edition_id', 'awards']].copy().where(df['awards'].notnull())
        award_df = award_df.explode('awards')
        award_df = award_df.rename(columns={'awards': 'award'})
        award_df.dropna(subset=['award'], inplace=True)
        award_df.drop_duplicates(inplace=True)

        return award_df

award_df = create_award_df(df)


In [644]:
award_df.head()

Unnamed: 0,edition_id,award
0,0,Locus Award Nominee for Best Young Adult Book ...
0,0,Georgia Peach Book Award (2009)
0,0,Buxtehuder Bulle (2009)
0,0,Golden Duck Award for Young Adult (Hal Clement...
0,0,Grand Prix de l'Imaginaire Nominee for Roman j...


Splitting out year from award field and remove from award field and stripping whitespace::

In [645]:
def split_year(award_df):
    """
    Extract the year from the 'award' column in the given DataFrame.

    The function looks for years enclosed in parentheses within the 'award' column and separates this information into a new 'award_year' column. The 'award' column is then updated to contain only the award name.

    Parameters:
    award_df (pd.DataFrame): The DataFrame containing the 'award' column.

    Returns:
    pd.DataFrame: A new DataFrame with 'award' and 'award_year' columns.

    Raises:
    Prints message: If the 'award' column is not found in the DataFrame or an error occurs while extracting the year.
    """
    
    if 'award' not in award_df.columns:
        print("'award' column must be present in the DataFrame.")
    else:
        year_pattern = r'\((\d{4})\)'
        award_df['award_year'] = award_df['award'].str.extract(year_pattern)
        award_df['award'] = award_df['award'].str.replace(year_pattern, '', regex=True)
        award_df = award_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)

        return award_df

award_df = split_year(award_df)


In [646]:
award_df.head(10)

Unnamed: 0,edition_id,award,award_year
0,0,Locus Award Nominee for Best Young Adult Book,2009
0,0,Georgia Peach Book Award,2009
0,0,Buxtehuder Bulle,2009
0,0,Golden Duck Award for Young Adult (Hal Clement...,2009
0,0,Grand Prix de l'Imaginaire Nominee for Roman j...,2010
0,0,Books I Loved Best Yearly (BILBY) Awards for O...,2012
0,0,West Australian Young Readers' Book Award (WAY...,2010
0,0,Red House Children's Book Award for Older Read...,2010
0,0,South Carolina Book Award for Junior and Young...,2011
0,0,Charlotte Award,2010


Creating creator df:

In [647]:
def create_creator_df(df):
    """
    Extract the creators from the 'author' column in the given DataFrame.

    The function looks for the 'author' column and splits each entry by comma, placing each creator into a new row under the 'creator' column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'author' columns.

    Returns:
    pd.DataFrame: A new DataFrame with 'edition_id' and 'creator' columns, where each creator is in its own row.

    Raises:
    Prints message: If the required 'edition_id' or 'author' columns are not found in the DataFrame.
    """


    if 'edition_id' not in df.columns:
        print("'edition_id' column must be present in the DataFrame.")
    if 'author' not in df.columns:
        print("'author' column must be present in the DataFrame.")
    else:
        creator_df = df[['edition_id', 'author']].copy()
        creator_df['author'] = creator_df['author'].apply(lambda author: author.split(', ') if isinstance(author, str) else author)
        creator_df = creator_df.explode('author')
        creator_df.dropna(subset=['author'], inplace=True)
        creator_df.rename(columns={'author': 'creator'}, inplace=True)
        creator_df.drop_duplicates(inplace=True)

        return creator_df

creator_df = create_creator_df(df)

In [648]:
creator_df.head(20)

Unnamed: 0,edition_id,creator
0,0,Suzanne Collins
1,1,J.K. Rowling
1,1,Mary GrandPré (Illustrator)
2,2,Harper Lee
3,3,Jane Austen
3,3,Anna Quindlen (Introduction)
4,4,Stephenie Meyer
5,5,Markus Zusak (Goodreads Author)
6,6,George Orwell
6,6,Russell Baker (Preface)


Splitting out role information into new column and stripping whitespace:

In [649]:
def split_role(creator_df):
    """
    Extract the creator's role from the 'creator' column in the given DataFrame.

    The function looks for the role within parentheses in the 'creator' column and splits it into 'creator_name' and 'role' columns. If no role is found, the 'role' column is filled with 'missing'.

    Parameters:
    creator_df (pd.DataFrame): The DataFrame containing the 'creator' column.

    Returns:
    pd.DataFrame: A new DataFrame with 'creator_name' and 'role' columns.

    Raises:
    Prints message: If the 'creator' column is not found in the DataFrame.
    """


    if 'creator' not in creator_df.columns:
        print("'creator' column must be present in the DataFrame.")
    else:
        role_pattern = r'\((.*?)\)'
        creator_df['role'] = creator_df['creator'].str.extract(role_pattern)
        creator_df['creator'] = creator_df['creator'].str.replace(role_pattern, '', regex=True)
        creator_df = creator_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
        creator_df.rename(columns={'creator': 'creator_name'}, inplace=True)
        creator_df['role'].fillna('missing', inplace=True)
        creator_df.drop_duplicates(inplace=True)

        return creator_df

creator_df = split_role(creator_df)

In [650]:
creator_df.head(20)

Unnamed: 0,edition_id,creator_name,role
0,0,Suzanne Collins,missing
1,1,J.K. Rowling,missing
1,1,Mary GrandPré,Illustrator
2,2,Harper Lee,missing
3,3,Jane Austen,missing
3,3,Anna Quindlen,Introduction
4,4,Stephenie Meyer,missing
5,5,Markus Zusak,Goodreads Author
6,6,George Orwell,missing
6,6,Russell Baker,Preface


## Sending dfs to parquet for use in other notebooks

In [651]:
edition_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/edition_df.parquet', index=False)
genre_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/genre_df.parquet', index=False)
char_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/char_df.parquet', index=False)
setting_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/setting_df.parquet', index=False)
star_rating_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/star_rating_df.parquet', index=False)
award_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/award_df.parquet', index=False)
creator_df.to_parquet('/Users/bfaris96/Desktop/turing-proj/books_db/data/creator_df.parquet', index=False)