Remember to get right in this project:
- Folder structure
- Docstrings
- PEP8
- Exception handling

In [425]:
# !python3 -m venv venv

In [426]:
# !pip install numpy
# !pip install pandas
# !pip install sqlalchemy
# !pip install ipython-sql
# !pip install python-dotenv
# !pip install psycopg2
# !pip install pandas-profiling
# !pip install ipywidgets


In [427]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv


load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [428]:
%sql postgresql://postgres:password@localhost/books

In [429]:
import regex as re
from pandas.tseries.offsets import *
from datetime import date
import ast

In [430]:
import pandas as pd
# Location of the raw CSV. Set the BOOKS_CSV_PATH environment variable (for
# example in .env) to run the notebook on another machine; when it is not set,
# the original absolute path is used.
BOOKS_CSV_PATH = os.getenv(
    'BOOKS_CSV_PATH',
    '/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv',
)
df = pd.read_csv(BOOKS_CSV_PATH)

In [431]:
# df[df['description'].str[:4]== 'ISBN']

Dropping duplicates:

In [432]:
# With no arguments, every column is compared and the first copy of each
# fully identical row is the one that is kept.
df = df.drop_duplicates()

Get rows where the ISBN is duplicated and is not the '9999999999999' placeholder:

In [433]:
# duplicated_isbns = df[(df['isbn']!= '9999999999999') & (df.duplicated(subset='isbn', keep=False))]
# duplicated_isbns

Dropping rows with all nulls:

In [438]:
# how='all' removes only rows in which every column is missing; rows that are
# partially filled are kept.
df = df.dropna(how='all')

Flagging rows with duplicate isbns that are not 9999999999999 placeholder values:

In [445]:
def flag_duplicate_isbns(df):
    """
    Flag rows whose ISBN is shared with at least one other row.

    Returns a copy of the DataFrame with a boolean column 'is_duplicate_isbn'.
    The column is True for every row of a repeated ISBN and False otherwise.
    Rows without a real ISBN are never flagged: the '9999999999999'
    placeholder and missing (null) ISBNs would otherwise all count as
    duplicates of each other.

    The input DataFrame is not modified.

    :param df: DataFrame with a column named 'isbn'
    :return: New DataFrame with 'is_duplicate_isbn' column added
    :raises ValueError: If 'isbn' column is missing in the DataFrame
    """
    # Raised directly (no blanket try/except) so callers can actually catch it.
    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")

    isbn = df['isbn']
    has_real_isbn = isbn.notna() & (isbn != '9999999999999')
    # keep=False marks every member of a duplicated group, not just the repeats
    mask = has_real_isbn & df.duplicated(subset='isbn', keep=False)
    return df.assign(is_duplicate_isbn=mask)

# Usage example:
try:
    df = flag_duplicate_isbns(df)
except ValueError as e:
    print(e)

In [446]:
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price', 'is_duplicate_isbn'],
      dtype='object')

In [378]:
def rm_duplicates(df, keep=False):
    """
    Remove rows with duplicated ISBNs, preserving rows that have no real ISBN.

    Rows whose ISBN is the '9999999999999' placeholder or is missing (null)
    are always kept; they carry no usable ISBN, so they are not duplicates of
    one another.

    :param df: Input DataFrame containing a column 'isbn'
    :param keep: Passed to DataFrame.duplicated. False (the default, and the
        original behaviour) drops EVERY row of a repeated ISBN; 'first' or
        'last' keeps one row per ISBN instead
    :return: Filtered DataFrame
    :raises ValueError: If 'isbn' column is missing
    """
    # Raised directly so the documented exception really reaches the caller.
    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")

    isbn = df['isbn']
    has_no_real_isbn = isbn.isna() | (isbn == '9999999999999')
    return df[has_no_real_isbn | ~df.duplicated(subset='isbn', keep=keep)]

#Usage:
df = rm_duplicates(df)

Removing all new lines (\n) and leading and trailing whitespace from all rows

In [None]:
# One pass over every cell: trim both ends of each string first, then turn any
# newline still inside it into a single space. Non-string cells are returned as is.
df = df.applymap(
    lambda cell: cell.strip().replace('\n', ' ') if type(cell) == str else cell
)

This will get all the rows in ISBN that start with a letter:

In [None]:
# df[df['isbn'].str[0].str.isalpha()]


In [None]:
# df[df['description'].str[:4]== 'ISBN']

Filling the ISBN field using the ISBN found in the description field, if the ISBN field is the 9999999999999 placeholder or null:

Eventually rewrite this with error handling that will catch if wrong data type (non-str) is passed to the function

In [None]:
# Compiled once instead of once per row. Alternatives, tried in this order:
#   1. exactly 13 consecutive digits (not part of a longer digit run)
#   2. exactly 10 consecutive digits (not part of a longer digit run)
#   3. "B", 4 digits, 3 uppercase letters, 1 digit, 1 uppercase letter: B1234XYZ7A
#   4. "978-", one digit, then 9 to 13 digits or dashes: 978-3-16-148410-0
_ISBN_PATTERN = re.compile(
    r'(?<!\d)(\d{13})(?!\d)'
    r'|(?<!\d)(\d{10})(?!\d)'
    r'|(B\d{4}[A-Z]{3}\d[A-Z])'
    r'|(978-\d[-\d]{9,13})'
)


def move_isbn(row):
    """
    Fill a missing ISBN from the first ISBN-like token in the description.

    The row is only touched when its 'isbn' is the '9999999999999' placeholder
    or null AND its 'description' is a string containing a match. Dashes are
    removed from a hyphenated match. The identifier is taken from its own
    capture group, so a "B…" identifier is stored whole (trimming non-digit
    edge characters would cut off its leading "B" and trailing letter).

    :param row: Row (pandas Series) with 'isbn' and 'description' entries
    :return: The same row, with 'isbn' replaced when a match was found
    """
    if not (row['isbn'] == '9999999999999' or pd.isnull(row['isbn'])):
        return row

    description = row['description']
    if not isinstance(description, str):
        return row

    found = _ISBN_PATTERN.search(description)
    if found:
        # Exactly one alternative matched, so exactly one group is not None
        identifier = next(g for g in found.groups() if g is not None)
        # Only the "978-" form can contain dashes; a no-op for the others
        row['isbn'] = identifier.replace('-', '')
    return row

# Usage:
df = df.apply(move_isbn, axis=1)

In [None]:
df.shape

Splitting the number that follows the '#' in the series column out into a separate series_num column: the number is added to the new series_num column and removed from the series column.
We're not making series_num an int, because some cases have a range of numbers, e.g. 1-3.

In [None]:
# adding the series_num column: 
df['series_num'] = None

In [None]:
def move_series_num(row):
    """
    Move the '#…' suffix of a row's 'series' value into 'series_num'.

    Everything from the first '#' to the end of the line is cut out of
    'series'; the same text with its '#' characters removed is stored in
    'series_num'. Rows whose 'series' is not a string, or has no '#', are
    returned unchanged.

    :param row: Row (pandas Series) with 'series' and 'series_num' entries
    :return: The same row, updated when a '#' suffix was found
    """
    series = row['series']
    if not isinstance(series, str):
        return row

    # Matches the '#' and every character that comes after it on the line
    hit = re.search(r'(#.*)', series)
    if hit is None:
        return row

    suffix = hit.group()
    row['series_num'] = suffix.replace('#', '')
    row['series'] = series.replace(suffix, '')
    return row

# Usage:
df = df.apply(move_series_num, axis=1)

In [None]:
print(df.bookId.shape)
df.bookId.unique().shape

In [None]:
# df['bookFormat'].value_counts()

Validation for dates - checking for cases where publish date is earlier than first publish date. 

First you need to figure out what the actual format is. Then remove all non-dates. Then set the data type. Then check for cases where publish date is earlier than first publish date.

In [None]:

# Parse both date columns as MM/DD/YY. errors='coerce' turns every value that
# does not fit that exact format into NaT instead of raising, so dates written
# in any other format are lost here.
# NOTE(review): %y is a two-digit year, so the parser has to guess the century;
# correct_date later shifts future dates back by 100 years - confirm that this
# covers every case in the data.
df[['publishDate', 'firstPublishDate']] = df[['publishDate', 'firstPublishDate']].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y', errors='coerce'))


# df['publishDate'] = [pd.to_datetime(element, format='%m/%d/%y', errors='coerce') for element in df['publishDate']]
# df['firstPublishDate'] = [pd.to_datetime(element, format= '%m/%d/%y', errors='coerce') for element in df['firstPublishDate']]



In [None]:
# Second pass over the same two columns: to_datetime is applied again and
# .dt.normalize() sets the time-of-day of every timestamp to midnight.
date_columns = ['publishDate', 'firstPublishDate']
for col in date_columns:
    df[col] = pd.to_datetime(df[col]).dt.normalize()

In [None]:


def correct_date(row):
    """
    Shift implausible publish dates back by one century.

    Applied in this order:
    1. 'firstPublishDate' later than now  -> minus 100 years
    2. 'publishDate' later than now       -> minus 100 years
    3. 'firstPublishDate' later than 'publishDate' -> 'firstPublishDate'
       minus 100 years (this can follow step 1 on the same row)

    Comparisons involving NaT are False, so missing dates are left alone.

    :param row: Row (pandas Series) with 'firstPublishDate' and 'publishDate'
    :return: The same row with the corrected dates
    """
    century = DateOffset(years=100)

    # Same order as the rules above: first-publish date, then publish date
    for column in ('firstPublishDate', 'publishDate'):
        if row[column] > pd.Timestamp.today():
            row[column] = row[column] - century

    if row['firstPublishDate'] > row['publishDate']:
        row['firstPublishDate'] = row['firstPublishDate'] - century
    return row

# Usage: 
df = df.apply(correct_date, axis=1)


In [None]:
# df.head(100)

In [None]:
# df['publishDate'].dtypes

In [None]:
# df.shape

In [None]:
# rows_with_nulls = df[df.isnull().sum(axis=1) >= 13]
# print(rows_with_nulls)

Strip whitespace one more time:

In [None]:

# move_series_num cuts the '#...' part out of 'series' but leaves any space that
# came before it, so string cells are trimmed a second time here.
# Non-string cells pass through unchanged.
df = df.applymap(lambda r: r.strip() if type(r) == str else r)
# df = pd.DataFrame({col: [element.strip() if isinstance(element, str) else element for element in df[col]] for col in df.columns})


Adding new edition_id column to df, bc we will drop bookId column:

In [None]:
# edition_id is the current index label of each row. The index is not reset
# after the earlier drop steps, so the ids can have gaps (2, 3, 7, 10, ... in
# the outputs shown further down).
df['edition_id'] = df.index

In [None]:
df.columns

Creating edition df:

In [None]:
# Select the edition-level columns; .copy() makes edition_df independent of df,
# so later changes to df do not show up in it.
edition_df = df.loc[:, [
    'title', 'isbn',
    'rating', 'numRatings', 'likedPercent', 'bbeScore', 'bbeVotes',
    'bookFormat', 'edition', 'publishDate', 'firstPublishDate',
    'description', 'series', 'series_num', 'language', 'pages',
    'publisher', 'coverImg', 'price',
    'edition_id', 'is_duplicate_isbn',
]].copy()

In [300]:
edition_df.head()

Unnamed: 0,title,isbn,rating,numRatings,likedPercent,bbeScore,bbeVotes,bookFormat,edition,publishDate,firstPublishDate,description,series,series_num,language,pages,publisher,coverImg,price,edition_id
2,To Kill a Mockingbird,9999999999999,4.28,4501075,95.0,2269402,23328,Paperback,,2006-05-23,1960-07-11,The unforgettable novel of a childhood in a sl...,To Kill a Mockingbird,,English,324,Harper Perennial Modern Classics,https://i.gr-assets.com/images/S/compressed.ph...,,2
3,Pride and Prejudice,9780679783268,4.26,2998241,94.0,1983116,20452,Paperback,"Modern Library Classics, USA / CAN",2000-10-10,1913-01-28,Alternate cover edition of ISBN 9780679783268S...,,,English,279,Modern Library,https://i.gr-assets.com/images/S/compressed.ph...,,3
7,The Chronicles of Narnia,9999999999999,4.26,517740,96.0,1238556,12949,Paperback,Reissue Edition,2002-09-16,1956-10-28,"Journeys to the end of the world, fantastic cr...",The Chronicles of Narnia (Publication Order),1–7,English,767,HarperCollins,https://i.gr-assets.com/images/S/compressed.ph...,,7
10,The Fault in Our Stars,9999999999999,4.21,3550714,93.0,1087056,11287,Hardcover,,2012-01-10,NaT,Despite the tumor-shrinking medical miracle th...,,,English,313,Dutton Books,https://i.gr-assets.com/images/S/compressed.ph...,,10
11,The Hitchhiker's Guide to the Galaxy,9999999999999,4.22,1436325,94.0,1063601,10996,Paperback,,2007-06-23,1979-10-12,Seconds before the Earth is demolished to make...,The Hitchhiker's Guide to the Galaxy,1,English,193,Del Rey,https://i.gr-assets.com/images/S/compressed.ph...,,11


Changing all fields in df that contain a string that looks like a list into actual lists:

In [None]:
def _parse_list_cell(value):
    """Turn a stringified list such as "['a', 'b']" into a real list."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple)):
        # Already parsed, e.g. this cell was run a second time
        return value
    if pd.isna(value):
        # Missing value: an empty list keeps the column iterable row by row
        return []
    return value


cols = ['genres', 'characters', 'awards', 'ratingsByStars', 'setting']
converted_cols = {col: df[col].apply(_parse_list_cell) for col in cols}
# Direct column replacement; no need to round-trip through a second DataFrame
df = df.assign(**converted_cols)

Creating genre df, then populating it by splitting out genre and appending edition_id, then stripping whitespace:

In [312]:
def create_genre_df(df):
    """
    Build a long-format table with one row per (edition, genre) pair.

    Editions with an empty genre list contribute no rows. When no edition has
    any genre, an empty DataFrame with the two columns is returned.

    :param df: DataFrame with an 'edition_id' column and a 'genres' column
        whose cells are lists of genre names
    :return: DataFrame with columns 'edition_id' and 'genre', with leading and
        trailing whitespace stripped from string values
    """
    records = [
        (edition_id, genre)
        for edition_id, genres in zip(df['edition_id'], df['genres'])
        for genre in genres
    ]
    # Passing the column names explicitly keeps the schema even with no records
    genre_df = pd.DataFrame(records, columns=['edition_id', 'genre'])
    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in genre_df.columns:
        genre_df[col] = genre_df[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return genre_df
    
genre_df = create_genre_df(df)

In [313]:
genre_df.head(20)

Unnamed: 0,edition_id,genre
0,2,Classics
1,2,Fiction
2,2,Historical Fiction
3,2,School
4,2,Literature
5,2,Young Adult
6,2,Historical
7,2,Novels
8,2,Read For School
9,2,High School


Creating character df and stripping whitespace:

In [314]:
def create_char_df(df):
    """
    Build a long-format table with one row per (edition, character) pair.

    Editions with an empty character list contribute no rows. When no edition
    has any character, an empty DataFrame with the two columns is returned.

    :param df: DataFrame with an 'edition_id' column and a 'characters' column
        whose cells are lists of character names
    :return: DataFrame with columns 'edition_id' and 'char_name', with leading
        and trailing whitespace stripped from string values
    """
    records = [
        (edition_id, character)
        for edition_id, characters in zip(df['edition_id'], df['characters'])
        for character in characters
    ]
    # Passing the column names explicitly keeps the schema even with no records
    char_df = pd.DataFrame(records, columns=['edition_id', 'char_name'])
    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in char_df.columns:
        char_df[col] = char_df[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return char_df

char_df = create_char_df(df)

In [315]:
char_df.head(20)

Unnamed: 0,edition_id,char_name
0,2,Scout Finch
1,2,Atticus Finch
2,2,Jem Finch
3,2,Arthur Radley
4,2,Mayella Ewell
5,2,Aunt Alexandra
6,2,Bob Ewell
7,2,Calpurnia (housekeeper)
8,2,Tom Robinson
9,2,Miss Maudie Atkinson


Creating setting df and stripping whitespace:

In [316]:
def create_setting_df(df):
    """
    Build a long-format table with one row per (edition, setting) pair.

    Editions with an empty setting list contribute no rows. When no edition
    has any setting, an empty DataFrame with the two columns is returned.

    :param df: DataFrame with an 'edition_id' column and a 'setting' column
        whose cells are lists of setting names
    :return: DataFrame with columns 'edition_id' and 'setting', with leading
        and trailing whitespace stripped from string values
    """
    records = [
        (edition_id, setting)
        for edition_id, settings in zip(df['edition_id'], df['setting'])
        for setting in settings
    ]
    # Passing the column names explicitly keeps the schema even with no records
    setting_df = pd.DataFrame(records, columns=['edition_id', 'setting'])
    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in setting_df.columns:
        setting_df[col] = setting_df[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return setting_df

setting_df = create_setting_df(df)

In [317]:
setting_df.head(20)

Unnamed: 0,edition_id,setting
0,2,"Maycomb, Alabama (United States)"
1,3,United Kingdom
2,3,"Derbyshire, England (United Kingdom)"
3,3,England
4,3,"Hertfordshire, England (United Kingdom)"
5,7,"London, England"
6,10,"Indianapolis, Indiana (United States)"
7,10,Amsterdam (Netherlands)
8,14,Paris (France)
9,14,"London, England"


Creating star rating df and stripping whitespace: 
In this df, I have sliced off data where we are missing 1 or more fields from ratingsByStars, because I see no way to know which star rating the missing data belongs to.

In [319]:
def create_star_rating_df(df):
    """
    Build a table with one row of star-rating counts per edition.

    Only editions whose 'ratingsByStars' list has exactly five entries are
    included; positions 0 to 4 of that list are stored as 'five_star',
    'four_star', 'three_star', 'two_star' and 'one_star'. When no edition
    qualifies, an empty DataFrame with the six columns is returned.

    :param df: DataFrame with an 'edition_id' column and a 'ratingsByStars'
        column whose cells are lists
    :return: DataFrame with the five star columns followed by 'edition_id',
        with leading and trailing whitespace stripped from string values
    """
    columns = ['five_star', 'four_star', 'three_star', 'two_star', 'one_star', 'edition_id']
    records = [
        (*stars, edition_id)
        for stars, edition_id in zip(df['ratingsByStars'], df['edition_id'])
        if len(stars) == 5
    ]
    # Passing the column names explicitly keeps the schema even with no records
    star_rating_df = pd.DataFrame(records, columns=columns)
    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in columns:
        star_rating_df[col] = star_rating_df[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return star_rating_df

star_rating_df = create_star_rating_df(df)

In [320]:
star_rating_df.head(10)

Unnamed: 0,five_star,four_star,three_star,two_star,one_star,edition_id
0,2363896,1333153,573280,149952,80794,2
1,1617567,816659,373311,113934,76770,3
2,254964,167572,74362,15423,5419,7
3,1784471,1022406,512574,150365,80898,10
4,725771,420864,199846,58326,31518,11
5,645308,667657,399278,142103,79100,14
6,382985,350778,176188,41869,14376,16
7,332692,202012,100932,29673,17484,19
8,599617,392068,188162,50631,19677,26
9,1345866,1266424,798955,252115,112144,27


In [321]:
# # Duplicated star ratings:
# duplicated_star_ratings = star_rating_df[star_rating_df.duplicated(keep=False)]
# duplicated_star_ratings


Creating award df:

In [322]:
def create_award_df(df):
    """
    Build a long-format table with one row per (award, edition) pair.

    Editions with an empty award list contribute no rows. When no edition has
    any award, an empty DataFrame with the two columns is returned.

    :param df: DataFrame with an 'edition_id' column and an 'awards' column
        whose cells are lists of award strings
    :return: DataFrame with columns 'award' and 'edition_id'
    """
    records = [
        (award, edition_id)
        for awards, edition_id in zip(df['awards'], df['edition_id'])
        for award in awards
    ]
    # Passing the column names explicitly keeps the schema even with no records
    return pd.DataFrame(records, columns=['award', 'edition_id'])

award_df = create_award_df(df)

Splitting the year out of the award field into its own column, removing it from the award field, and stripping whitespace:

In [323]:
def split_year(award_df):
    """
    Move a four-digit year in parentheses from 'award' into a 'year' column.

    The first "(YYYY)" found in each award is stored in 'year'; every "(YYYY)"
    is removed from the award text, and string cells are stripped of leading
    and trailing whitespace. The input DataFrame is not modified.

    Running the function again on its own output keeps the years found
    earlier: where the award text no longer contains a year, an existing
    'year' value is preserved.

    :param award_df: DataFrame with a string column 'award'
    :return: New DataFrame with the cleaned 'award' column and a 'year' column
    """
    year_pattern = r'\((\d{4})\)'
    result = award_df.copy()

    year = result['award'].str.extract(year_pattern, expand=False)
    if 'year' in result.columns:
        # Re-run safety: the year text was already removed from 'award'
        year = year.fillna(result['year'])
    result['year'] = year
    result['award'] = result['award'].str.replace(year_pattern, '', regex=True)

    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in result.columns:
        result[col] = result[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return result

award_df = split_year(award_df)


In [324]:
award_df.head(10)

Unnamed: 0,award,edition_id,year
0,Pulitzer Prize for Fiction,2,1961
1,Audie Award for Classic,2,2007
2,National Book Award Finalist for Fiction,2,1961
3,Alabama Author Award for Fiction,2,1961
4,Georgia Peach Book Award,10,2013
5,Buxtehuder Bulle,10,2012
6,Odyssey Award,10,2013
7,Audie Award for Teens,10,2013
8,West Australian Young Readers' Book Award (WAY...,10,2013
9,Pennsylvania Young Readers' Choice Award for Y...,10,2013


Creating creator df:

In [325]:
def create_creator_df(df):
    """
    Build a long-format table with one row per (creator, edition) pair.

    The 'author' string is split on ", ". A comma inside parentheses, such as
    in "(Editor, Translator)", is not treated as a separator, so a role list
    stays attached to its creator. Rows whose 'author' is not a string (for
    example missing values) contribute no rows. When nothing is produced, an
    empty DataFrame with the two columns is returned.

    :param df: DataFrame with an 'edition_id' column and an 'author' column
    :return: DataFrame with columns 'creator' and 'edition_id'
    """
    # ", " not followed by a ")" that closes a still-open "("
    separator = re.compile(r', (?![^()]*\))')
    records = [
        (creator, edition_id)
        for author, edition_id in zip(df['author'], df['edition_id'])
        if isinstance(author, str)
        for creator in separator.split(author)
    ]
    # Passing the column names explicitly keeps the schema even with no records
    return pd.DataFrame(records, columns=['creator', 'edition_id'])

creator_df = create_creator_df(df)

Splitting out role information into new column and stripping whitespace:

In [326]:
def split_role(creator_df):
    """
    Move the parenthesised role from 'creator' into a 'role' column.

    The text inside the first pair of parentheses is stored in 'role'; every
    parenthesised part is removed from the creator name, and string cells are
    stripped of leading and trailing whitespace. The input DataFrame is not
    modified.

    Running the function again on its own output keeps the roles found
    earlier: where the creator text no longer contains a role, an existing
    'role' value is preserved.

    :param creator_df: DataFrame with a string column 'creator'
    :return: New DataFrame with the cleaned 'creator' column and a 'role' column
    """
    role_pattern = r'\((.*?)\)'
    result = creator_df.copy()

    role = result['creator'].str.extract(role_pattern, expand=False)
    if 'role' in result.columns:
        # Re-run safety: the role text was already removed from 'creator'
        role = role.fillna(result['role'])
    result['role'] = role
    result['creator'] = result['creator'].str.replace(role_pattern, '', regex=True)

    # Series.map per column instead of the deprecated DataFrame.applymap
    for col in result.columns:
        result[col] = result[col].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
    return result

creator_df = split_role(creator_df)

In [329]:
creator_df.head(20)

Unnamed: 0,creator,edition_id,role
0,Harper Lee,2,
1,Jane Austen,3,
2,Anna Quindlen,3,Introduction
3,C.S. Lewis,7,
4,Pauline Baynes,7,Illustrator
5,John Green,10,Goodreads Author
6,Douglas Adams,11,
7,Dan Brown,14,Goodreads Author
8,Oscar Wilde,16,
9,Jeffrey Eugenides,16,Introduction


In [None]:
%%sql