Remember to get right in this project:
- Folder structure
- Docstrings
- PEP8
- Exception handling

In [None]:
# !python3 -m venv venv

In [None]:
# !pip install numpy
# !pip install pandas
#!pip install matplotlib
# !pip install sqlalchemy
# !pip install ipython-sql
# !pip install python-dotenv
# !pip install psycopg2



In [None]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv


load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

%load_ext sql
%sql postgresql://postgres:password@localhost/books

In [None]:
import regex as re
from pandas.tseries.offsets import *
from datetime import date
import ast
import numpy as np

In [None]:
import pandas as pd
df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv')

In [None]:
# df[df['description'].str[:4]== 'ISBN']

This is a function to create summary statistics for *string fields* to show before and after a transform:

Dropping duplicates:

In [None]:
print(df.shape)
print(df[df.duplicated()].shape)

In [None]:
df = df.drop_duplicates()


In [None]:
df.info()

In [None]:
df.describe(include='all')

Get rows where ISBN is duplicated and is not '999999999999' placeholder

In [None]:
# duplicated_isbns = df[(df['isbn']!= '9999999999999') & (df.duplicated(subset='isbn', keep=False))]
# duplicated_isbns

Dropping rows with all nulls:

In [None]:
df = df.dropna(how='all')

Flagging rows with duplicate isbns that are not 9999999999999 placeholder values:

In [None]:
def flag_duplicate_isbns(df):
    """
    Flag duplicate ISBNs in the DataFrame, excluding None.

    Adds a new column 'is_duplicate_isbn' to the DataFrame. The column is set
    to True for rows with duplicate ISBNs (excluding None) and False
    otherwise.

    :param df: DataFrame with a column named 'isbn'
    :return: DataFrame with 'is_duplicate_isbn' column added
    :raises ValueError: If 'isbn' column is missing in the DataFrame
    """
    try:
        if 'isbn' not in df.columns:
            raise ValueError("'isbn' column is missing in the DataFrame.")

        df['is_duplicate_isbn'] = False
        mask = (df['isbn'] != None) & df.duplicated(subset='isbn', keep=False)
        df.loc[mask, 'is_duplicate_isbn'] = True
        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return df

# Usage example:
try:
    df = flag_duplicate_isbns(df)
except ValueError as e:
    print(e)

In [None]:
df.columns

In [None]:
def rm_duplicates(df):
    """
    Process a DataFrame by preserving rows with ISBN None 
    and removing duplicates in ISBN for other rows.

    :param df: Input DataFrame containing a column 'isbn'
    :return: Processed DataFrame with specified rows and no duplicates in ISBN
    :raises ValueError: If 'isbn' column is missing
    """
    try:
        if 'isbn' not in df.columns:
            raise ValueError("'isbn' column is missing in the DataFrame.")
        result_df = df[(df['isbn'] == None) | ~df.duplicated(subset='isbn', keep=False)]
        return result_df

    except Exception as e:
        print(f"No changes made. An error occurred: {e}.")
        return df

#Usage:
df = rm_duplicates(df)

Removing all new lines (\n) and leading and trailing whitespace from all rows

In [None]:
df = df.applymap(lambda r: r.strip() if type(r) == str else r)
df = df.applymap(lambda r: r.replace('\n', ' ') if type(r) == str else r)

This will get all the rows in ISBN that start with a letter:

In [None]:
# df[df['isbn'].str[0].str.isalpha()]


In [None]:
# df[df['description'].str[:4]== 'ISBN']

In [None]:
df.head(20)

Filling ISBN field using ISBN from description field, if the ISBN field is 9999999999 or null:

Eventually rewrite this with error handling that will catch if wrong data type (non-str) is passed to the function

In [None]:
def extract_isbn(df):
    """
    Extract and move ISBN from the description to the ISBN field if the ISBN is '9999999999999' or null.

    The function applies a regex pattern to identify ISBNs from the description field and 
    moves them to the ISBN field.

    :param df: A DataFrame containing 'isbn' and 'description' columns
    :return: The modified DataFrame
    :raises ValueError: If 'isbn' or 'description' columns are missing
    """

    if 'isbn' not in df.columns:
        raise ValueError("'isbn' column is missing in the DataFrame.")
    if 'description' not in df.columns:
        raise ValueError("'description' column is missing in the DataFrame.")

    # First regex pattern matches 13 digits optionally preceded and followed by a non-digit character
    # Second regex pattern matches 10 digits optionally preceded and followed by a non-digit character
    # Third regex pattern matches "B" and a specific pattern of 4 digits, 3 uppercase letters, 1 digit, and 1 uppercase letter: B1234XYZ7A
    # Fourth regex matches the pattern "978-", followed by one digit, and then a sequence of either digits or dashes that is at least 9 and at most 13 characters long: 978-3-16-148410-0

    isbn_pattern = (r'((?:\D)?(\d{13})(?:\D)?|(?:\D)?(\d{10})(?:\D)?|'
                    r'B\d{4}[A-Z]{3}\d{1}[A-Z]|978-\d[-\d]{9,13})')

    mask = (df['isbn'] == '9999999999999') | pd.isnull(df['isbn'])
    descriptions = df.loc[mask, 'description'].astype(str)

    extracted_isbns = descriptions.str.extract(isbn_pattern)[0].str.replace('-', '')  # Extract ISBNs and remove dashes
    mask_first_char = ~extracted_isbns.str[0].str.isdigit()
    extracted_isbns = extracted_isbns.str[1:].where(mask_first_char, extracted_isbns)

    mask_last_char = ~extracted_isbns.str[-1].str.isdigit()  # Remove non-digit last character if exists
    extracted_isbns = extracted_isbns.str[:-1].where(mask_last_char, extracted_isbns)

    df.loc[mask, 'isbn'] = extracted_isbns

    return df

df = extract_isbn(df)

In [None]:
filtered_df = df[(df['isbn'] != '9999999999999') & ~df['isbn'].isnull()]
filtered_df.shape

Splitting out the int after the title in series into a separate series_num column: Adds to new series_num column, removes number from series column
We're not making series_num an int, bc some cases have a range of nums, e.g. 1-3

In [None]:
# adding the series_num column: 
df['series_num'] = None

In [None]:
def extract_series_num(df):
    """
    Extract the series number from the 'series' column in a DataFrame.

    The function looks for the pattern following a '#' character in the 'series' column,
    extracts the series number into a new column named 'series_num', and then removes
    the matched pattern from the 'series' column, along with any leading and trailing
    whitespace.

    Parameters:
    df (pd.DataFrame): DataFrame containing the 'series' column to extract from.

    Returns:
    pd.DataFrame: DataFrame with the added 'series_num' column and modified 'series' column.

    Raises:
    ValueError: If 'series' column is not found in the DataFrame or an error occurs while extracting the series number.
    """

    if 'series' not in df.columns:
        raise ValueError("'series' column missing from the DataFrame.")

    # This regex matches the character that comes after the '#' in the series column
    series_int_pattern = r'(#.*)'
    try:
        serieses = df['series'].astype(str)
        extracted_series_num = serieses.str.extract(series_int_pattern)[0]
        extracted_series_num = extracted_series_num.str.replace('#', '')
        df['series_num'] = extracted_series_num
        df['series'] = df['series'].str.replace(series_int_pattern, '', regex=True).str.strip()
    except Exception as e:
        raise ValueError("An error occurred while extracting series number.") from e

    return df

df = extract_series_num(df)


In [None]:
df[['series', 'series_num']].head(20)

In [None]:
print(df.bookId.shape)
df.bookId.unique().shape

In [None]:
# df['bookFormat'].value_counts()

Validation for dates - checking for cases where publish date is earlier than first publish date. 

First you need to figure out what the actual format is. Then remove all non-dates. Then set the data type. Then check for cases where publish date is earlier than first publish date.

In [None]:

df[['publishDate', 'firstPublishDate']] = df[['publishDate', 'firstPublishDate']].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y', errors='coerce')).apply(lambda x: x.dt.normalize())

In [None]:
def fix_dates(df):
    """
    Correct the 'firstPublishDate' and 'publishDate' in the DataFrame by:
    - Subtracting 100 years from 'firstPublishDate' if greater than today's date.
    - Subtracting 100 years from 'publishDate' if greater than today's date.
    - Subtracting 100 years from 'firstPublishDate' if greater than 'publishDate'.

    :param df: DataFrame containing the 'firstPublishDate' and 'publishDate' columns.
    :type df: pd.DataFrame
    :return: None
    :raises ValueError: If 'firstPublishDate' or 'publishDate' columns are not in DataFrame.
    """
    if 'firstPublishDate' not in df.columns or 'publishDate' not in df.columns:
        raise ValueError("'firstPublishDate' or 'publishDate' columns missing from the DataFrame.")
        
    # Identify rows where firstPublishDate is greater than today and subtract 100 years
    mask_first_publish_date = df['firstPublishDate'] > pd.Timestamp.today()
    df.loc[mask_first_publish_date, 'firstPublishDate'] -= DateOffset(years=100)

    # Identify rows where publishDate is greater than today and subtract 100 years
    mask_publish_date = df['publishDate'] > pd.Timestamp.today()
    df.loc[mask_publish_date, 'publishDate'] -= DateOffset(years=100)

    # Identify rows where firstPublishDate is greater than publishDate and subtract 100 years
    mask_first_publish_vs_publish = df['firstPublishDate'] > df['publishDate']
    df.loc[mask_first_publish_vs_publish, 'firstPublishDate'] -= DateOffset(years=100)

fix_dates(df)


In [None]:
df[['publishDate', 'firstPublishDate']].head(10)

In [None]:
# df['publishDate'].dtypes

In [None]:
# df.shape

In [None]:
# rows_with_nulls = df[df.isnull().sum(axis=1) >= 13]
# print(rows_with_nulls)

Strip whitespace one more time:

In [None]:

df = df.applymap(lambda r: r.strip() if type(r) == str else r)

Adding new edition_id column to df, bc we will drop bookId column:

In [None]:
df['edition_id'] = df.index

In [None]:
df.columns

In [None]:
df[df['title'] == 'Animal Farm']

In [None]:
duplicate_rows = df[df.duplicated('title', keep=False)]
duplicate_rows.shape

Creating edition df, and changing column names:

In [None]:
name_map = {
    'numRatings': 'num_ratings', 
    'likedPercent': 'liked_percent', 
    'bbeScore': 'bbe_score',
    'bbeVotes': 'bbe_votes',
    'bookFormat': 'format',
    'publishDate': 'publish_date',
    'coverImg': 'cover_url',
    'characters': 'char_name'
    }

In [None]:
edition_df = df[['isbn', 'rating', 'numRatings', 'likedPercent', 'bbeScore', 'bbeVotes', 'bookFormat', 'edition', 'publishDate', 'description', 'language', 'pages', 'publisher', 'coverImg', 'price', 'edition_id', 'is_duplicate_isbn']].copy()
edition_df.rename(columns=name_map, inplace=True)

In [None]:
edition_df.head()

Changing all fields in df that contain a string that looks like a list literals into actual lists:

In [None]:
def apply_literal_eval(df, cols):
    """
    Apply literal evaluation to specified columns of a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to apply the transformation to.
    cols (list): A list of columns on which to apply ast.literal_eval.

    Returns:
    pd.DataFrame: DataFrame with the transformed columns.

    Raises:
    ValueError: If any column does not exist in the DataFrame or an error occurs while applying ast.literal_eval.
    """

    for col in cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame.")

        try:
            df[col] = df[col].apply(ast.literal_eval)
        except Exception as e:
            raise ValueError(f"An error occurred while applying ast.literal_eval to column '{col}'.") from e

    return df

# Usage:
cols = ['genres', 'characters', 'awards', 'ratingsByStars', 'setting']
df = apply_literal_eval(df, cols)

Getting only integers in pages col:

In [None]:
def rm_text_from_pages(edition_df):
    """
    Removes non-digit characters from the 'pages' column of the given DataFrame.

    Parameters:
    edition_df (pandas.DataFrame): DataFrame containing the 'pages' column.

    Returns:
    pandas.DataFrame: DataFrame with non-digit characters removed from the 'pages' column.

    Raises:
    TypeError: If edition_df is not a pandas DataFrame or does not contain the 'pages' column.
    """

    if not isinstance(edition_df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if 'pages' not in edition_df.columns:
        raise TypeError("'pages' column must be present in the DataFrame")

    # Use regular expression to replace non-digit characters with NaN, and then fill NaN with a default value
    edition_df['pages'] = edition_df['pages'].astype(str).replace(r'\D', '', regex=True).replace('', np.nan).fillna(0).astype('int')
    return edition_df


# Usage
try:
    edition_df = rm_text_from_pages(edition_df)
except TypeError as e:
    print(f"An error occurred: {e}")




Formatting price column:

In [None]:
def reformat_price(edition_df):
    """
    Removes all occurrences of the period (.) in the 'price' column except for the last one.

    Parameters:
    edition_df (pandas.DataFrame): DataFrame containing the 'price' column.

    Returns:
    pandas.DataFrame: DataFrame with all but the last occurrence of the period removed from the 'price' column.

    Raises:
    TypeError: If edition_df is not a pandas DataFrame or does not contain the 'price' column.
    """

    if 'price' not in edition_df.columns:
        raise TypeError("'price' column must be present in the DataFrame")
 
    # Define a function to remove all but the last occurrence of the period
    def remove_all_except_last(price):
        if pd.isnull(price):
            return price
        parts = str(price).split('.')
        new_price = f"{''.join(parts[:-1])}.{parts[-1]}" if len(parts) > 1 else price
        return new_price

    # Apply the transformation to the 'price' column
    edition_df['price'] = edition_df['price'].apply(remove_all_except_last)

    return edition_df

# Usage:
edition_df = reformat_price(edition_df)


In [None]:
edition_df['price']

In [None]:
df["genres"].head()

Creating genre df, stripping whitespace, then filling columns

In [None]:
def create_genre_df(df):
    """
    Create a DataFrame by transforming the 'genres' column of the given DataFrame.
    The 'genres' column is assumed to contain lists of genres, and this function
    separates each genre into its own row.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'genres' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'genre', where each genre is in its own row.

    Raises:
    ValueError: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'edition_id' not in df.columns or 'genres' not in df.columns:
        raise ValueError("'edition_id' and 'genres' columns must be present in the DataFrame.")

    try:
        genre_df = df[['edition_id', 'genres']].copy()
        genre_df['genres'] = genre_df['genres'].apply(lambda genres: [genre.strip() for genre in genres])
        genre_df = genre_df.explode('genres')
        genre_df.rename(columns={'genres': 'genre'}, inplace=True)
    except Exception as e:
        raise ValueError("An error occurred while creating the genre DataFrame.") from e

    return genre_df

# Usage
genre_df = create_genre_df(df)

In [None]:
genre_df.head(20)

Creating character df and stripping whitespace:

In [None]:
def create_char_df(df):
    """
    Create a DataFrame by transforming the 'characters' column of the given DataFrame.
    The 'characters' column is assumed to contain lists of character names, and this function
    separates each character into its own row.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'characters' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'char_name', where each character name is in its own row.

    Raises:
    ValueError: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'edition_id' not in df.columns or 'characters' not in df.columns:
        raise ValueError("'edition_id' and 'characters' columns must be present in the DataFrame.")

    try:
        char_df = df[['edition_id', 'characters']].copy()
        char_df['characters'] = char_df['characters'].apply(lambda characters: [character.strip() for character in characters])
        char_df = char_df.explode('characters')
        char_df.rename(columns={'characters': 'char_name'}, inplace=True)
    except Exception as e:
        raise ValueError("An error occurred while creating the character DataFrame.") from e

    return char_df

# Example usage:
char_df = create_char_df(df)


In [None]:
char_df.head(20)

Creating setting df and stripping whitespace:

In [None]:
def create_setting_df(df):
    """
    Create a DataFrame by transforming the 'setting' column of the given DataFrame.
    The 'setting' column is assumed to contain lists of settings, and this function
    separates each setting into its own row.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'setting' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'setting', where each setting is in its own row.

    Raises:
    ValueError: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'edition_id' not in df.columns or 'setting' not in df.columns:
        raise ValueError("'edition_id' and 'setting' columns must be present in the DataFrame.")

    try:
        setting_df = df[['edition_id', 'setting']].copy()
        setting_df['setting'] = setting_df['setting'].apply(lambda settings: [setting.strip() for setting in settings])
        setting_df = setting_df.explode('setting')
    except Exception as e:
        raise ValueError("An error occurred while creating the setting DataFrame.") from e

    return setting_df

# Example usage:
setting_df = create_setting_df(df)

In [None]:
setting_df.head(20)

Creating star rating df and stripping whitespace: 
In this df, I have sliced off data where we are missing 1 or more fields from ratingsByStars, because I see no way to know which star rating the missing data belongs to.

In [None]:
def create_star_rating_df(df):
    """
    Create a DataFrame by transforming the 'ratingsByStars' column of the given DataFrame.
    The 'ratingsByStars' column is assumed to contain lists of star ratings, and this function
    extracts these ratings into individual columns.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'ratingsByStars' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id', 'five_star', 'four_star', 'three_star', 'two_star', and 'one_star'.

    Raises:
    ValueError: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'edition_id' not in df.columns or 'ratingsByStars' not in df.columns:
        raise ValueError("'edition_id' and 'ratingsByStars' columns must be present in the DataFrame.")

    try:
        # Ensure the 'ratingsByStars' column is a list of lists
        if df['ratingsByStars'].apply(type).eq(str).all():
            df['ratingsByStars'] = df['ratingsByStars'].apply(eval)

        # Create a mask where 'ratingsByStars' length is 5
        mask = df['ratingsByStars'].apply(len) == 5

        # Extract the 'ratingsByStars' values where the mask is True
        ratings_data = pd.DataFrame(df.loc[mask, 'ratingsByStars'].tolist(),
                                    columns=['five_star', 'four_star', 'three_star', 'two_star', 'one_star'])

        # Include the 'edition_id' column
        star_rating_df = ratings_data.assign(edition_id=df.loc[mask, 'edition_id'].values).reset_index(drop=True)

        # Optionally, strip any white spaces from string columns
        star_rating_df = star_rating_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
    except Exception as e:
        raise ValueError("An error occurred while creating the star rating DataFrame.") from e

    return star_rating_df

# Example usage:
star_rating_df = create_star_rating_df(df)


In [None]:
star_rating_df.head(10) 

In [None]:
# # Duplicated star ratings:
# duplicated_star_ratings = star_rating_df[star_rating_df.duplicated(keep=False)]
# duplicated_star_ratings


Creating award df:

In [None]:
def create_award_df(df):
    """
    Create a DataFrame by transforming the 'awards' column of the given DataFrame.
    The 'awards' column is assumed to contain lists of awards, and this function
    explodes these lists into individual rows.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'awards' columns.

    Returns:
    pd.DataFrame: A new DataFrame with columns 'edition_id' and 'award'.

    Raises:
    ValueError: If the required columns are not found in the DataFrame or an error occurs while processing.
    """

    if 'edition_id' not in df.columns or 'awards' not in df.columns:
        raise ValueError("'edition_id' and 'awards' columns must be present in the DataFrame.")

    try:
        award_df = df[['edition_id', 'awards']].copy()
        award_df = award_df.explode('awards')
        award_df = award_df.rename(columns={'awards': 'award'})
    except Exception as e:
        raise ValueError("An error occurred while creating the award DataFrame.") from e

    return award_df

# Example usage:
award_df = create_award_df(df)


In [None]:
award_df.head()

Splitting out year from award field and remove from award field and stripping whitespace::

In [None]:
def split_year(award_df):
    """
    Splits the 'award' column of the given DataFrame into 'award' and 'year' columns.
    The 'year' is extracted from the 'award' string using a regular expression pattern
    for years enclosed in parentheses.

    Parameters:
    award_df (pd.DataFrame): The DataFrame containing the 'award' column.

    Returns:
    pd.DataFrame: A new DataFrame with 'award' and 'year' columns.

    Raises:
    ValueError: If the required column is not found in the DataFrame or an error occurs while processing.
    """

    if 'award' not in award_df.columns:
        raise ValueError("'award' column must be present in the DataFrame.")

    try:
        year_pattern = r'\((\d{4})\)'
        award_df['award_year'] = award_df['award'].str.extract(year_pattern)
        award_df['award'] = award_df['award'].str.replace(year_pattern, '', regex=True)
        award_df = award_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
    except Exception as e:
        raise ValueError("An error occurred while splitting the year from the award column.") from e

    return award_df

# Example usage:
award_df = split_year(award_df)



In [None]:
award_df.head(10)

Creating creator df:

In [None]:
def create_creator_df(df):
    """
    Creates a DataFrame containing the 'edition_id' and 'creator' columns by splitting
    and exploding the 'author' column from the given DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'edition_id' and 'author' columns.

    Returns:
    pd.DataFrame: A new DataFrame with 'edition_id' and 'creator' columns.

    Raises:
    ValueError: If the required columns are not found in the DataFrame.
    """

    if 'edition_id' not in df.columns or 'author' not in df.columns:
        raise ValueError("'edition_id' and 'author' columns must be present in the DataFrame.")

    creator_df = df[['edition_id', 'author']].copy()
    creator_df['author'] = creator_df['author'].apply(lambda author: author.split(', ') if isinstance(author, str) else author)
    creator_df = creator_df.explode('author')
    creator_df.rename(columns={'author': 'creator'}, inplace=True)
    return creator_df

# Example usage:
creator_df = create_creator_df(df)



In [None]:
creator_df.head(20)

Splitting out role information into new column and stripping whitespace:

In [None]:
def split_role(creator_df):
    """
    Splits the 'creator' column into 'creator' and 'role' by extracting the role within parentheses.

    Parameters:
    creator_df (pd.DataFrame): The DataFrame containing the 'creator' column.

    Returns:
    pd.DataFrame: A new DataFrame with 'creator' and 'role' columns.

    Raises:
    ValueError: If the 'creator' column is not found in the DataFrame.
    """

    if 'creator' not in creator_df.columns:
        raise ValueError("'creator' column must be present in the DataFrame.")

    role_pattern = r'\((.*?)\)'
    creator_df['role'] = creator_df['creator'].str.extract(role_pattern)
    creator_df['creator'] = creator_df['creator'].str.replace(role_pattern, '', regex=True)
    creator_df = creator_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
    creator_df.rename(columns={'creator':'creator_name'}, inplace=True)

    return creator_df

# Example usage:
creator_df = split_role(creator_df)


In [None]:
creator_df.head(20)

In [None]:
df.columns

(If I were doing this again, I'd go back and create this name map at the start and immediately change the names. But I've used these variables in too many other places.) 

Creating book_df:

In [None]:
def create_book_df(df, name_map):
    """
    Creates a DataFrame with selected columns and renames them according to a given mapping.

    Parameters:
    df (pd.DataFrame): The original DataFrame containing book information.
    name_map (dict): A dictionary that maps the original column names to new names.

    Returns:
    pd.DataFrame: A new DataFrame with selected and renamed columns.

    Raises:
    KeyError: If any of the specified columns are not found in the DataFrame.
    """

    # Columns to be selected
    selected_columns = ['edition_id', 'title', 'firstPublishDate', 'series', 'series_num']

    # Check if all selected columns exist in the DataFrame
    if not all(col in df.columns for col in selected_columns):
        raise KeyError("Some of the selected columns were not found in the DataFrame.")

    book_df = df[selected_columns].copy()

    # Check if all keys in the name_map are in the selected columns
    if not all(key in selected_columns for key in name_map.keys()):
        raise KeyError("Some keys in the name_map were not found in the selected columns.")

    book_df.rename(columns=name_map, inplace=True)

    return book_df

# Example usage:
name_map = {'firstPublishDate': 'first_publish_date'}
book_df = create_book_df(df, name_map)


In [None]:
book_df.head()


In [None]:
## Creating tables

Creating edition table with dtypes

In [None]:
filtered_edition_df = genre_df[genre_df['genre'].astype(str).apply(len) > 25]
filtered_edition_df

In [None]:
for col in setting_df.columns:
    print(f'{col}: {setting_df[col].astype(str).apply(len).max()}, dtype: {setting_df[col].dtype}')


In [None]:
%%sql

DROP TABLE IF EXISTS edition CASCADE;
CREATE TABLE edition (
    edition_id serial PRIMARY KEY,
    isbn varchar(13),
    rating real,
    num_ratings bigint,
    liked_percent real,
    bbe_score bigint,
    bbe_votes bigint,
    format varchar(64),
    edition varchar(400),
    publish_date date,
    language varchar(64),
    pages smallint,
    publisher varchar(250),
    cover_url varchar(250),
    price real,
    is_duplicate_isbn boolean,
    description varchar(30000)
);

Creating genre table with dtypes

In [None]:
%%sql

DROP TABLE IF EXISTS genre;
CREATE TABLE genre (
    edition_id bigint NOT NULL, 
    FOREIGN KEY (edition_id) REFERENCES edition(edition_id),
    genre varchar(40)
);


In [None]:
for col in char_df.columns:
    print(f'{col}: {char_df[col].astype(str).apply(len).max()}, dtype: {char_df[col].dtype}')


Creating character table & datatypes

In [None]:
%%sql

DROP TABLE IF EXISTS character;
CREATE TABLE character (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition (edition_id),
    char_name varchar(255)
    );

Creating setting table:

In [None]:
%%sql

DROP TABLE IF EXISTS setting;
CREATE table setting (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition(edition_id),
    setting varchar(100)
);


Creating star_rating table:

In [None]:
%%sql

DROP TABLE IF EXISTS star_rating;
CREATE TABLE star_rating (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition(edition_id),
    five_star bigint,
    four_star bigint,
    three_star bigint,
    two_star bigint,
    one_star bigint
    );
    

Creating award table: 

In [None]:
%%sql

DROP TABLE IF EXISTS award;
CREATE TABLE award (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition(edition_id),
    award varchar(400),
    award_year smallint
);

In [None]:
%%sql

DROP TABLE IF EXISTS creator;
CREATE TABLE creator (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition (edition_id),
    creator_name varchar(128),
    role varchar(64)
);
    

Creating book table:

In [None]:
max_len = book_df['series_num'].astype(str).apply(len).max()
max_len

In [None]:
%%sql

DROP TABLE IF EXISTS title;
CREATE TABLE title (
    edition_id bigint NOT NULL,
    FOREIGN KEY (edition_id) REFERENCES edition (edition_id),
    title varchar(400),
    first_publish_date date,
    series varchar(128),
    series_num varchar(64)
);


Loading data from dfs into tables:

In [None]:
# Create sqlalchemy engine
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost/books")

In [None]:
edition_df.to_sql('edition', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * from edition limit 5;

In [None]:
genre_df.to_sql('genre', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * from genre limit 25;

In [None]:
char_df.to_sql('character', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * from character limit 25;

In [None]:
setting_df.to_sql('setting', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * FROM setting limit 25;

In [None]:
star_rating_df.to_sql('star_rating', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * FROM star_rating LIMIT 5;

In [None]:
award_df.to_sql('award', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * FROM award LIMIT 25;

In [None]:
creator_df.to_sql('creator', engine, if_exists='append', index=False)

In [None]:
%%sql

SELECT * FROM creator LIMIT 5;

In [None]:
book_df.to_sql('title', engine, if_exists='append', index=False)

In [None]:
%%sql
SELECT * FROM title LIMIT 5;

In [None]:
%%sql

CREATE USER book_reader WITH PASSWORD 'read_only';
GRANT CONNECT ON DATABASE books TO book_reader;
GRANT USAGE ON SCHEMA public TO book_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO book_reader;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO book_reader;
SELECT * FROM pg_roles;

Verifying that rating is in fact an averate of the ratings of the reviews. I know this is not universally true, bc there are cases where there is a rating and there are no reviews. But that is the exception, I think.

In [None]:
%%sql
WITH avg_rating AS (
    SELECT edition_id, ((5 * five_star) + (4 * four_star) + (3 * three_star) + (2 * two_star) + one_star) / (five_star + four_star + three_star + two_star + one_star)::FLOAT AS avg_rating
    FROM star_rating
)
SELECT a.avg_rating AS avg_rating, e.rating
FROM avg_rating a
JOIN edition e ON e.edition_id = a.edition_id
LIMIT 10;


In [None]:
%%sql
SELECT rating, num_ratings
FROM edition
WHERE num_ratings IS NULL or num_ratings = 0;

As we can see here there are 1200 editions (when no limit is present) with a rating, with num_ratings but with no actual star_ratings.

In [699]:
%%sql
SELECT t.title, e.rating, e.num_ratings, sr.edition_id AS edition_with_no_star_rating
FROM edition e
JOIN title t
ON e.edition_id = t.edition_id
FULL OUTER JOIN star_rating sr
ON e.edition_id = sr.edition_id
WHERE sr.edition_id IS NULL
ORDER BY e.num_ratings DESC
LIMIT 50;

 * postgresql://postgres:***@localhost/books
50 rows affected.


title,rating,num_ratings,edition_with_no_star_rating
City of God,3.93,10275,
A Life Once Lived,4.75,4,
Roma 40 D.C. – Destino de Amor,4.25,4,
Nine Brutal Years,4.25,4,
Saving Kennedy,4.75,4,
Cambodia and the Year of UNTAC (Essential Essays Book 67),4.0,4,
Take That to the Bank,5.0,4,
The Trinket Box,3.5,4,
"Hunters of Satan's Monsters (Legend of the Rolling Calf, #1)",5.0,4,
Midnight Hour Collection,3.5,4,
