# Preprocessing

In [6]:
import pandas as pd
import ast
import os
import pdb
import re
from pathlib import Path


from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

In [7]:
#project_root = os.path.abspath(os.path.join(os.path.dirname(__file__),'../..'))
project_root = Path().resolve().parent.parent

In [26]:
def load_and_preprocess():

    # Load and preprocess the dataset
    file_path = os.path.join(project_root,'booklore','raw_data','goodreads.csv')
    goodreads_df = pd.read_csv(file_path)

    # Drop rows with missing essential data and parse genres
    goodreads_df = goodreads_df.dropna(subset=['genres', 'rating','edition','publisher','numRatings'])
    goodreads_df['genres'] = goodreads_df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

    # Setting up a pre filtered_df and defining the book_id for each entry
    filtered_df = goodreads_df[(goodreads_df['numRatings'] >= 1000) & (goodreads_df['rating'] >= 3.5)].reset_index(drop=True)
    filtered_df['bookId'] = filtered_df['bookId'].apply(lambda x: str(re.match(r'^\d+', x).group()) if isinstance(x, str) else None)

    # Ohe
    mlb = MultiLabelBinarizer()
    genre_features = pd.DataFrame(mlb.fit_transform(filtered_df['genres']),columns=mlb.classes_)
    publisher_features = pd.DataFrame(mlb.fit_transform(filtered_df['publisher']),columns=mlb.classes_)
    edition_features = pd.DataFrame(mlb.fit_transform(filtered_df['edition']),columns=mlb.classes_)

    categ_features = pd.concat([genre_features,publisher_features,edition_features]).reset_index(drop=True)

    # MinMax Scaler
    scaler = MinMaxScaler()
    rating_features = scaler.fit_transform(filtered_df[['rating']])
    rating_features_df = pd.DataFrame(rating_features, columns=['scaled_rating']).reset_index(drop=True)

    # Combining them alltogether
    book_features = pd.concat([
        filtered_df[['bookId']].reset_index(drop=True),
        categ_features,
        rating_features_df
    ],
                            axis=1)

    return filtered_df, book_features


In [27]:
load_and_preprocess()

(       bookId                                              title  \
 0     2767052                                   The Hunger Games   
 1           2          Harry Potter and the Order of the Phoenix   
 2        1885                                Pride and Prejudice   
 3       19063                                     The Book Thief   
 4       11127                           The Chronicles of Narnia   
 ...       ...                                                ...   
 2937   357298  Matter and Consciousness: A Contemporary Intro...   
 2938  1124895                                   Isle of the Dead   
 2939   149709  The Madwoman in the Attic: The Woman Writer an...   
 2940    10264                             Jews, God, and History   
 2941   137491                         The Toothpaste Millionaire   
 
                                                  series  \
 0                                   The Hunger Games #1   
 1                                       Harry Pot

In [None]:
pd.DataFrame(load_and_preprocess()[1]).head(5).columns

Index(['bookId', '12th Century', '13th Century', '14th Century',
       '15th Century', '16th Century', '17th Century', '18th Century',
       '19th Century', '20th Century',
       ...
       'ミ', 'ャ', 'ン', 'ー', '修', '庫', '改', '文', '装', 'scaled_rating'],
      dtype='object', length=878)

In [16]:
file_path = os.path.join(project_root,'booklore','raw_data','goodreads.csv')
goodreads_df = pd.read_csv(file_path)

In [17]:
goodreads_df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')