# Data Loading

In [1]:
#libraries load
import pandas as pd
import os
import gzip
import json
import numpy as np
import random


In [2]:
#current folder directory set up
DIR = os.path.join(os.getcwd(), 'Data')

In [3]:
#check the expected files using dataset_names.csv
file_path = os.path.join(DIR, 'dataset_names.csv')
file_names = pd.read_csv(file_path)
display(file_names)

Unnamed: 0,type,name
0,complete,goodreads_book_works.json.gz
1,complete,goodreads_book_authors.json.gz
2,complete,goodreads_book_series.json.gz
3,complete,goodreads_books.json.gz
4,complete,goodreads_book_genres_initial.json.gz
5,byGenre,goodreads_books_children.json.gz
6,byGenre,goodreads_books_comics_graphic.json.gz
7,byGenre,goodreads_books_fantasy_paranormal.json.gz
8,byGenre,goodreads_books_history_biography.json.gz
9,byGenre,goodreads_books_mystery_thriller_crime.json.gz


In [4]:
#check the available files in the folder (note that not all of them were needed)
available_files = os.listdir(DIR)
for name in file_names["name"]:
    if name in available_files:
        print(f"found: {name}")
    else:
        print(f"missing: {name}")

found: goodreads_book_works.json.gz
found: goodreads_book_authors.json.gz
found: goodreads_book_series.json.gz
found: goodreads_books.json.gz
found: goodreads_book_genres_initial.json.gz
found: goodreads_books_children.json.gz
found: goodreads_books_comics_graphic.json.gz
found: goodreads_books_fantasy_paranormal.json.gz
found: goodreads_books_history_biography.json.gz
found: goodreads_books_mystery_thriller_crime.json.gz
found: goodreads_books_poetry.json.gz
found: goodreads_books_romance.json.gz
found: goodreads_books_young_adult.json.gz
found: goodreads_interactions_children.json.gz
found: goodreads_interactions_comics_graphic.json.gz
found: goodreads_interactions_fantasy_paranormal.json.gz
found: goodreads_interactions_history_biography.json.gz
found: goodreads_interactions_mystery_thriller_crime.json.gz
found: goodreads_interactions_poetry.json.gz
found: goodreads_interactions_romance.json.gz
found: goodreads_interactions_young_adult.json.gz
found: goodreads_reviews_children.json.

In [None]:
#function to load the data and open the compressed files (base inspiration source: the goodreads dataset github)

def load_data(file_name, sample_size=100000, head=None):
    """
    Load line-delimited JSON records from a compressed .json.gz file.
    - If head is set => return the first N rows quickly.
    - Else => use reservoir sampling to get a random sample of sample_size rows.
    """
    data = []
    with gzip.open(file_name, 'rt', encoding='utf-8') as fin:
        if head is not None:
            # just take the first 'head' rows
            for idx, line in enumerate(fin):
                if idx >= head:
                    break
                data.append(json.loads(line))
            return pd.DataFrame(data)
        else:
            # reservoir sampling
            sample = []
            for idx, line in enumerate(fin, 1):
                record = json.loads(line)
                if len(sample) < sample_size:
                    sample.append(record)
                else:
                    j = random.randint(0, idx-1)
                    if j < sample_size:
                        sample[j] = record
            return pd.DataFrame(sample)

In [7]:
#sample record from books dataset (only for display and format familiarization)
books_sample = load_data(os.path.join(DIR, 'goodreads_books.json.gz'), head=1)
print('sample book record')
display(books_sample.sample(1))

sample book record


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9,,1984,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film


In [9]:
#main dataset (sample 100,000 books for efficiency)
books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'), sample_size=100000)
print("books loaded (sampled 100k)")

books loaded (sampled 100k)


In [10]:
#other metadata datasets (sample random 100k for efficiency and fairness => unique )
bookworks = load_data(os.path.join(DIR, 'goodreads_book_works.json.gz'), sample_size=100000)
print("book works loaded")

authors = load_data(os.path.join(DIR, 'goodreads_book_authors.json.gz'), sample_size=100000)
print("authors loaded")

series = load_data(os.path.join(DIR, 'goodreads_book_series.json.gz'), sample_size=100000)
print("series loaded")

reviews = load_data(os.path.join(DIR, 'goodreads_reviews_dedup.json.gz'), sample_size=100000)
print("reviews loaded")

fuzzy_genres = load_data(os.path.join(DIR, 'goodreads_book_genres_initial.json.gz'), sample_size=100000)
print("fuzzy genres loaded")

book works loaded
authors loaded
series loaded
reviews loaded
fuzzy genres loaded


In [11]:
#csv read (these are smaller so load fully)
user_interaction = pd.read_csv(os.path.join(DIR, 'goodreads_interactions.csv'))
book_id_map = pd.read_csv(os.path.join(DIR, 'book_id_map.csv'))
user_id_map = pd.read_csv(os.path.join(DIR, 'user_id_map.csv'))

In [12]:
print(user_id_map.columns)
print(book_id_map.columns)

Index(['user_id_csv', 'user_id'], dtype='object')
Index(['book_id_csv', 'book_id'], dtype='object')


In [13]:
print(user_interaction.columns)
print(book_id_map.columns)

Index(['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed'], dtype='object')
Index(['book_id_csv', 'book_id'], dtype='object')


In [14]:
# merge interactions with book_id_map
user_interaction = user_interaction.merge(
    book_id_map, left_on='book_id', right_on='book_id_csv', how='left'
)

# merge interactions with user_id_map
user_interaction = user_interaction.merge(
    user_id_map, left_on='user_id', right_on='user_id_csv', how='left'
)

# drop the redundant csv columns
user_interaction = user_interaction.drop(columns=['book_id_csv', 'user_id_csv'])

print("User interactions mapped with book and user ids")

User interactions mapped with book and user ids


In [15]:
#sample 100,000 interactions for efficiency
user_interaction_sample = user_interaction.sample(n=100000, random_state=42)
print("user interactions sampled")

user interactions sampled


In [17]:
#genres => load datasets that are genre specific (sample 100,000 each)
books_children = load_data(os.path.join(DIR, 'goodreads_books_children.json.gz'), sample_size=100000)
interactions_children = load_data(os.path.join(DIR, 'goodreads_interactions_children.json.gz'), sample_size=100000)
reviews_children = load_data(os.path.join(DIR, 'goodreads_reviews_children.json.gz'), sample_size=100000)
print("children loaded")

books_comics = load_data(os.path.join(DIR, 'goodreads_books_comics_graphic.json.gz'), sample_size=100000)
interactions_comics = load_data(os.path.join(DIR, 'goodreads_interactions_comics_graphic.json.gz'), sample_size=100000)
reviews_comics = load_data(os.path.join(DIR, 'goodreads_reviews_comics_graphic.json.gz'), sample_size=100000)
print("comics loaded")

books_fantasy = load_data(os.path.join(DIR, 'goodreads_books_fantasy_paranormal.json.gz'), sample_size=100000)
interactions_fantasy = load_data(os.path.join(DIR, 'goodreads_interactions_fantasy_paranormal.json.gz'), sample_size=100000)
reviews_fantasy = load_data(os.path.join(DIR, 'goodreads_reviews_fantasy_paranormal.json.gz'), sample_size=100000)
print("fantasy loaded")

books_mystery = load_data(os.path.join(DIR, 'goodreads_books_mystery_thriller_crime.json.gz'), sample_size=100000)
interactions_mystery = load_data(os.path.join(DIR, 'goodreads_interactions_mystery_thriller_crime.json.gz'), sample_size=100000)
reviews_mystery = load_data(os.path.join(DIR, 'goodreads_reviews_mystery_thriller_crime.json.gz'), sample_size=100000)
print("mystery loaded")

books_poetry = load_data(os.path.join(DIR, 'goodreads_books_poetry.json.gz'), sample_size=100000)
interactions_poetry = load_data(os.path.join(DIR, 'goodreads_interactions_poetry.json.gz'), sample_size=100000)
reviews_poetry = load_data(os.path.join(DIR, 'goodreads_reviews_poetry.json.gz'), sample_size=100000)
print("poetry loaded")

books_romance = load_data(os.path.join(DIR, 'goodreads_books_romance.json.gz'), sample_size=100000)
interactions_romance = load_data(os.path.join(DIR, 'goodreads_interactions_romance.json.gz'), sample_size=100000)
reviews_romance = load_data(os.path.join(DIR, 'goodreads_reviews_romance.json.gz'), sample_size=100000)
print("romance loaded")

books_young_adult = load_data(os.path.join(DIR, 'goodreads_books_young_adult.json.gz'), sample_size=100000)
interactions_young_adult = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'), sample_size=100000)
reviews_young_adult = load_data(os.path.join(DIR, 'goodreads_reviews_young_adult.json.gz'), sample_size=100000)
print("young adult loaded")

book_history = load_data(os.path.join(DIR, 'goodreads_books_history_biography.json.gz'), sample_size=100000)
interactions_history = load_data(os.path.join(DIR, 'goodreads_interactions_history_biography.json.gz'), sample_size=100000)
reviews_history = load_data(os.path.join(DIR, 'goodreads_reviews_history_biography.json.gz'), sample_size=100000)
print("history loaded")


children loaded
comics loaded
fantasy loaded
mystery loaded
poetry loaded
romance loaded
young adult loaded
history loaded


In [18]:
#put loaded datasets into a dictionary
datasets = {
    "books": books,
    "bookworks": bookworks,
    "authors": authors,
    "series": series,
    "reviews": reviews,
    "fuzzy_genres": fuzzy_genres,

    #genre-specific
    "books_children": books_children,
    "interactions_children": interactions_children,
    "reviews_children": reviews_children,

    "books_comics": books_comics,
    "interactions_comics": interactions_comics,
    "reviews_comics": reviews_comics,

    "books_fantasy": books_fantasy,
    "interactions_fantasy": interactions_fantasy,
    "reviews_fantasy": reviews_fantasy,

    "books_mystery": books_mystery,
    "interactions_mystery": interactions_mystery,
    "reviews_mystery": reviews_mystery,

    "books_poetry": books_poetry,
    "interactions_poetry": interactions_poetry,
    "reviews_poetry": reviews_poetry,

    "books_romance": books_romance,
    "interactions_romance": interactions_romance,
    "reviews_romance": reviews_romance,

    "books_young_adult": books_young_adult,
    "interactions_young_adult": interactions_young_adult,
    "reviews_young_adult": reviews_young_adult,

    "books_history": book_history,
    "interactions_history": interactions_history,
    "reviews_history": reviews_history,

    #csv-based
    "user_interaction": user_interaction_sample,
    "book_id_map": book_id_map,
    "user_id_map": user_id_map,
}


In [19]:
import pickle

# base directory
PICKLE_DIR = os.path.join(DIR, "Pickle")
os.makedirs(PICKLE_DIR, exist_ok=True)  # create folder if it doesn't exist

# save with pickle into Pickle folder
for name, df in datasets.items():
    file_path = os.path.join(PICKLE_DIR, f"{name}.pkl")
    with open(file_path, "wb") as f:
        pickle.dump(df, f)
    print(f"saved {name}.pkl to {PICKLE_DIR}")

print("all datasets saved to Pickle folder successfully!")


saved books.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved bookworks.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved authors.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved series.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved reviews.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved fuzzy_genres.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved books_children.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved interactions_children.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved reviews_children.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved books_comics.pkl to c:\Users\nourh\Documents\GoodReadsRecommendationSystem\Data\Pickle
saved interactions_comics.pkl to c:\Users\nourh\Documents\GoodReadsRecommendation

## datasets that need full load => might be helpful going forward

In [20]:
# helper to fully load  JSON from .json.gz 
def load_full_data(file_name, head=None):
    """
    Fully load line-delimited JSON records from a compressed .json.gz file.
    - If head is set: return only the first N rows.
    - Else: load the entire dataset.
    Returns a Pandas DataFrame.
    """
    data = []
    with gzip.open(file_name, 'rt', encoding='utf-8') as fin:
        for idx, line in enumerate(fin):
            record = json.loads(line)
            data.append(record)
            if head is not None and idx + 1 >= head:
                break
    return pd.DataFrame(data)


In [42]:
# define directory
FULL_PICKLE_DIR = os.path.join(os.getcwd(), "Data", "FullyLoaded")

# create new pickle folder if it doesn't exist
os.makedirs(FULL_PICKLE_DIR, exist_ok=True)

In [22]:
# full load 
authors_full = load_full_data(os.path.join(DIR, "goodreads_book_authors.json.gz"))
print("authors loaded:", authors_full.shape)

series_full = load_full_data(os.path.join(DIR, "goodreads_book_series.json.gz"))
print("series loaded:", series_full.shape)

bookworks_full = load_full_data(os.path.join(DIR, "goodreads_book_works.json.gz"))
print("bookworks loaded:", bookworks_full.shape)

fuzzy_genres_full = load_full_data(os.path.join(DIR, "goodreads_book_genres_initial.json.gz"))
print("fuzzy genres loaded:", fuzzy_genres_full.shape)



authors loaded: (829529, 5)
series loaded: (400390, 7)
bookworks loaded: (1521962, 16)
fuzzy genres loaded: (2360655, 2)


In [23]:
# create a  dictionary
datasets_full = {
    "user_interaction": user_interaction,
    "authors_full": authors_full,
    "series_full": series_full,
    "bookworks_full": bookworks_full,
    "fuzzy_genres_full": fuzzy_genres_full
}


In [24]:
# save to FullyLoaded pickle folder
for name, df in datasets_full.items():
    df.to_pickle(os.path.join(FULL_PICKLE_DIR, f"{name}.pkl"))
    print(f" {name} saved to FullyLoaded.")

 user_interaction saved to FullyLoaded.
 authors_full saved to FullyLoaded.
 series_full saved to FullyLoaded.
 bookworks_full saved to FullyLoaded.
 fuzzy_genres_full saved to FullyLoaded.


### Adjusting time columns in reviews

In [51]:
date_cols = ["date_added", "date_updated", "read_at", "started_at"] #define columns you want to make sure are the correct type


for col in date_cols:
    if col in reviews.columns:  # safety check
        reviews[col] = pd.to_datetime(reviews[col], errors="coerce")#convert to NaN instead of error 


In [52]:
print(reviews[date_cols].dtypes)

date_added      datetime64[ns, tzoffset(None, -25200)]
date_updated    datetime64[ns, tzoffset(None, -25200)]
read_at         datetime64[ns, tzoffset(None, -25200)]
started_at      datetime64[ns, tzoffset(None, -25200)]
dtype: object


In [54]:
# verify dtypes conversion
print(reviews[date_cols].dtypes)

date_added      datetime64[ns, tzoffset(None, -25200)]
date_updated    datetime64[ns, tzoffset(None, -25200)]
read_at         datetime64[ns, tzoffset(None, -25200)]
started_at      datetime64[ns, tzoffset(None, -25200)]
dtype: object


In [55]:
# first 5 rows 
reviews[["date_added", "date_updated", "read_at", "started_at"]].head()

Unnamed: 0,date_added,date_updated,read_at,started_at
0,2017-07-03 06:11:47-07:00,2017-08-09 12:57:12-07:00,NaT,NaT
1,2012-04-16 01:24:18-07:00,2012-04-16 01:26:57-07:00,1998-06-30 00:00:00-07:00,NaT
2,2016-08-17 15:57:26-07:00,2016-09-02 17:54:46-07:00,2016-09-02 00:00:00-07:00,2016-08-17 00:00:00-07:00
3,NaT,NaT,NaT,NaT
4,NaT,NaT,NaT,NaT


In [None]:
reviews[["date_added", "date_updated", "read_at", "started_at"]].tail() #last five

Unnamed: 0,date_added,date_updated,read_at,started_at
99995,2011-08-20 13:50:26-07:00,2011-08-27 10:07:49-07:00,2011-08-01 00:00:00-07:00,NaT
99996,NaT,NaT,2014-11-01 00:00:00-07:00,NaT
99997,NaT,NaT,2016-08-01 00:00:00-07:00,NaT
99998,NaT,2013-03-29 09:22:14-07:00,NaT,NaT
99999,2017-10-24 13:56:07-07:00,2017-10-24 14:12:32-07:00,NaT,NaT


In [56]:
#check how many missing
print("Missing values per date column:")
reviews[["date_added", "date_updated", "read_at", "started_at"]].isna().sum()

Missing values per date column:


date_added      34560
date_updated    33377
read_at         46457
started_at      61952
dtype: int64

In [57]:
# cleaned "currently reading" => trendy books users are reading recently
reviews_started = reviews.dropna(subset=["started_at"]).copy()
print("Started_at Cleaned Data:", reviews_started.shape)

# cleaned "buzzing books" => trendy books users are reviewing recently 
reviews_added = reviews.dropna(subset=["date_added"]).copy()
print("Date_added Cleaned Data:", reviews_added.shape)

# cleaned for "page turners" => books people are finishing and not dropping recently
reviews_read = reviews.dropna(subset=["read_at"]).copy()
print("Read_at Cleaned Data:", reviews_read.shape)

#done this way to avoid having really small data 

Started_at Cleaned Data: (38048, 11)
Date_added Cleaned Data: (65440, 11)
Read_at Cleaned Data: (53543, 11)


In [None]:
# save cleaned datasets
reviews_started_path = os.path.join(FULL_PICKLE_DIR, "reviews_started.pkl")
reviews_added_path   = os.path.join(FULL_PICKLE_DIR, "reviews_added.pkl")
reviews_read_path    = os.path.join(FULL_PICKLE_DIR, "reviews_read.pkl")

reviews_started.to_pickle(reviews_started_path)
reviews_added.to_pickle(reviews_added_path)
reviews_read.to_pickle(reviews_read_path)

print("Saved reviews_started:", reviews_started.shape)
print("Saved reviews_added:", reviews_added.shape)
print("Saved reviews_read:", reviews_read.shape)

Saved reviews_started: (38048, 11)
Saved reviews_added: (65440, 11)
Saved reviews_read: (53543, 11)
