In [None]:
import kagglehub
import pandas as pd
import ast
from collections import Counter

In [2]:
# this saves the code somewhere that is locally cached 
path = kagglehub.dataset_download("michaelrussell4/10000-books-and-their-genres-standardized")
fp = f"{path}/books_and_genres.csv"

# from the cache, loads the latest version 
df = pd.read_csv(fp)

print(df.head())
print(df.columns)

   Unnamed: 0                        title  \
0           0              apocolocyntosis   
1           1  the house on the borderland   
2           2                 the warriors   
3           3         a voyage to the moon   
4           4                 la fiammetta   

                                                text  \
0  Produced by Ted Garvin, Ben Courtney and PG Di...   
1  Produced by Suzanne Shell, Sjaani and PG Distr...   
2  Produced by Charles Aldarondo, Charlie Kirschn...   
3  Produced by Christine De Ryck, Stig M. Valstad...   
4  Produced by Ted Garvin, Dave Morgan and PG Dis...   

                                              genres  
0  {'21st-century', 'history', 'roman', 'classics...  
1  {'horror', 'mystery', 'classics', 'science-fic...  
2  {'literary-fiction', 'history', 'biography', '...  
3  {'20th-century', 'science-fiction', 'speculati...  
4  {'literary-fiction', 'history', 'feminism', 'c...  
Index(['Unnamed: 0', 'title', 'text', 'genres'], dtype='

In [3]:
# filter out non-fiction 
def parse_genres(x):
    return set(ast.literal_eval(x))

df["genres_parsed"] = df["genres"].apply(parse_genres)

# this line reduces size from 10635 to 7600
df = df[~df["genres_parsed"].apply(lambda s: "non-fiction" in s)]


In [4]:
# grab all genres
all_genres = [g for genre_set in df["genres_parsed"] for g in genre_set]

genre_counts = Counter(all_genres).most_common()

# rank of genres
for genre, count in genre_counts:
    print(f"{genre}: {count}")

fiction: 5019
classics: 3963
20th-century: 2277
literature: 2026
historical-fiction: 2015
novels: 1856
romance: 1681
short-stories: 1651
historical: 1370
history: 1294
fantasy: 1161
literary-fiction: 1012
adventure: 1012
childrens: 990
adult: 928
science-fiction: 927
mystery: 865
american: 808
drama: 783
school: 756
adult-fiction: 709
poetry: 668
young-adult: 649
humor: 615
contemporary: 565
read-for-school: 543
unfinished: 502
novella: 472
thriller: 444
mystery-thriller: 434
horror: 420
plays: 408
roman: 400
crime: 387
philosophy: 348
suspense: 320
biography: 319
speculative-fiction: 310
religion: 295
historical-romance: 289
comedy: 285
middle-grade: 283
realistic-fiction: 266
college: 251
family: 251
war: 245
christian: 239
theology: 239
picture-books: 234
amazon: 233
politics: 230
paranormal: 223
science: 223
mythology: 210
travel: 195
animals: 195
love: 186
supernatural: 182
action: 178
high-school: 174
contemporary-romance: 172
essays: 155
reference: 154
chick-lit: 147
dystopia: 1

In [5]:
# get rid of categories like fiction, literature, short-stories, novels, and 20th-century (not really a genre)
# combine historical-fiction and historical and history into historical-fiction

merge_genres = {"historical-fiction", "historical", "history"}
bad_genres = {"fiction", "literature", "short-stories", "novels", "20th-century"}

def clean_genre(genres_set): 
    if genres_set:
        genres_set = genres_set - merge_genres
        genres_set.add("historical-fiction")
    genres_set = genres_set - bad_genres
    return genres_set

df["genres_parsed"] = df["genres_parsed"].apply(clean_genre)

In [6]:
# check top 10 again 

all_genres = [g for genre_set in df["genres_parsed"] for g in genre_set]

genre_counts = Counter(all_genres).most_common()

# rank of genres
for genre, count in genre_counts:
    print(f"{genre}: {count}")

# extract genres of top 10
top_10 = set([genre for genre, count in genre_counts[:10]])


historical-fiction: 7600
classics: 3963
romance: 1681
fantasy: 1161
literary-fiction: 1012
adventure: 1012
childrens: 990
adult: 928
science-fiction: 927
mystery: 865
american: 808
drama: 783
school: 756
adult-fiction: 709
poetry: 668
young-adult: 649
humor: 615
contemporary: 565
read-for-school: 543
unfinished: 502
novella: 472
thriller: 444
mystery-thriller: 434
horror: 420
plays: 408
roman: 400
crime: 387
philosophy: 348
suspense: 320
biography: 319
speculative-fiction: 310
religion: 295
historical-romance: 289
comedy: 285
middle-grade: 283
realistic-fiction: 266
college: 251
family: 251
war: 245
christian: 239
theology: 239
picture-books: 234
amazon: 233
politics: 230
paranormal: 223
science: 223
mythology: 210
travel: 195
animals: 195
love: 186
supernatural: 182
action: 178
high-school: 174
contemporary-romance: 172
essays: 155
reference: 154
chick-lit: 147
dystopia: 145
sports: 143
coming-of-age: 140
magic: 136
art: 134
modern: 132
graphic-novels: 122
christmas: 122
erotica: 121


In [7]:
# filter for only top 10 and drop everything else 

def keep_top_10(genres_set):
    return genres_set & top_10  

df["genres_parsed"] = df["genres_parsed"].apply(keep_top_10)

# double check by printing this: 
print(set([g for genre_set in df["genres_parsed"] for g in genre_set]))

{'classics', 'literary-fiction', 'fantasy', 'adventure', 'historical-fiction', 'science-fiction', 'mystery', 'romance', 'childrens', 'adult'}


In [8]:
# check this to see which titles will be dropped  (there aren't any but keeping this here for checking)
print(df[df["genres_parsed"].apply(len) == 0])

# drop those rows 
df = df[df["genres_parsed"].apply(len) > 0]

Empty DataFrame
Columns: [Unnamed: 0, title, text, genres, genres_parsed]
Index: []


In [None]:
# next step: for each of the text, only grab the first 1000 sentences

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

In [None]:
df["text"] = df["text"].astype(str)

def truncate_text(text, max_sentences=1000):
    sentences = sent_tokenize(text)
    first = sentences[1:max_sentences + 1] # filtering out first sentence as well because that is just the filler text from gutenburg
    last = sentences[-(max_sentences + 1):] #since the last sentence is often "THE END" or stripped later using the footer
    return ' '.join(first), ' '.join(last)

df[["text_cut_first", "text_cut_last"]] = df["text"].apply(
    lambda t: pd.Series(truncate_text(t))
)

In [11]:
import re

def remove_gutenberg_footer(text):
    patterns = [
        r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*",
        r"End of the Project Gutenberg EBook of.*",
        r"End of Project Gutenberg's.*",
        r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK.*",
    ]
    
    for p in patterns:
        text = re.sub(p, "", text, flags=re.IGNORECASE | re.DOTALL)

    return text.strip()

df["text_cut_last_clean"] = df["text_cut_last"].apply(remove_gutenberg_footer)

In [None]:
desired_first = ["title", "text_cut_first", "genres_parsed"]
desired_last = ["title", "text_cut_last_clean", "genres_parsed"]

first_1000 = df[desired_first].rename(columns={
    "title": "title",
    "text_cut_first": "text",
    "genres_parsed": "genre"
})

last_1000 = df[desired_last].rename(columns={
    "title": "title",
    "text_cut_last_clean": "text",
    "genres_parsed": "genre"
})

first_1000.to_parquet("cleaned_data_first.parquet", engine="pyarrow", index=False)
last_1000.to_parquet("cleaned_data_last.parquet", engine="pyarrow", index=False)