In [1]:
# Per migliorare la qualità del dataset, rielaboro la feature "categories", inizialmente poco informativa
# Attraverso l'uso di modelli LLM, applico una tecnica di text classification per assegnare a ciascun libro una categoria coerente
# Questa nuova classificazione verrà utilizzata dall'utente come filtro durante la ricerca dei titoli

# La text classification è una tecnica di NLP che consente di associare testi a classi discrete
# In questo caso, le categorie di interesse sono: Fiction, Nonfiction (+ Children's Fiction e Children's Nonfiction, ricavate da Juvenile Fiction e Juvenile Nonfiction)
# Esistono molte tecniche per classificare il testo, ma l'uso di LLM preaddestrati garantisce ottimi risultati

# In particolare, utilizzo la strategia di zero-shot classification
# Consente di classificare testi in categorie mai viste esplicitamente durante il training, grazie alle conoscenze apprese dal modello
# Si utilizza un modello preaddestrato che prende un testo e un insieme di etichette candidate (es. Fiction, Nonfiction)
# Produce un punteggio di similarità tra il testo e ciascuna etichetta
# L'etichetta con il punteggio più alto viene assegnata al testo

In [2]:
import pandas as pd

# Load the cleaned book dataset from CSV into the working DataFrame.
books = pd.read_csv("books_cleaned.csv")

In [3]:
# Number of distinct values in the raw "categories" column,
# followed by how many books carry each of them.
print(books["categories"].nunique())
print(books["categories"].value_counts())

464
categories
Fiction                      2062
Juvenile Fiction              383
Biography & Autobiography     310
History                       205
Literary Criticism            123
                             ... 
Rock musicians                  1
Community life                  1
Air pilots                      1
Authors, Canadian               1
Indic fiction (English)         1
Name: count, Length: 464, dtype: int64


In [4]:
# Keep only the categories that label at least 25 books
books["categories"].value_counts().loc[lambda counts: counts >= 25].reset_index()

Unnamed: 0,categories,count
0,Fiction,2062
1,Juvenile Fiction,383
2,Biography & Autobiography,310
3,History,205
4,Literary Criticism,123
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,114
8,Drama,81
9,Juvenile Nonfiction,56


In [7]:
# "Juvenile Fiction" appears to cover fiction books aimed at children
books.loc[books["categories"].eq("Juvenile Fiction")]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006: Joe and his parents are enjoyin...
77,9780020442608,0020442602,The voyage of the Dawn Treader,Clive Staples Lewis,Juvenile Fiction,http://books.google.com/books/content?id=fDD3C...,"The ""Dawn Treader"" is the first ship Narnia ha...",1970.0,4.09,216.0,2869.0,The voyage of the Dawn Treader,"9780020442608: The ""Dawn Treader"" is the first..."
83,9780030547744,0030547741,Where the Red Fern Grows,Wilson Rawls,Juvenile Fiction,http://books.google.com/books/content?id=IHpRw...,A young boy living in the Ozarks achieves his ...,2000.0,4.37,288.0,95.0,Where the Red Fern Grows: The Story of Two Dog...,9780030547744: A young boy living in the Ozark...
84,9780060000141,0060000147,Poppy's Return,Avi,Juvenile Fiction,http://books.google.com/books/content?id=XbcMJ...,"There's trouble at Gray House, the girlhood ho...",2006.0,3.99,256.0,1086.0,Poppy's Return,"9780060000141: There's trouble at Gray House, ..."
85,9780060001537,0060001534,Diary of a Spider,Doreen Cronin,Juvenile Fiction,http://books.google.com/books/content?id=UWvZo...,This is the diary ... of a spider. But don't b...,2005.0,4.25,40.0,7903.0,Diary of a Spider,9780060001537: This is the diary ... of a spid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,9781590385814,1590385810,Fablehaven,Brandon Mull,Juvenile Fiction,http://books.google.com/books/content?id=tbVIP...,When Kendra and Seth go to stay at their grand...,2006.0,4.09,351.0,111896.0,Fablehaven,9781590385814: When Kendra and Seth go to stay...
4830,9781596792500,1596792507,Sherlock Holmes and the Case of the Hound of t...,Arthur Conan Doyle;Malvina G. Vogel,Juvenile Fiction,http://books.google.com/books/content?id=EWgWP...,Sherlock Holmes and Dr. Watson travel to the b...,2005.0,4.51,237.0,28.0,Sherlock Holmes and the Case of the Hound of t...,9781596792500: Sherlock Holmes and Dr. Watson ...
4842,9781599900056,159990005X,The Drift House,Dale Peck,Juvenile Fiction,http://books.google.com/books/content?id=kbwPY...,Sent to stay with their uncle in a ship-like h...,2006.0,3.64,437.0,595.0,The Drift House: The First Voyage,9781599900056: Sent to stay with their uncle i...
4907,9781844580514,1844580512,Attack of the Jaguar,M. A. Harvey,Juvenile Fiction,http://books.google.com/books/content?id=3HUdt...,This training manual for operatives of Xtreme ...,2004.0,3.40,125.0,4.0,Attack of the Jaguar,9781844580514: This training manual for operat...


In [8]:
# Children's books as well, but nonfiction
books.loc[books["categories"].eq("Juvenile Nonfiction")]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
104,9780060277406,0060277408,The Secret Garden Cookbook,Amy Cotler,Juvenile Nonfiction,http://books.google.com/books/content?id=c7E_H...,Frances Hodgson Burnett's The Secret Garden de...,1999.0,4.28,128.0,142.0,The Secret Garden Cookbook: Recipes Inspired b...,9780060277406: Frances Hodgson Burnett's The S...
105,9780060278427,0060278420,Laura's Album,William Anderson,Juvenile Nonfiction,http://books.google.com/books/content?id=_zTkq...,Though best known as the author of the Little ...,1998.0,4.3,80.0,713.0,Laura's Album: A Remembrance Scrapbook of Laur...,9780060278427: Though best known as the author...
225,9780060782139,0060782137,Time For Kids: Butterflies!,Editors of TIME For Kids,Juvenile Nonfiction,http://books.google.com/books/content?id=OdZxn...,"Butterflies There are 20,000 different kinds o...",2006.0,4.0,32.0,20.0,Time For Kids: Butterflies!,"9780060782139: Butterflies There are 20,000 di..."
264,9780060882600,0060882603,The Annotated Charlotte's Web,E. B. White,Juvenile Nonfiction,http://books.google.com/books/content?id=vaYYH...,"Charlotte's Web, one of America's best-loved c...",2006.0,4.16,320.0,41.0,The Annotated Charlotte's Web,"9780060882600: Charlotte's Web, one of America..."
430,9780064462044,0064462048,My Little House Crafts Book,Carolyn Strom Collins,Juvenile Nonfiction,http://books.google.com/books/content?id=lTzrs...,Make the same pioneer crafts that Laura did! I...,1998.0,4.05,64.0,56.0,My Little House Crafts Book: 18 Projects from ...,9780064462044: Make the same pioneer crafts th...
431,9780064462341,006446234X,Pioneer Girl,William Anderson,Juvenile Nonfiction,http://books.google.com/books/content?id=Sj4UD...,The pioneer spirit lives on... Readers around ...,2000.0,4.15,32.0,414.0,Pioneer Girl: The Story of Laura Ingalls Wilder,9780064462341: The pioneer spirit lives on... ...
435,9780066236179,0066236177,A Light in the Attic Book and CD,Shel Silverstein,Juvenile Nonfiction,http://books.google.com/books/content?id=FJfQs...,Last night while I lay thinking here Some What...,2001.0,4.34,176.0,590.0,A Light in the Attic Book and CD,9780066236179: Last night while I lay thinking...
794,9780142302279,0142302279,Dirty Beasts,Roald Dahl,Juvenile Nonfiction,,Poems tell the stories of a smart pig who outw...,2002.0,4.02,32.0,3953.0,Dirty Beasts,9780142302279: Poems tell the stories of a sma...
803,9780142407226,0142407224,The Tough Guide to Fantasyland,Diana Wynne Jones,Juvenile Nonfiction,http://books.google.com/books/content?id=v5jxA...,A unique guide to fantasy literature helps rea...,2006.0,3.94,234.0,3897.0,The Tough Guide to Fantasyland,9780142407226: A unique guide to fantasy liter...
805,9780142407929,0142407925,The BFG,Roald Dahl;David Wood;Jane Walmsley,Juvenile Nonfiction,http://books.google.com/books/content?id=aDf-O...,"With notes on staging, props, and costumes, a ...",2007.0,4.26,128.0,452.0,The BFG: A Set of Plays,"9780142407929: With notes on staging, props, a..."


In [11]:
# Collapse the most frequent raw categories into a handful of simplified labels.
# These broader labels are more useful for guiding a reader towards the kind of book they want.
# The inner dict is written "simplified label -> raw categories" for readability and
# then inverted into the "raw category -> simplified label" lookup used by .map().

category_mapping = {
    raw_category: simple_label
    for simple_label, raw_categories in {
        "Fiction": [
            "Fiction",
            "Comics & Graphic Novels",
            "Drama",
            "Literary Collections",
            "Poetry",
        ],
        "Children's Fiction": ["Juvenile Fiction"],
        "Children's Nonfiction": ["Juvenile Nonfiction"],
        "Nonfiction": [
            "Biography & Autobiography",
            "History",
            "Literary Criticism",
            "Philosophy",
            "Religion",
            "Science",
            "Business & Economics",
            "Social Science",
            "Performing Arts",
            "Cooking",
            "Psychology",
            "Travel",
            "Body, Mind & Spirit",
            "Art",
            "Political Science",
            "Computers",
            "Health & Fitness",
            "Self-Help",
            "Family & Relationships",
        ],
    }.items()
    for raw_category in raw_categories
}

# Raw categories that are not keys of the mapping become NaN here
books["simple_categories"] = books["categories"].map(category_mapping)

In [15]:
# Inspect the frame with the new "simple_categories" column (NaN where the raw category is not in the mapping)
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934: ""In The Problem of Pain, C.S. L...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5084,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222: On A Train Journey Home To Nort...,
5085,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014: This book tells the tale of a m...,
5086,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623: Wisdom to Create a Life of Pass...,Nonfiction
5087,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535: This collection of the timeless...,Nonfiction


In [17]:
# Books that already received a simplified category from the mapping.
# 4161 of the 5089 books are labelled at this point; the rest still lack a Fiction/Nonfiction label.
books[books["simple_categories"].notna()]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006: Joe and his parents are enjoyin...,Children's Fiction
31,9780007105045,0007105045,Tree and Leaf,John Ronald Reuel Tolkien,Literary Collections,http://books.google.com/books/content?id=aPb_A...,"""The two works 'On fairy-stories' and 'Leaf by...",2001.0,4.09,176.0,2245.0,Tree and Leaf: The Homecoming of Beorhtnoth : ...,"9780007105045: ""The two works 'On fairy-storie...",Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5080,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969: Rescued from the lockers in whi...,Fiction
5081,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850: This book is the story of a you...,Fiction
5086,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623: Wisdom to Create a Life of Pass...,Nonfiction
5087,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535: This collection of the timeless...,Nonfiction


In [18]:
# The Hugging Face platform hosts many open-source models for zero-shot classification.
# A similar model will also be used for sentiment analysis on the book descriptions,
# so that books can be filtered by reading mood.

# PyCharm offers a feature for picking the model to reference directly from the IDE
from transformers import pipeline

pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

fiction_categories = ["Fiction", "Nonfiction"]  # Candidate labels for the classification




Device set to use cpu


In [19]:
# First description among the books mapped to "Fiction";
# it serves as a trial input for the model that was just loaded
books.loc[books["simple_categories"].eq("Fiction"), "description"].iloc[0]

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world ha

In [20]:
# Check that the model assigns this Fiction book to the right class.
# The confidence score is not at its maximum, but it is high enough to treat the model as acceptable for prediction.
sequence = books.loc[books["simple_categories"].eq("Fiction"), "description"].iloc[0]
pipe(sequence, fiction_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [21]:
# The predicted label is the candidate sitting at the index of the highest score
import numpy as np

# Run the model once and reuse the result: each call takes seconds on CPU,
# so calling it separately for "scores" and "labels" doubles the cost for nothing.
prediction = pipe(sequence, fiction_categories)
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
max_label

'Fiction'

In [22]:
# Wrap the steps above in a function so the prediction can be automated
def generate_predictions(sequence, categories, classifier=None):
    """Return the candidate category with the highest score for a text.

    Args:
        sequence: Text to classify (in this notebook, a book description).
        categories: Candidate labels handed to the classifier.
        classifier: Optional callable invoked as ``classifier(sequence, categories)``
            that returns a mapping with parallel ``"labels"`` and ``"scores"``
            entries. When omitted, the notebook-level ``pipe`` is used.

    Returns:
        The entry of ``"labels"`` located at the position of the largest score
        (the first one in case of ties).
    """
    if classifier is None:
        # Looked up at call time, so the function can be defined before `pipe` exists
        classifier = pipe
    predictions = classifier(sequence, categories)
    max_index = np.argmax(predictions["scores"])
    return predictions["labels"][max_index]

In [23]:
# The model can now run zero-shot classification on the descriptions and assign the most likely label between Fiction and Nonfiction
# To see how well it performs at this task, we take
# a set of examples to feed to the model and, after prediction, compare the results
# with the correct (fiction, nonfiction) labels

# This library is used to monitor the time remaining until the operations complete
from tqdm import tqdm

actual_cats = []
predicted_cats = [] # Prediction results are stored here

In [24]:
# Measure how many books already labelled "Fiction" the model predicts correctly.
# The selection is computed once, outside the loop: filtering the whole frame and
# resetting its index on every iteration was loop-invariant work.
# head(300) caps the sample at 300 descriptions without failing if fewer are available.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].head(300)

for sequence in tqdm(fiction_descriptions):
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append("Fiction")

100%|██████████| 300/300 [19:12<00:00,  3.84s/it]


In [25]:
# Same evaluation on books already labelled "Nonfiction".
# The selection is computed once, outside the loop: filtering the whole frame and
# resetting its index on every iteration was loop-invariant work.
# head(300) caps the sample at 300 descriptions without failing if fewer are available.
nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].head(300)

for sequence in tqdm(nonfiction_descriptions):
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append("Nonfiction")

100%|██████████| 300/300 [20:59<00:00,  4.20s/it]


In [26]:
# Put each known label next to the model's prediction for the same book
predictions_df = pd.DataFrame(
    dict(actual_categories=actual_cats, predicted_categories=predicted_cats)
)

In [29]:
# At a glance the predictions look acceptable
# NOTE(review): the saved output already shows "correct_prediction", a column that is only
# created in the following cell, so the cells were apparently executed out of order.
predictions_df

Unnamed: 0,actual_categories,predicted_categories,correct_prediction
0,Fiction,Fiction,1
1,Fiction,Fiction,1
2,Fiction,Fiction,1
3,Fiction,Nonfiction,0
4,Fiction,Nonfiction,0
...,...,...,...
595,Nonfiction,Nonfiction,1
596,Nonfiction,Nonfiction,1
597,Nonfiction,Nonfiction,1
598,Nonfiction,Nonfiction,1


In [30]:
# Flag each row with 1 when the prediction matches the known label and 0 otherwise
predictions_df["correct_prediction"] = (
    predictions_df["actual_categories"]
    .eq(predictions_df["predicted_categories"])
    .astype(int)
)

In [31]:
# Fraction of correct predictions (the mean of the 0/1 flags).
# This accuracy is considered acceptable for the chosen zero-shot model,
# so the model is used next to predict the missing classes of the remaining books.
predictions_df["correct_prediction"].mean()

0.7816666666666666

In [32]:
# Select the books whose "simple_categories" is still null, keeping the "isbn13"
# identifier next to the description so the predictions can be joined back later
missing_cats = (
    books.loc[books["simple_categories"].isnull(), ["isbn13", "description"]]
    .reset_index(drop=True)
)

# Fresh accumulators for the identifiers and the predicted labels
isbns, predicted_cats = [], []

In [38]:
# Sanity check: labelled books plus unlabelled books should add up to the whole dataset
len(books[books["simple_categories"].notna()]) + len(missing_cats)

5089

In [39]:
# Classify the description of every unlabelled book and record its isbn13 next to the prediction

for row in tqdm(range(len(missing_cats))):
    sequence = missing_cats.loc[row, "description"]
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    isbns.append(missing_cats.loc[row, "isbn13"])

100%|██████████| 928/928 [50:33<00:00,  3.27s/it]


In [40]:
# Frame holding the newly labelled books, to be joined back into the original dataset
missing_predicted_df = pd.DataFrame(
    dict(isbn13=isbns, predicted_categories=predicted_cats)
)

In [41]:
# Fill the null simple categories with the labels that were just predicted.
# A left join on "isbn13" is used because the predictions cover only a subset of the books.
# validate="many_to_one" makes the merge fail loudly if an isbn13 appears more than once
# in the predictions (for example when the prediction loop was run twice without
# resetting its lists) instead of silently duplicating book rows.
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left", validate="many_to_one")

# Where "simple_categories" is missing, take the value from "predicted_categories"
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])

In [42]:
# Remove the helper column of predicted categories now that it has been consumed
books = books.drop("predicted_categories", axis=1)

In [43]:
# Inspect the frame after the missing simple categories have been filled in
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...,Nonfiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934: ""In The Problem of Pain, C.S. L...",Nonfiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5084,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222: On A Train Journey Home To Nort...,Fiction
5085,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014: This book tells the tale of a m...,Nonfiction
5086,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623: Wisdom to Create a Life of Pass...,Nonfiction
5087,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535: This collection of the timeless...,Nonfiction


In [44]:
# Checks on the completed column: dtype/non-null summary, null count, basic statistics.
# Series.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line to the output.
books["simple_categories"].info()
print(books["simple_categories"].isna().sum())
print(books["simple_categories"].describe())

<class 'pandas.core.series.Series'>
RangeIndex: 5089 entries, 0 to 5088
Series name: simple_categories
Non-Null Count  Dtype 
--------------  ----- 
5089 non-null   object
dtypes: object(1)
memory usage: 39.9+ KB
None
0
count        5089
unique          4
top       Fiction
freq         2735
Name: simple_categories, dtype: object


In [45]:
# Final set of category classes and how many books fall into each
books["simple_categories"].value_counts()

simple_categories
Fiction                  2735
Nonfiction               1915
Children's Fiction        383
Children's Nonfiction      56
Name: count, dtype: int64

In [46]:
# In theory the zero-shot model could be reused to assign further genres.
# In practice it would be hard even to judge its accuracy on that task:
# - there are not enough examples of the new classes to work with
# - many labels could be assigned, but they may be too specific
#
# Only 15 examples fall under the common genre names listed below, which is too few.
# For now, the only classes a user can filter by remain the 4 defined earlier.

# E.g. common genre labels (matched case-insensitively against the raw categories)
books.loc[
    lambda frame: frame["categories"].str.lower().isin(
        [
            "romance",
            "science fiction",
            "scifi",
            "fantasy",
            "horror",
            "mystery",
            "thriller",
            "comedy",
            "crime",
            "historical",
        ]
    )
]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
24,9780006513087,0006513085,Gravity,Tess Gerritsen,Science fiction,http://books.google.com/books/content?id=KI66c...,Emma Watson a research physician has been trai...,2004.0,4.04,342.0,8024.0,Gravity,9780006513087: Emma Watson a research physicia...,Nonfiction
471,9780099410355,0099410354,Traitor,Matthew Woodring Stover,Science fiction,http://books.google.com/books/content?id=VbICO...,"From the depths of catastrophe, a glimmer of h...",2002.0,4.0,320.0,6765.0,Traitor,"9780099410355: From the depths of catastrophe,...",Fiction
474,9780099422341,0099422344,Yeats is Dead!,Joseph O'Connor,Comedy,http://books.google.com/books/content?id=DrE3I...,"In aid of Amnesty International, this is a bri...",2002.0,3.39,298.0,34.0,Yeats is Dead!: A Novel by Fifteen Irish Writers,9780099422341: In aid of Amnesty International...,Fiction
487,9780099446729,0099446723,Blackwood Farm,Anne Rice,Horror,http://books.google.com/books/content?id=cIn8T...,"Lestat Is Back, Saviour And Demon, Presiding O...",2003.0,3.86,774.0,26145.0,Blackwood Farm,"9780099446729: Lestat Is Back, Saviour And Dem...",Fiction
1396,9780340837955,0340837950,Stranger in a Strange Land,Robert A. Heinlein,Science fiction,http://books.google.com/books/content?id=ZQhiP...,"Epic, entertaining, Stranger in a Strange Land...",2005.0,3.92,672.0,563.0,Stranger in a Strange Land,"9780340837955: Epic, entertaining, Stranger in...",Fiction
1400,9780345251220,0345251229,Visions from Nowhere,William Arrow,Science fiction,,"The first novel in the series, ""Return to the ...",1976.0,3.23,183.0,10.0,Visions from Nowhere,"9780345251220: The first novel in the series, ...",Fiction
2781,9780575075597,0575075597,Replay,Ken Grimwood,Fantasy,http://books.google.com/books/content?id=9vmNP...,At forty-three Jeff Winston is tired of his lo...,2005.0,4.16,272.0,412.0,Replay,9780575075597: At forty-three Jeff Winston is ...,Fiction
2796,9780590254762,0590254766,"The lion, the witch and the wardrobe",Clive Staples Lewis,Fantasy,,Four English school children enter the magic l...,1995.0,4.21,189.0,860.0,"The lion, the witch and the wardrobe",9780590254762: Four English school children en...,Nonfiction
3213,9780739423851,0739423851,Wizard's Castle,Diana Wynne Jones,Fantasy,http://books.google.com/books/content?id=hB7hA...,Howl's moving castle - Eldest of three sisters...,2002.0,4.44,376.0,439.0,Wizard's Castle,9780739423851: Howl's moving castle - Eldest o...,Fiction
3214,9780739439708,0739439707,Time Quartet,Madeleine L'Engle,Science fiction,,"Blending magic with quantum physics, Madeleine...",2003.0,4.35,646.0,165.0,Time Quartet,9780739439708: Blending magic with quantum phy...,Fiction


In [47]:
# Save this new version of the dataset, which adds the reworked category feature ("simple_categories");
# index=False keeps the DataFrame index out of the CSV.
books.to_csv("books_with_categories.csv", index=False)