### 8. Feature extraction

Convert the preprocessed text into numerical features that machine learning models can understand, using TF-IDF (Term Frequency-Inverse Document Frequency) to represent each book summary.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
# Global seaborn styling for every figure drawn in this notebook.
sns.set_context('poster')
# NOTE(review): sns.set() applies seaborn's default theme, which appears to reset
# the plotting context as well -- so the 'poster' context requested just above
# probably does not persist. TODO confirm; if 'poster' is wanted, call
# set_context() after sns.set().
sns.set(rc={'figure.figsize': (16., 9.)})
# Switch the style to a white background with grid lines.
sns.set_style('whitegrid')
import os

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [16]:
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [3]:
# Location of the modelling dataset.
# Set the MODEL_DATA_PATH environment variable to run the notebook on another
# machine; the original absolute path is kept as the fallback so existing
# behaviour is unchanged when the variable is not set.
data_path = os.environ.get(
    "MODEL_DATA_PATH",
    "/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/final_project/data/model_data.csv",
)
data = pd.read_csv(data_path)
data.head(4)

Unnamed: 0,title,summary,genre,cleaned_summary,entities,word_count,unique_word_count
0,A Clockwork Orange,"Alex, a teenager living in near-future Englan...",science fiction,alex teenager living near future england lead ...,"[('alex', 'PERSON'), ('england', 'GPE'), ('rus...",588,416
1,The Plague,The text of The Plague is divided into five p...,literary fiction,text plague divided five part town oran thousa...,"[('five', 'CARDINAL'), ('dr bernard rieux', 'P...",609,424
2,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...",literary fiction,book tell story paul umer german soldier who u...,"[('paul umer', 'PERSON'), ('german', 'NORP'), ...",375,277
3,A Wizard of Earthsea,"Ged is a young boy on Gont, one of the larger...",fantasy,ged young boy gont one larger island north arc...,"[('gont', 'PERSON'), ('one', 'CARDINAL'), ('ar...",549,371


In [4]:
# I'm going to create a function to modify the column of "entities" so that in every row I have the types of entities that appear but not the specific entities.

import ast

# Function to extract the second element from each tuple in the entities string
def extract_element(entities_str):
    """Return the entity labels (second tuple element) for one row.

    Parameters
    ----------
    entities_str : str | list | tuple | None | float
        Normally the string form of a list of ``(text, label)`` tuples, e.g.
        ``"[('alex', 'PERSON'), ('england', 'GPE')]"``. An already-parsed
        list/tuple of pairs is accepted as well. Missing values (``None`` or a
        float such as NaN) and blank strings are treated as "no entities".

    Returns
    -------
    list
        The second element of every pair, in order; ``[]`` when there are none.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    if isinstance(entities_str, str):
        text = entities_str.strip()
        if not text:
            # A blank cell carries no entities; literal_eval would raise on it.
            return []
        # literal_eval only parses literals, so it is safe on file contents.
        entities_list = ast.literal_eval(text)
    elif entities_str is None or isinstance(entities_str, float):
        # A missing value (None, or a float NaN) would otherwise crash
        # literal_eval with a ValueError.
        return []
    else:
        # Already a parsed sequence of pairs.
        entities_list = entities_str

    # An empty sequence naturally produces an empty list here.
    return [entity[1] for entity in entities_list]

In [5]:
# Swap each row's stringified (text, label) tuples for the list of labels only.
data['entities'] = data['entities'].map(extract_element)
data.sample(5)

Unnamed: 0,title,summary,genre,cleaned_summary,entities,word_count,unique_word_count
4434,Gallows Hill,Sarah Zoltanne is an extra ordinary girl. Her...,literary fiction,sarah zoltanne extra ordinary girl widowed mot...,"[PERSON, PERSON, PERSON, PERSON, CARDINAL, PER...",258,172
10264,Baby Teeth,Meet Hanna.\n\nShe’s the sweet-but-silent ange...,thriller,meet hanna sweet but silent angel adoring eye ...,"[PERSON, DATE, DATE, PERSON]",67,59
1135,The Cruel Sea,The action commences in 1939. Lieutenant-Comm...,literary fiction,action commences lieutenant commander george e...,"[PERSON, ORG, ORG, CARDINAL, PERSON, PERSON, O...",237,167
1289,The Basic Eight,Flannery Culp is a senior at Roewer High Scho...,literary fiction,flannery culp senior roewer high school san fr...,"[ORG, ORG, GPE, CARDINAL, CARDINAL, PERSON, DA...",143,117
600,The Once and Future King,"Most of the book ""takes place on the isle of ...",fantasy,book take place isle gramarye chronicle raisin...,"[ORG, PERSON, PERSON, ORG, PERSON, PERSON, PER...",121,90


In [7]:
# Now I'm going to convert the lists of entity categories in the column of "entities" into strings of tokens separated by spaces.
# In this way, we'll have the same format in the columns of "cleaned_summary" and "entities".


# Function to convert a list of strings to lowercase and join them
def convert_entities_to_string(entities_list):
    """Lower-case every label in ``entities_list`` and join them with single spaces.

    An empty list yields an empty string.
    """
    lowered = []
    for label in entities_list:
        lowered.append(label.lower())
    return ' '.join(lowered)

In [8]:
# Turn each list of labels into one space-separated, lower-case string.
data['entities'] = data['entities'].map(convert_entities_to_string)
data.sample(6)

Unnamed: 0,title,summary,genre,cleaned_summary,entities,word_count,unique_word_count
10007,Necropolis: City of the Dead,Scarlett Adams' school tutor group is taken o...,thriller,scarlett adam school tutor group taken trip st...,person person person person cardinal cardinal ...,1001,555
2538,The Land of Crimson Clouds,"A spaceship, propelled by a prototype photon ...",science fiction,spaceship propelled prototype photon engine se...,org ordinal cardinal date person gpe person pe...,81,72
3079,I Left My Sneakers in Dimension X,I Left My Sneakers in Dimension X continues t...,science fiction,left sneaker dimension continues adventure rod...,org person person cardinal cardinal gpe org pe...,340,218
1743,Clarissa,"Clarissa Harlowe, the tragic heroine of Clari...",literary fiction,clarissa harlowe tragic heroine clarissa beaut...,org product org person org person person perso...,399,266
1080,The Redbreast,The novel begins with a reference to a fable ...,thriller,novel begin reference fable robin first got re...,cardinal cardinal norp norp norp date cardinal...,842,507
9169,Undead and Unfinished,"Satan, wants her daughter Laura (Betsy's half...",thriller,satan want daughter laura betsy half sister an...,person cardinal person person person person pe...,57,42


In [9]:
# Keep only the two text inputs and the target label.
# .copy() gives an independent frame, so later edits do not touch `data`.
selected_columns = ['cleaned_summary', 'entities', 'genre']
before_vect = data.loc[:, selected_columns].copy()
before_vect

Unnamed: 0,cleaned_summary,entities,genre
0,alex teenager living near future england lead ...,person gpe norp person ordinal person person p...,science fiction
1,text plague divided five part town oran thousa...,cardinal person person person person cardinal ...,literary fiction
2,book tell story paul umer german soldier who u...,person norp norp person org product person per...,literary fiction
3,ged young boy gont one larger island north arc...,person cardinal loc date org cardinal date car...,fantasy
4,living mar deckard acting consultant movie cre...,person,science fiction
...,...,...,...
11008,atticus sullivan last druid life peacefully ar...,person gpe norp norp date date norp norp norp org,fantasy
11009,charlie bucket wonderful adventure begin find ...,person cardinal,fantasy
11010,live dream child born free say like land fathe...,gpe date gpe gpe cardinal,fantasy
11011,rose love dimitri dimitri might love tasha mas...,person date person person person norp date dat...,fantasy


In [13]:
# From here on the cleaned summary text lives in the 'tokens' column.
before_vect = before_vect.rename(columns={'cleaned_summary': 'tokens'})
before_vect.sample(2)

Unnamed: 0,tokens,entities,genre
6812,rachel leah follows story jacob eye rachel lea...,person person person,historical novel
4036,gordon reeve former sa soldier receives phone ...,person org gpe person gpe person person person...,thriller


We're using TF-IDF for the "cleaned_summary" tokens (the column renamed to "tokens" above) and CountVectorizer for the "entities" column, in the context of training a machine learning model to predict book genres.

TF-IDF for Cleaned Summary:

Explanation:
TF-IDF (Term Frequency-Inverse Document Frequency) is a technique commonly used in natural language processing to convert a collection of text documents into numerical features. It emphasizes the importance of words by considering both their frequency in a document (term frequency) and their rarity across the entire dataset (inverse document frequency).
In the "cleaned_summary" column, each row represents a summary of a book. TF-IDF helps capture the significance of individual words within each summary and assigns higher weights to words that are frequent in a specific summary but not across all summaries.
Why TF-IDF:
Summaries often contain words that are indicative of the genre or theme of a book. By using TF-IDF, we can represent each summary as a vector of numerical features that highlight important terms while downplaying common words that may not carry genre-specific information.
CountVectorizer for Entities:

Explanation:
The "entities" column contains information about different types of entities (e.g., 'PERSON', 'ORG', 'DATE') mentioned in each book. These entities can provide valuable information about the content and themes of the book.
CountVectorizer is used to convert the entity types into numerical features by counting the occurrences of each unique entity type in each row. This results in a matrix where each column represents a unique entity type, and the values indicate the count of occurrences for each entity type in a given book.
Why CountVectorizer:
Counting the occurrences of entity types allows us to capture the prevalence of different types of entities in each book. The resulting feature matrix can help the model learn patterns related to the distribution of entity types, which might be indicative of the genre.
Concatenation of Features:

Explanation:
After obtaining TF-IDF features for the cleaned summaries and count-based features for the entities, we concatenate these features into a single feature matrix. This combined matrix is used as input to train a machine learning model for predicting book genres.
Why Concatenation:
Combining the TF-IDF features with entity count features allows the model to leverage information from both textual content and entity occurrences. This can potentially enhance the model's ability to capture diverse aspects of the book content, leading to a more robust genre prediction.

In [17]:

# NOTE(review): both vectorizers are fitted on the full dataset before the
# train/test split, so vocabulary and IDF weights see the test rows too.
# Consider fitting on the training rows only.

# TF-IDF vectorization for cleaned_summary (held in the 'tokens' column)
tfidf_vectorizer = TfidfVectorizer()
tokens_tfidf = tfidf_vectorizer.fit_transform(before_vect['tokens'].astype(str))

# CountVectorizer for entities
entities_vectorizer = CountVectorizer()
entities_matrix = entities_vectorizer.fit_transform(before_vect['entities'].astype(str))

# Concatenate TF-IDF and entities features column-wise.
# Both vectorizers return scipy sparse matrices. Stacking them sparsely avoids
# materialising a dense (documents x vocabulary) array, which can need several
# GB of memory for a corpus of this size. `features` is therefore a CSR sparse
# matrix; train_test_split, RandomForestClassifier and XGBClassifier accept it.
features = hstack([tokens_tfidf, entities_matrix], format='csr')

# Column labels for `features`, in column order. Entity columns get a prefix so
# that an entity type such as 'person' or 'date' cannot collide with the same
# word appearing in the summary vocabulary.
feature_names = list(tfidf_vectorizer.get_feature_names_out()) + [
    f"entity_{name}" for name in entities_vectorizer.get_feature_names_out()
]

### 9. Train_test split

In [None]:
# Split the data into training and testing sets

X, y = features, before_vect['genre']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



### 10. Choose the three models

First of all I'll try two classifiers that follow the scikit-learn estimator API:

- RandomForest classifier (scikit-learn)
- XGBoost classifier (from the separate `xgboost` library)

Afterwards, if time permits, I may also try to train a deep learning model.

In [None]:
# RandomForest Model: fit on the training split, then score on the held-out split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print(f"RandomForest Accuracy: {rf_accuracy}")

# XGBoost Model
# XGBClassifier expects class labels encoded as integers 0..n_classes-1 and
# recent releases raise an error on string labels such as 'fantasy'. The genres
# are therefore encoded for training and the predictions decoded back to genre
# names, so `xgb_predictions` stays directly comparable with `y_test`.
genre_encoder = LabelEncoder().fit(y_train)
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, genre_encoder.transform(y_train))
xgb_predictions = genre_encoder.inverse_transform(xgb_model.predict(X_test))
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"XGBoost Accuracy: {xgb_accuracy}")