In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')
import os

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv("/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/final_project/data/model_data.csv")
data.head(4)

Unnamed: 0,title,summary,genre,cleaned_summary,entities,word_count,unique_word_count
0,A Clockwork Orange,"Alex, a teenager living in near-future Englan...",science fiction,alex teenager living near future england lead ...,"[('alex', 'PERSON'), ('england', 'GPE'), ('rus...",588,416
1,The Plague,The text of The Plague is divided into five p...,literary fiction,text plague divided five part town oran thousa...,"[('five', 'CARDINAL'), ('dr bernard rieux', 'P...",609,424
2,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...",literary fiction,book tell story paul umer german soldier who u...,"[('paul umer', 'PERSON'), ('german', 'NORP'), ...",375,277
3,A Wizard of Earthsea,"Ged is a young boy on Gont, one of the larger...",fantasy,ged young boy gont one larger island north arc...,"[('gont', 'PERSON'), ('one', 'CARDINAL'), ('ar...",549,371


In [4]:
# We choose the useful columns to train the model
# We change the name of the preprocessed summaries column

selected_columns = ['genre', 'cleaned_summary']
new_data = data[selected_columns].copy()
new_data.rename(columns = {'cleaned_summary':'tokens'}, inplace = True)
new_data.head(2)

Unnamed: 0,genre,tokens
0,science fiction,alex teenager living near future england lead ...
1,literary fiction,text plague divided five part town oran thousa...


In [5]:
new_data.isna().sum()

genre     0
tokens    2
dtype: int64

In [6]:
new_data.dropna(inplace=True)

In [7]:
new_data.isna().sum()

genre     0
tokens    0
dtype: int64

In [8]:
# We're going to convert the genres into numerical values. 

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(new_data['genre'])
y_encoded

array([3, 2, 2, ..., 0, 0, 0])

In [9]:
# Again, I'm going to vectorize the values of the column 'tokens'. 

# TF-IDF vectorization for tokens
tfidf_vectorizer = TfidfVectorizer()
tokens_tfidf = tfidf_vectorizer.fit_transform(new_data['tokens'].astype(str))

# Create a DataFrame with TF-IDF vectors
features = pd.DataFrame(tokens_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [10]:
features.shape[0]

11011

In [11]:
y_encoded.shape

(11011,)

In [12]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris

# Load your dataset, or replace this with your data loading code
iris = load_iris() 
X = features
y = y_encoded


In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [14]:
# Define the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y)))

# Define the parameter grid to search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters: ", grid_search.best_params_)

KeyboardInterrupt: 

In [17]:
# Set the best parameters obtained from GridSearchCV
best_params = {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8}

# Create the XGBoost classifier with the best parameters
xgb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',  
    num_class=len(set(y_train)),
    **best_params
)

In [18]:
# Train the model on the training set
xgb_classifier.fit(X_train, y_train)

In [19]:
y_pred = xgb_classifier.predict(X_test)

In [20]:
# Classification Report (provides precision, recall, f1-score, and support for each class)
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test, y_pred))

rf_accuracy = accuracy_score(y_test, y_pred)
print(f"\nXgboost Accuracy: {rf_accuracy}")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Xgboost Accuracy: 1.0


In [21]:
# Evaluating metrics
accuracy_2 = accuracy_score(y_test, y_pred)
precision_2 = precision_score(y_test, y_pred, average='macro')
recall_2 = recall_score(y_test, y_pred, average='macro')
f1_2 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy_2*100, '%')
print("Precision:", precision_2*100, '%')
print("Recall:", recall_2*100, '%')
print("F1 score:", f1_2*100, '%')

Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1 score: 100.0 %


In [None]:
# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))