# Categorical data encoding


## 1. Import libraries and load dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
# Read the books dataset; the relative path is resolved against the kernel's
# working directory. The bare `df` on the last line displays the frame.
df = pd.read_csv(filepath_or_buffer='DatasetLibros.csv')
df

Unnamed: 0,libro,autor,editorial,anio_publicacion,num_paginas,formato,categoria,rating,popularidad_libro
0,Aplicaciones modernas de Ambiental,Esteban Aguilar,Alfaomega,2001,294,Físico,Ambiental,3.01,2
1,Sistematizacion de Ciencias de la salud,Carlos Pérez,Cambridge Press,2022,592,Digital,Ciencias de la salud,1.25,1
2,Conocimientos de Ambiental,Nuria Alvarez,Springer,2001,759,Físico,Ambiental,4.14,6
3,Sistematizacion de Filosofía,Sergio Gomez,Alfaomega,2015,712,Físico,Filosofía,1.37,10
4,Pilares de la ingeniería Mecanica,Carlos Javier,McGraw-Hill,2012,328,Físico,Mecanica,1.24,1
...,...,...,...,...,...,...,...,...,...
2631,Procesos de la ingeniería Artes,Ricardo Gualán,Springer,2014,740,Físico,Artes,4.84,5
2632,Pilares de la ingeniería Industrial,Nuria Alvarez,McGraw-Hill,2004,546,Físico,Industrial,1.47,9
2633,Principios de Mecanica,Alfons Gonzalez,Alfaomega,2011,676,Digital,Mecanica,1.10,5
2634,Fundamentos de Derecho,María Torres,McGraw-Hill,2015,510,Físico,Derecho,1.96,9


-------------
## 2. One-hot encoding

Categorical columns to be transformed

In [4]:
# Columns treated as categorical by the encoding sections that follow.
cat_cols = [
    "autor",
    "editorial",
    "formato",
    "categoria",
    "libro",
]

In [5]:
# One-hot encode every column in cat_cols. drop_first=True omits the first
# level of each column, so k categories become k-1 indicator columns.
df_ohe = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

-------------
## 3. Label encoding

In [6]:
# Label encoding: replace each categorical value with an integer code.
# Work on a copy so the raw frame `df` is left untouched.
df_le = df.copy()

# Keep every fitted encoder, keyed by column name. Without this the encoder is
# overwritten on each iteration and the codes can no longer be translated back
# (label_encoders[col].inverse_transform) or re-applied to new data.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_le[col] = le.fit_transform(df_le[col])
    label_encoders[col] = le



-------------
## 4. Target encoding


In [8]:
# Target (mean) encoding: every category value is replaced by the average
# "rating" of the rows that share that value.
# NOTE(review): the averages are computed over the whole of `df`, i.e. before
# any train/test split, so a model trained on df_te would be fed information
# derived from the targets of its evaluation rows. Confirm this is intended.
df_te = df.copy()
for col in cat_cols:
    means = df["rating"].groupby(df[col]).mean()
    df_te[col] = df_te[col].map(means)



-------------
## 5.1. Training without encoding techniques


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

Use original dataset

In [9]:
# Independent copy of the raw frame for the no-encoding baseline.
df_original = df.copy(deep=True)

Select only numeric columns

In [10]:
# Feature set for the baseline: every numeric column except the target.
# include="number" selects all numeric dtypes, not only int64/float64, so
# columns stored with another width or a nullable dtype are not dropped.
numeric_cols = [
    c
    for c in df_original.select_dtypes(include="number").columns
    if c != "rating"
]

X_no_encoding = df_original[numeric_cols]
y_no_encoding = df_original["rating"]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X_no_encoding,
    y_no_encoding,
    test_size=0.2,
    random_state=42,
)

# Model A: linear regression fitted on the un-encoded feature set.
model_A = LinearRegression().fit(X_train_A, y_train_A)

# Predictions on the held-out rows.
pred_A = model_A.predict(X_test_A)

In [None]:
# Score model A on the held-out rows; RMSE is the square root of the MSE.
r2_A = r2_score(y_test_A, pred_A)
mse_A = mean_squared_error(y_test_A, pred_A)

print("R²:", r2_A)
print("MSE:", mse_A)
print("RMSE:", np.sqrt(mse_A))



-------------
## 5.2. Training with techniques - Using one-hot encoding and feature scaling


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

Use encoded dataset

In [13]:
# Independent copy of the one-hot-encoded frame for model B.
df_encoded = df_ohe.copy(deep=True)

In [14]:
# Target is "rating"; every other column of the encoded frame is a feature.
y_B = df_encoded["rating"]
X_B = df_encoded.drop("rating", axis="columns")

# Standardise the features, then fit a linear regression. Wrapping both steps
# in a Pipeline means the scaler is fitted only on the data passed to fit().
pipeline_B = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])

# 80/20 split with a fixed random_state for reproducibility.
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(
    X_B,
    y_B,
    test_size=0.2,
    random_state=42,
)

pipeline_B.fit(X_train_B, y_train_B)

# Predictions on the held-out rows.
pred_B = pipeline_B.predict(X_test_B)