# Creando la matriz $\boldsymbol{X}$ y el vector $\boldsymbol{y}$ 

<img src="https://raw.githubusercontent.com/fhernanb/fhernanb.github.io/master/my_docs/logo_unal_color.png" alt="drawing" width="200"/>

Las explicaciones mostradas aquí están basadas en los ejemplos de:

https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

In [1]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Datos

Vamos a crear un dataframe con unas pocas filas para obtener la matriz $\boldsymbol{X}$ y el vector $\boldsymbol{y}$ 

In [2]:
# Toy records: one list per column, one position per person
data = dict(
    nombre=['tom', 'nick', 'julia', 'marta'],
    sexo=['hombre', 'hombre', 'mujer', 'mujer'],
    edad=[10, 15, 14, 20],
    barrio=['Alcala', 'Villas', 'Alcala', 'Provenza'],
    grupo=['g2', 'g3', 'g1', 'g1'],
    estatura=[156, 174, 169, 180],
    peso=[67, 76, 75, 79],
    deporte=['nunca', 'a veces', 'a veces', 'siempre'],
)

# Build the DataFrame from the column dictionary
df = pd.DataFrame(data)

# Keep the frame as the last expression so the notebook renders it
df

Unnamed: 0,nombre,sexo,edad,barrio,grupo,estatura,peso,deporte
0,tom,hombre,10,Alcala,g2,156,67,nunca
1,nick,hombre,15,Villas,g3,174,76,a veces
2,julia,mujer,14,Alcala,g1,169,75,a veces
3,marta,mujer,20,Provenza,g1,180,79,siempre


# Ejemplo 1 con Ordinal Encoder

Crear la matriz $\boldsymbol{X}$ usando como variables explicativas `sexo` y `grupo`.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

In [3]:
# Ordinal encoding: every category of each column becomes an integer code.
# Fitting and transforming happen in one call; `enc` remains fitted so it
# can be reused afterwards (e.g. to invert the encoding).
enc = OrdinalEncoder()
X = enc.fit_transform(df[["sexo", "grupo"]])

# Show the encoded matrix next to the original labels for comparison
print("La matriz X es:", X, "\n", sep="\n")
print("Los datos originales son:", df[["sexo", "grupo"]], sep="\n")

La matriz X es:
[[0. 1.]
 [0. 2.]
 [1. 0.]
 [1. 0.]]


Los datos originales son:
     sexo grupo
0  hombre    g2
1  hombre    g3
2   mujer    g1
3   mujer    g1


Vamos ahora a realizar el proceso inverso. Vamos a convertir el arreglo `[[1, 0], [0, 2], [0, 1]]` en las etiquetas originales.

In [4]:
# Encoded rows (one [sexo, grupo] pair per row) to map back to their labels
codigos = [[1, 0], [0, 2], [0, 1]]
enc.inverse_transform(codigos)

array([['mujer', 'g1'],
       ['hombre', 'g3'],
       ['hombre', 'g2']], dtype=object)

# Ejemplo 2 con One Hot Encoding

Crear la matriz $\boldsymbol{X}$ usando como variables explicativas `sexo` y `grupo`.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [5]:
# One-hot encoding: one indicator column per category level.
# The result is a sparse matrix, so print() lists only the non-zero
# entries in the form "(row, column) value".
enc = OneHotEncoder()
X = enc.fit_transform(df[["sexo", "grupo"]])

# Show the encoded matrix next to the original labels for comparison
print("La matriz X es:", X, "\n", sep="\n")
print("Los datos originales son:", df[["sexo", "grupo"]], sep="\n")

La matriz X es:
  (0, 0)	1.0
  (0, 3)	1.0
  (1, 0)	1.0
  (1, 4)	1.0
  (2, 1)	1.0
  (2, 2)	1.0
  (3, 1)	1.0
  (3, 2)	1.0


Los datos originales son:
     sexo grupo
0  hombre    g2
1  hombre    g3
2   mujer    g1
3   mujer    g1


# Ejemplo 3 con Dummy Variable Encoding

Crear la matriz $\boldsymbol{X}$ usando como variables explicativas `sexo` y `grupo`, eliminando la primera categoría de cada variable (codificación *dummy*).

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [6]:
# Dummy encoding: one-hot encoding with the first category of every
# variable dropped, so a variable with k levels yields k - 1 columns.
# The result is sparse; print() lists only the non-zero entries.
enc = OneHotEncoder(drop="first")
X = enc.fit_transform(df[["sexo", "grupo"]])

# Show the encoded matrix next to the original labels for comparison
print("La matriz X es:", X, "\n", sep="\n")
print("Los datos originales son:", df[["sexo", "grupo"]], sep="\n")

La matriz X es:
  (0, 1)	1.0
  (1, 2)	1.0
  (2, 0)	1.0
  (3, 0)	1.0


Los datos originales son:
     sexo grupo
0  hombre    g2
1  hombre    g3
2   mujer    g1
3   mujer    g1


# Ejemplo 4 con Label Encoder

Crear el vector $\boldsymbol{y}$ suponiendo como variable respuesta `deporte`.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [7]:
# Encode the response variable as integer class labels in a single step;
# `le` stays fitted after the call.
le = LabelEncoder()
y = le.fit_transform(df["deporte"])

# Show the encoded vector next to the original labels for comparison
print("El vector y es:", y, "\n", sep="\n")
print("Los datos originales son:", df["deporte"], sep="\n")

El vector y es:
[1 0 0 2]


Los datos originales son:
0      nunca
1    a veces
2    a veces
3    siempre
Name: deporte, dtype: object


# Ejemplo de Jason Brownlee

https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

In [8]:
# Logistic regression on the breast cancer dataset, inputs ordinal-encoded
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score

# Download the raw CSV (it has no header row) and take its values as an array
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
dataset = read_csv(url, header=None)
data = dataset.values

# Last column is the target; every other column is an input. All as strings.
X, y = data[:, :-1].astype(str), data[:, -1].astype(str)

# Hold out 33% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# Learn the input encoding on the training rows only, then apply it to both splits
ordinal_encoder = OrdinalEncoder()
X_train = ordinal_encoder.fit_transform(X_train)
X_test = ordinal_encoder.transform(X_test)

# Same procedure for the target labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Fit the classifier on the training split and predict the held-out rows
model = LogisticRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report test accuracy as a percentage
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 75.79


In [9]:
# Logistic regression on the breast cancer dataset, inputs one-hot encoded
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Download the raw CSV (it has no header row) and take its values as an array
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
dataset = read_csv(url, header=None)
data = dataset.values

# Last column is the target; every other column is an input. All as strings.
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

# Hold out 33% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# One-hot encode the inputs, learning the category levels from the training
# rows only. handle_unknown="ignore" makes a level that appears only in the
# test split encode as all zeros for that feature instead of raising an
# error at transform time (the default behaviour), so the cell keeps
# working if the split or the seed is changed.
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# Label-encode the target variable (class names -> integer labels)
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Fit the classifier on the training split and predict the held-out rows
model = LogisticRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report test accuracy as a percentage
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 70.53
