### Tratamiento de variables categóricas no ordenadas

In [1]:
import pandas as pd
pd.set_option("display.max_columns", None)  # show every column: no limit on the number of displayed columns
from io import StringIO

In [2]:
# Example data kept in a single CSV-formatted string:
# one header row followed by ten records, each terminated by a newline.

datos = (
    "age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class\n"
    "39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K\n"
    "50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K\n"
    "38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K\n"
    "53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K\n"
    "28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K\n"
    "37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K\n"
    "49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K\n"
    "52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K\n"
    "31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K\n"
    "42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K\n"
)

In [3]:
# StringIO wraps the string in an in-memory text buffer,
# so read_csv can parse it exactly as if it were reading a CSV file:
df = pd.read_csv(filepath_or_buffer=StringIO(datos))
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [4]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(10, 15)

Vemos que hay 15 variables en total.

In [5]:
# Data type of each column of df; the text columns are reported as object.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object

Como vemos, hay 9 variables de tipo object.

In [6]:
# Dichotomization ("one-hot encoding"):
# each object column is converted into as many binary ("dummy") columns as it has distinct values.
# dtype="uint8" is passed explicitly: pandas >= 2.0 creates bool dummies (True/False) by default,
# whereas older versions created uint8 ones; pinning it yields 0/1 integer columns on any version.

df_con_dummies = pd.get_dummies(df, dtype="uint8")
df_con_dummies

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Private,workclass_Self-emp-not-inc,workclass_State-gov,education_11th,education_9th,education_Bachelors,education_HS-grad,education_Masters,marital-status_Divorced,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,occupation_Adm-clerical,occupation_Exec-managerial,occupation_Handlers-cleaners,occupation_Other-service,occupation_Prof-specialty,relationship_Husband,relationship_Not-in-family,relationship_Wife,race_Black,race_White,sex_Female,sex_Male,native-country_Cuba,native-country_Jamaica,native-country_United-States,class_<=50K,class_>50K
0,39,77516,13,2174,0,40,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0
1,50,83311,13,0,0,13,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0
2,38,215646,9,0,0,40,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,0
3,53,234721,7,0,0,40,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0
4,28,338409,13,0,0,40,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0
5,37,284582,14,0,0,40,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,1,1,0
6,49,160187,5,0,0,16,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0
7,52,209642,9,0,0,45,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1
8,31,45781,14,14084,0,50,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,1
9,42,159449,13,5178,0,40,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1


In [7]:
# Data type of each column after the one-hot encoding.
df_con_dummies.dtypes

age                                     int64
fnlwgt                                  int64
education-num                           int64
capital-gain                            int64
capital-loss                            int64
hours-per-week                          int64
workclass_Private                       uint8
workclass_Self-emp-not-inc              uint8
workclass_State-gov                     uint8
education_11th                          uint8
education_9th                           uint8
education_Bachelors                     uint8
education_HS-grad                       uint8
education_Masters                       uint8
marital-status_Divorced                 uint8
marital-status_Married-civ-spouse       uint8
marital-status_Married-spouse-absent    uint8
marital-status_Never-married            uint8
occupation_Adm-clerical                 uint8
occupation_Exec-managerial              uint8
occupation_Handlers-cleaners            uint8
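occupation_Other-service                uint8
occupation_Prof-specialty               uint8
relationship_Husband                    uint8
relationship_Not-in-family              uint8
relationship_Wife                       uint8
race_Black                              uint8
race_White                              uint8
sex_Female                              uint8
sex_Male                                uint8
native-country_Cuba                     uint8
native-country_Jamaica                  uint8
native-country_United-States            uint8
class_<=50K                             uint8
class_>50K                              uint8
dtype: object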

In [8]:
# Dimensions of the one-hot encoded frame as a (rows, columns) tuple.
df_con_dummies.shape

(10, 35)

Como vemos, get_dummies devuelve un nuevo DataFrame en el que ya no hay columnas de tipo object, y que tiene más columnas que el original (35 frente a 15), porque cada variable categórica se ha sustituido por sus dummies.

In [9]:
# Second version of the dichotomization:
# if an object variable has k distinct values, get_dummies generates k dummies, but they are redundant
# because the k dummies of one variable always add up to 1 in every row, so one of them can be dropped.
# The option drop_first=True drops the dummy associated with the first category of each variable.
# dtype="uint8" is passed explicitly: pandas >= 2.0 creates bool dummies (True/False) by default,
# whereas older versions created uint8 ones; pinning it yields 0/1 integer columns on any version.

df_con_dummies2 = pd.get_dummies(df, drop_first=True, dtype="uint8")
df_con_dummies2

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Self-emp-not-inc,workclass_State-gov,education_9th,education_Bachelors,education_HS-grad,education_Masters,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,occupation_Exec-managerial,occupation_Handlers-cleaners,occupation_Other-service,occupation_Prof-specialty,relationship_Not-in-family,relationship_Wife,race_White,sex_Male,native-country_Jamaica,native-country_United-States,class_>50K
0,39,77516,13,2174,0,40,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1,0,1,0
1,50,83311,13,0,0,13,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0
2,38,215646,9,0,0,40,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0
3,53,234721,7,0,0,40,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0
4,28,338409,13,0,0,40,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0
5,37,284582,14,0,0,40,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,1,0
6,49,160187,5,0,0,16,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0
7,52,209642,9,0,0,45,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1
8,31,45781,14,14084,0,50,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1
9,42,159449,13,5178,0,40,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1


In [10]:
# Dimensions of the drop_first=True encoded frame as a (rows, columns) tuple.
df_con_dummies2.shape

(10, 26)

Vemos que hay 9 columnas menos que en el anterior get_dummies (una por cada una de las 9 columnas object).

In [11]:
# When only some of the object variables should be dichotomized, list them in `columns`;
# every other column (including the remaining object ones) is left untouched.
# dtype="uint8" is passed explicitly: pandas >= 2.0 creates bool dummies (True/False) by default,
# whereas older versions created uint8 ones; pinning it yields 0/1 integer columns on any version.

df_con_dummies3 = pd.get_dummies(
    df,
    columns=["education", "workclass"],
    drop_first=True,
    dtype="uint8",
)
df_con_dummies3

Unnamed: 0,age,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class,education_9th,education_Bachelors,education_HS-grad,education_Masters,workclass_Self-emp-not-inc,workclass_State-gov
0,39,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,1,0,0,0,1
1,50,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,1,0,0,1,0
2,38,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,1,0,0,0
3,53,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,0,0,0
4,28,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,1,0,0,0,0
5,37,284582,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0
6,49,160187,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,1,0,0,0,0,0
7,52,209642,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,0,0,1,0,1,0
8,31,45781,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,0,0,0,1,0,0
9,42,159449,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,0,1,0,0,0,0


In [12]:
# Data type of each column after encoding only the selected variables.
df_con_dummies3.dtypes

age                            int64
fnlwgt                         int64
education-num                  int64
marital-status                object
occupation                    object
relationship                  object
race                          object
sex                           object
capital-gain                   int64
capital-loss                   int64
hours-per-week                 int64
native-country                object
class                         object
education_9th                  uint8
education_Bachelors            uint8
education_HS-grad              uint8
education_Masters              uint8
workclass_Self-emp-not-inc     uint8
workclass_State-gov            uint8
dtype: object

In [13]:
# Dimensions of the partially encoded frame as a (rows, columns) tuple.
df_con_dummies3.shape

(10, 19)

Vemos que se han pasado a dummies solo las variables education y workclass.