# Categorical Data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Small example table: each record is (color, size, price, classlabel).
records = [
    ("green", "M", 10.1, "class1"),
    ("red", "L", 13.5, "class2"),
    ("blue", "XL", 15.3, "class1"),
]
df = pd.DataFrame(records, columns=["color", "size", "price", "classlabel"])
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [4]:
# Map each size label to an integer code (M -> 1, L -> 2, XL -> 3) and
# overwrite the "size" column with those codes.
size_mapping = dict(XL=3, L=2, M=1)
size_codes = df["size"].map(size_mapping)
df["size"] = size_codes
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [5]:
# Reverse lookup (code -> size label); shown here without modifying df.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df["size"].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [6]:
# Build a label -> integer code mapping from the distinct class labels.
# np.unique returns the labels sorted, so the codes do not depend on the order
# in which the classes first appear in the rows. (Series.unique() is NOT the
# same: it returns the labels in order of appearance.)
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [7]:
# Swap every class label string for its integer code from class_mapping.
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [8]:
# Invert class_mapping (code -> label) and restore the string labels in df.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [9]:
from sklearn.preprocessing import LabelEncoder

# Same label -> integer encoding as the manual mapping, done by scikit-learn:
# fit on the class labels, then transform them to integer codes.
class_le = LabelEncoder()
class_le.fit(df['classlabel'])
y = class_le.transform(df['classlabel'])
y

array([0, 1, 0], dtype=int64)

In [10]:
# Decode the integer codes in y back to the original class label strings.
decoded_labels = class_le.inverse_transform(y)
decoded_labels

array(['class1', 'class2', 'class1'], dtype=object)

In [14]:
# Feature matrix: an independent copy of the three feature columns, so that
# encoding the color column below does not touch df.
X = df[['color', 'size', 'price']].copy()
color_le = LabelEncoder()
# Assign the whole column rather than using X.loc[:, 'color'] = ...:
# with pandas >= 2.0 a full-slice .loc assignment writes into the existing
# object-dtype column, leaving 'color' as dtype object instead of integer.
X['color'] = color_le.fit_transform(X['color'])
X

Unnamed: 0,color,size,price
0,1,1,10.1
1,2,2,13.5
2,0,3,15.3


In [15]:
# Show the integer codes assigned to the colors using the encoder that is
# already fitted. Use transform() on the original color strings: calling
# fit_transform() on X['color'] (which already holds integer codes) would
# re-fit the encoder on those codes and overwrite color_le.classes_, so
# inverse_transform() could no longer recover the color names.
color_le.transform(df['color'])

array([1, 2, 0], dtype=int64)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode only the first column (color) and pass the remaining columns
# (size, price) through unchanged. OneHotEncoder's `categorical_features`
# argument was removed in scikit-learn 0.22; ColumnTransformer is the
# replacement for selecting which columns to encode.
ohe = OneHotEncoder()
color_ct = ColumnTransformer(
    [("color_onehot", ohe, [0])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense ndarray (replaces .toarray())
)
color_ct.fit_transform(X)

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

In [22]:
# One-hot encode with pandas: only the string column (color) is expanded into
# dummy columns; the numeric columns are left as they are.
# dtype=int pins the dummy columns to 0/1 integers; since pandas 2.0 the
# default dummy dtype is bool, which would display True/False instead.
pd.get_dummies(df[['price', 'color', 'size']], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0
