### One-hot encoding

For example, imagine we have a “color” variable with three categories (‘red’, ‘green’, and ‘blue’). In this case, three binary variables are needed: a “1” is placed in the binary variable for the observed color and a “0” in each of the other two.

| red | green | blue |
|-----|-------|------|
| 1   | 0     | 0    |
| 0   | 1     | 0    |
| 0   | 0     | 1    |

In [1]:
# one hot encode the breast cancer dataset
from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder

In [2]:
# remote CSV that the rest of the notebook loads and encodes
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
)

In [3]:
# load the dataset
# header=None: the first line of the file is treated as data, so the columns
# are labelled with integers (0-9 in the preview below).
# NOTE(review): the loaded strings keep their literal single quotes
# (e.g. "'premeno'"), as the outputs further down show.
dataset = read_csv(url, header=None)

In [4]:
# preview the loaded DataFrame
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [5]:
# distinct values found in column 1 (three categories, quotes included in each string)
dataset[1].unique()

array(["'premeno'", "'ge40'", "'lt40'"], dtype=object)

In [6]:
# pull the underlying NumPy array out of the DataFrame
data = dataset.to_numpy()

In [7]:
# (rows, columns) of the raw array
data.shape

(286, 10)

In [8]:
# every column except the last is an input; the last column is the target.
# both pieces are cast to plain strings.
X, y = data[:, :-1].astype(str), data[:, -1].astype(str)

In [9]:
# peek at the first three rows of the raw inputs
print(X[:3])

[["'40-49'" "'premeno'" "'15-19'" "'0-2'" "'yes'" "'3'" "'right'"
  "'left_up'" "'no'"]
 ["'50-59'" "'ge40'" "'15-19'" "'0-2'" "'no'" "'1'" "'right'" "'central'"
  "'no'"]
 ["'50-59'" "'ge40'" "'35-39'" "'0-2'" "'no'" "'2'" "'left'" "'left_low'"
  "'no'"]]


In [10]:
# inputs keep every row but have one column fewer than `data` (the target was split off)
X.shape

(286, 9)

In [26]:
# define the one hot encoding transform, asking for a dense array as output
# rather than the default sparse matrix
try:
    # scikit-learn >= 1.2 names this option `sparse_output`
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # older scikit-learn releases only understand the original `sparse` keyword
    encoder = OneHotEncoder(sparse=False)

In [27]:
# display the encoder's configuration
encoder

OneHotEncoder(sparse=False)

In [28]:
# learn the categories of each input column, then encode those same inputs
X_oe = encoder.fit(X).transform(X)

In [29]:
# same number of rows as X; the 9 input columns expand to 43 binary columns
X_oe.shape

(286, 43)

In [32]:
# first three rows of the un-encoded inputs
X[:3,:]

array([["'40-49'", "'premeno'", "'15-19'", "'0-2'", "'yes'", "'3'",
        "'right'", "'left_up'", "'no'"],
       ["'50-59'", "'ge40'", "'15-19'", "'0-2'", "'no'", "'1'",
        "'right'", "'central'", "'no'"],
       ["'50-59'", "'ge40'", "'35-39'", "'0-2'", "'no'", "'2'", "'left'",
        "'left_low'", "'no'"]], dtype='<U11')

In [31]:
# summarize the transformed data: first three encoded rows
print(X_oe[:3])

[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]


In [34]:
# distinct values appearing in the first record (row 0), target column included
dataset.iloc[0,:].unique()

array(["'40-49'", "'premeno'", "'15-19'", "'0-2'", "'yes'", "'3'",
       "'right'", "'left_up'", "'no'", "'recurrence-events'"],
      dtype=object)

In [16]:
# distinct values found in column 0 (six categories)
dataset[0].unique()

array(["'40-49'", "'50-59'", "'60-69'", "'30-39'", "'70-79'", "'20-29'"],
      dtype=object)

In [17]:
# default settings: the transform now returns a sparse matrix (CSR format,
# per the repr a few cells below) instead of a dense array
encoder = OneHotEncoder()

In [18]:
# learn the categories of each input column, then encode those same inputs
X_oe = encoder.fit(X).transform(X)

In [19]:
# same (286, 43) shape as the dense result; only the storage format differs
X_oe.shape

(286, 43)

In [20]:
# first seven rows of encoded column 0; the repr reports how many entries are stored
X_oe[0:7,0]

<7x1 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [21]:
# summarize the transformed data: printing a sparse row lists only its stored
# entries as "(row, column) value" pairs
print(X_oe[:1, :])

  (0, 2)	1.0
  (0, 8)	1.0
  (0, 11)	1.0
  (0, 20)	1.0
  (0, 28)	1.0
  (0, 32)	1.0
  (0, 34)	1.0
  (0, 37)	1.0
  (0, 41)	1.0


In [None]:
# Row 0's value in column 0 is the third of that column's six sorted categories,
# so its one-hot slice is [0, 0, 1, 0, 0, 0] and the 1 lands in encoded column 2.
col0_one_hot = [0, 0, 1, 0, 0, 0]
col0_one_hot.index(1)

In [None]:
# Column 1 has three categories and row 0 holds the last one in sorted order, so
# its one-hot slice is [0, 0, 1]. That slice follows column 0's six slots, which
# puts the 1 in encoded column 6 + 2 = 8.
col1_one_hot = [0, 0, 1]
6 + col1_one_hot.index(1)

In [22]:
import pandas as pd

In [23]:
# small toy table: one row per fruit, given as (name, color) records
fruit = pd.DataFrame(
    [
        ('apple', 'red'),
        ('banana', 'yellow'),
        ('cherry', 'red'),
        ('durian', 'green'),
    ],
    columns=['name', 'color'],
)
fruit

Unnamed: 0,name,color
0,apple,red
1,banana,yellow
2,cherry,red
3,durian,green


In [24]:
from sklearn.preprocessing import LabelEncoder

# replace each fruit name with its integer label code (fit and encode in one call)
le = LabelEncoder()
fruit['name'] = le.fit_transform(fruit['name'])
fruit

Unnamed: 0,name,color
0,0,red
1,1,yellow
2,2,red
3,3,green
