In [56]:
from sklearn import preprocessing
import numpy as np

# Encoder with default settings: the categories of each feature are inferred
# from the data passed to fit().
enc = preprocessing.OneHotEncoder()

# Two samples, three string-valued features each (gender, location, browser).
X = np.array([
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
])
X

array([['male', 'from US', 'uses Safari'],
       ['female', 'from Europe', 'uses Firefox']], dtype='<U12')

In [57]:
# Parameter description for fit():
#   X : array-like, shape [n_samples, n_features]
#       The data to determine the categories of each feature.
enc.fit(X)

OneHotEncoder()

In [58]:
# One-hot encode two new samples and densify the sparse result for display.
enc.transform(
    [
        ['female', 'from US', 'uses Safari'],
        ['male', 'from Europe', 'uses Safari'],
    ]
).toarray()

array([[1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1.]])

In [59]:
enc.categories_ # by default the categories are inferred automatically from the data set
# Explanation (column order follows the categories listed in the output):
# 'female'       is encoded as 1, 0
# 'male'         is encoded as 0, 1
# 'from Europe'  is encoded as 1, 0
# 'from US'      is encoded as 0, 1
# 'uses Firefox' is encoded as 1, 0
# 'uses Safari'  is encoded as 0, 1

[array(['female', 'male'], dtype='<U12'),
 array(['from Europe', 'from US'], dtype='<U12'),
 array(['uses Firefox', 'uses Safari'], dtype='<U12')]

In [60]:
# Full list of allowed values for each feature, in the column order the
# encoder should use.
genders = ['male', 'female']
locations = [
    'from Africa',
    'from Asia',
    'from US',
    'from Europe',
]
browsers = [
    'uses Chrome',
    'uses Firefox',
    'uses IE',
    'uses Safari',
]

# IMPORTANT: the categories are specified explicitly through the `categories`
# parameter. Original author's note: when the data are strings, the array
# passed to fit (X1) must be created with dtype=object.
enc1 = preprocessing.OneHotEncoder(
    categories=[genders, locations, browsers],
)

In [61]:
# Training samples for the encoder with explicit categories; the array is
# created with dtype=object rather than a fixed-width string dtype.
X1 = np.array(
    [
        ['male', 'from US', 'uses Safari'],
        ['female', 'from Europe', 'uses Firefox'],
    ],
    dtype=object,
)
enc1.fit(X1)

OneHotEncoder(categories=[['male', 'female'],
                          ['from Africa', 'from Asia', 'from US',
                           'from Europe'],
                          ['uses Chrome', 'uses Firefox', 'uses IE',
                           'uses Safari']])

In [62]:
# Encode two samples with the explicit-category encoder: 2 + 4 + 4 = 10 columns.
enc1.transform(
    [
        ['female', 'from US', 'uses Safari'],
        ['male', 'from Europe', 'uses Safari'],
    ]
).toarray()

array([[0., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 0., 1.]])

In [63]:
enc1.categories_
# Explanation (column order follows the explicitly supplied category lists):
# 'male'         is encoded as 1, 0
# 'female'       is encoded as 0, 1
# 'from Africa'  is encoded as 1, 0, 0, 0
# 'from Asia'    is encoded as 0, 1, 0, 0
# 'from US'      is encoded as 0, 0, 1, 0
# 'from Europe'  is encoded as 0, 0, 0, 1
# 'uses Chrome'  is encoded as 1, 0, 0, 0
# 'uses Firefox' is encoded as 0, 1, 0, 0
# 'uses IE'      is encoded as 0, 0, 1, 0
# 'uses Safari'  is encoded as 0, 0, 0, 1

[array(['male', 'female'], dtype=object),
 array(['from Africa', 'from Asia', 'from US', 'from Europe'], dtype=object),
 array(['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari'],
       dtype=object)]

In [64]:
# If the training data may be missing some category values, it is usually best
# to pass handle_unknown='ignore'; otherwise transforming an unseen value
# raises an error.
enc2 = preprocessing.OneHotEncoder(handle_unknown='ignore')

# Training data for this encoder. Built as a NumPy string array, exactly like
# X in the first cell, so the fitted categories have the same dtype.
X2 = np.array([['male', 'from US', 'uses Safari'],
               ['female', 'from Europe', 'uses Firefox']])
# Fit on X2 (the original fitted on X from an earlier cell and left X2 unused).
enc2.fit(X2)

# 'from Asia' and 'uses Chrome' were not seen during fit, so their column
# groups are encoded as all zeros: 0, 0 and 0, 0.
enc_output = enc2.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
enc_output

array([[1., 0., 0., 0., 0., 0.]])

In [65]:
# Categories learned during fit; the unknown values passed to transform() are not added.
enc2.categories_

[array(['female', 'male'], dtype='<U12'),
 array(['from Europe', 'from US'], dtype='<U12'),
 array(['uses Firefox', 'uses Safari'], dtype='<U12')]

In [66]:
# Features whose one-hot group is all zeros (unknown category) come back as None.
enc2.inverse_transform(enc_output) # convert the encoded data back to its original representation

array([['female', None, None]], dtype=object)