### データセットの用意

In [1]:
import pandas as pd
from io import StringIO
import sys

csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

if (sys.version_info < (3, 0)):
    csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [2]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

### sklearnによる平均値補完

In [27]:
# from sklearn.impute import SimpleImputer
from sklearn.impute import SimpleImputer

# strategy:meanで平均、medianで中央値、most_frequentで最頻値
imr = SimpleImputer(strategy='mean')
# df.valuesでarray型に変換できる
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

### 順序特徴量・名義特徴量

In [231]:
# データセットの用意
import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### 順序特徴量のエンコーディング

In [232]:
# サイズのマッピング
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

In [233]:
# 変換
df1 = df
df1['size'] = df['size'].map(size_mapping)
df1

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [234]:
# 逆のマッピング
df2 = df1
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df2['size'] = df2['size'].map(inv_size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [235]:
# 再度変換
df2['size'] = df['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


### 名義特徴量のエンコーディング

In [223]:
# クラスラベルのマッピング
import numpy as np
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [236]:
# 変換
df4 = df2
df4['classlabel'] = df2['classlabel'].map(class_mapping)
df4

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [237]:
# 逆変換
df2 = df
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df2['classlabel'] = df['classlabel'].map(inv_class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [239]:
# 再変換
df4['classlabel'] = df2['classlabel'].map(class_mapping)
df4

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


### 名義特徴量のsklearnによる変換

In [229]:
# dfをnparray型に変換
df4 = df
df4['size'] = df['size'].map(size_mapping)
df4['classlabel'] = df['classlabel'].map(class_mapping)
df4
X = df1[['color', 'size', 'price']].values
X1 = X
X

array([['green', nan, 10.1],
       ['red', nan, 13.5],
       ['blue', nan, 15.3]], dtype=object)

In [89]:
# sklaernで変換
from sklearn.preprocessing import LabelEncoder
color_le = LabelEncoder()
X1[:, 0] = color_le.fit_transform(X1[:, 0])
X1

array([[1, nan, 10.1],
       [2, nan, 13.5],
       [0, nan, 15.3]], dtype=object)

##### ただしこれだとカテゴリ間に順序が生まれてしまうため、あまりよろしくない
→one-hotエンコーディングを利用する

In [90]:
X1

array([[1, nan, 10.1],
       [2, nan, 13.5],
       [0, nan, 15.3]], dtype=object)

In [77]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X1).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [81]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)

X1 = np.array(ct.fit_transform(X1), dtype=np.float)
X1

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  1. ,  0. ,  0. ,  3. , 15.3]])

In [83]:
from sklearn.compose import ColumnTransformer 
ct = ColumnTransformer([("Name_Of_Your_Step", OneHotEncoder(),[0])], remainder="passthrough") # The last arg ([0]) is the list of columns you want to transform in this step
ct.fit_transform(X1)    

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 1. ,  0. ,  1. ,  0. ,  1. ,  0. ,  1. , 10.1],
       [ 1. ,  0. ,  1. ,  0. ,  0. ,  1. ,  2. , 13.5],
       [ 0. ,  1. ,  0. ,  1. ,  0. ,  0. ,  3. , 15.3]])

In [86]:
onehotencoder = OneHotEncoder(categorical_features=[0], handle_unknown='ignore')

onehotencoder.fit_transform(X1).toarray()



TypeError: Wrong type for parameter `n_values`. Expected 'auto', int or array of ints, got <class 'numpy.ndarray'>

In [87]:
X1

array([[ 0. ,  1. ,  0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  1. ,  0. ,  0. ,  3. , 15.3]])