# データ準備

In [1]:
import pandas as pd

In [2]:
# Toy purchase log used throughout the notebook: one row per customer with the
# price paid, the cup size and the drink ordered. Built column by column; the
# key order fixes the column order (user, price, size, drink).
df = pd.DataFrame({
    'user': ['Aさん', 'Bさん', 'Cさん', 'Dさん', 'Eさん',
             'Fさん', 'Gさん', 'Hさん', 'Iさん', 'Jさん'],
    'price': [100, 150, 200, 100, 200, 200, 150, 200, 100, 200],
    'size': ['S', 'M', 'L', 'S', 'L', 'L', 'M', 'L', 'S', 'L'],
    'drink': ['cola', 'tea', 'coffee', 'tea', 'coffee',
              'tea', 'tea', 'coffee', 'cola', 'tea'],
})
df

Unnamed: 0,user,price,size,drink
0,Aさん,100,S,cola
1,Bさん,150,M,tea
2,Cさん,200,L,coffee
3,Dさん,100,S,tea
4,Eさん,200,L,coffee
5,Fさん,200,L,tea
6,Gさん,150,M,tea
7,Hさん,200,L,coffee
8,Iさん,100,S,cola
9,Jさん,200,L,tea


# 名義尺度
## get_dummies

In [3]:
# One-hot encode the nominal `drink` column with pandas: one indicator column
# per distinct value. dtype=int pins the indicators to 0/1 integers; without it
# pandas >= 2.0 returns booleans and the table no longer matches the output shown.
df_gd = pd.get_dummies(df['drink'], dtype=int)
df_gd

Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,1,0,0
8,0,1,0
9,0,0,1


In [4]:
# Append the indicator columns to the right of the original frame (row index aligned).
df_dummy = pd.concat([df, df_gd], axis='columns')
df_dummy

Unnamed: 0,user,price,size,drink,coffee,cola,tea
0,Aさん,100,S,cola,0,1,0
1,Bさん,150,M,tea,0,0,1
2,Cさん,200,L,coffee,1,0,0
3,Dさん,100,S,tea,0,0,1
4,Eさん,200,L,coffee,1,0,0
5,Fさん,200,L,tea,0,0,1
6,Gさん,150,M,tea,0,0,1
7,Hさん,200,L,coffee,1,0,0
8,Iさん,100,S,cola,0,1,0
9,Jさん,200,L,tea,0,0,1


## OneHotEncoder

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
# One-hot encode `drink` with scikit-learn, asking for a dense integer array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new spelling first and fall back on older releases.
try:
    oenc = OneHotEncoder(sparse_output=False, dtype=int)
except TypeError:
    oenc = OneHotEncoder(sparse=False, dtype=int)
# The encoder expects a 2-D input, hence the double brackets (a one-column frame).
oenc.fit(df[['drink']])
oenc_vec = oenc.transform(df[['drink']])
print(type(oenc_vec))
oenc_vec

<class 'numpy.ndarray'>


array([[0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [7]:
# Wrap the encoded array in a DataFrame. Column labels come from the fitted
# encoder (categories of the first and only input feature) so they cannot drift
# out of sync with the column order the encoder actually produced.
df_oenc = pd.DataFrame(oenc_vec, columns=oenc.categories_[0])
df_oenc

Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,1,0,0
8,0,1,0
9,0,0,1


In [8]:
# Attach the OneHotEncoder indicator columns to the original frame, side by side.
df_oht = pd.concat([df, df_oenc], axis='columns')
df_oht

Unnamed: 0,user,price,size,drink,coffee,cola,tea
0,Aさん,100,S,cola,0,1,0
1,Bさん,150,M,tea,0,0,1
2,Cさん,200,L,coffee,1,0,0
3,Dさん,100,S,tea,0,0,1
4,Eさん,200,L,coffee,1,0,0
5,Fさん,200,L,tea,0,0,1
6,Gさん,150,M,tea,0,0,1
7,Hさん,200,L,coffee,1,0,0
8,Iさん,100,S,cola,0,1,0
9,Jさん,200,L,tea,0,0,1


## LabelBinarizer

In [9]:
from sklearn.preprocessing import LabelBinarizer

In [10]:
# One-hot encode `drink` with LabelBinarizer. It is designed for a single label
# vector, so pass the 1-D Series (as the later train/test cell does) rather than
# a one-column DataFrame.
lbnr = LabelBinarizer()
lbnr.fit(df['drink'])
# Column labels are taken from the fitted classes instead of being typed by hand.
# NOTE: with exactly two classes LabelBinarizer emits a single column, in which
# case these labels would not match; this data has three classes.
df_lbnr = pd.concat(
    [df, pd.DataFrame(lbnr.transform(df['drink']), columns=lbnr.classes_)],
    axis=1,
)
df_lbnr

Unnamed: 0,user,price,size,drink,coffee,cola,tea
0,Aさん,100,S,cola,0,1,0
1,Bさん,150,M,tea,0,0,1
2,Cさん,200,L,coffee,1,0,0
3,Dさん,100,S,tea,0,0,1
4,Eさん,200,L,coffee,1,0,0
5,Fさん,200,L,tea,0,0,1
6,Gさん,150,M,tea,0,0,1
7,Hさん,200,L,coffee,1,0,0
8,Iさん,100,S,cola,0,1,0
9,Jさん,200,L,tea,0,0,1


## 違い

In [27]:
import numpy as np
# Encoder used by the next cell to show how OneHotEncoder reacts to NaN/inf.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back on older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
# Adds a column mixing strings with NaN and inf.
# NOTE(review): this mutates `df` in place, so every later cell that displays or
# copies `df` sees this extra column when the notebook is run top to bottom.
df['nan_and_inf'] = ['A', 'A', 'A', 'A', 'A', 'A', np.nan, np.inf, 'B', 'B']
df

Unnamed: 0,user,price,size,drink,nan_and_inf
0,Aさん,100,S,cola,A
1,Bさん,150,M,tea,A
2,Cさん,200,L,coffee,A
3,Dさん,100,S,tea,A
4,Eさん,200,L,coffee,A
5,Fさん,200,L,tea,A
6,Gさん,150,M,tea,
7,Hさん,200,L,coffee,inf
8,Iさん,100,S,cola,B
9,Jさん,200,L,tea,B


In [28]:
# OneHotEncoder rejects this column (it contains NaN mixed with strings). The
# failure is the point of the demo, so catch it and print it instead of letting
# the exception abort a "Restart & Run All".
try:
    onehot_X = enc.fit_transform(df[['nan_and_inf']])  # raises an error here
except (ValueError, TypeError) as err:
    # Older scikit-learn raises ValueError for the NaN; newer releases may instead
    # raise TypeError for the mixed str/float column.
    print(f'{type(err).__name__}: {err}')

ValueError: Input contains NaN

In [25]:
# pandas copes with the same column: inf gets its own indicator and the NaN row
# is encoded as all zeros. dtype=int keeps the indicators as 0/1 integers on
# pandas >= 2.0, where the default output dtype is bool.
pd.get_dummies(df['nan_and_inf'], dtype=int)

Unnamed: 0,inf,A,B
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,0,1,0
6,0,0,0
7,1,0,0
8,0,0,1
9,0,0,1


In [11]:
# Training set: the four base columns of `df` only. Other cells add helper
# columns to `df` in place (e.g. `nan_and_inf`, `ordinal_size`); selecting the
# columns explicitly keeps `train` identical no matter which of those cells ran
# first, and keeps its schema aligned with `test` for the later concat.
train = df[['user', 'price', 'size', 'drink']].copy()
# Test set: contains a size ('LL') and a drink ('cider') that never occur in the
# training data, to show how each encoder handles unseen categories.
test = pd.DataFrame([
        ['Kさん', 200, 'L', 'cola'],
        ['Lさん', 100, 'S', 'tea'],
        ['Mさん', 150, 'M', 'coffee'],
        ['Nさん', 150, 'LL', 'cider']],
        columns=['user', 'price', 'size', 'drink'])
test

Unnamed: 0,user,price,size,drink
0,Kさん,200,L,cola
1,Lさん,100,S,tea
2,Mさん,150,M,coffee
3,Nさん,150,LL,cider


In [12]:
# get_dummies applied to train and test independently: each frame only gets
# columns for the categories it contains, so the two results end up with
# different column sets (test gains `cider`).
# dtype=int keeps 0/1 integers on pandas >= 2.0, where the default is bool.
train_gd = pd.get_dummies(train['drink'], dtype=int)
test_gd = pd.get_dummies(test['drink'], dtype=int)

train_df = pd.concat([train, train_gd], axis=1)
test_df = pd.concat([test, test_gd], axis=1)

display(train_df)
display(test_df)

Unnamed: 0,user,price,size,drink,coffee,cola,tea
0,Aさん,100,S,cola,0,1,0
1,Bさん,150,M,tea,0,0,1
2,Cさん,200,L,coffee,1,0,0
3,Dさん,100,S,tea,0,0,1
4,Eさん,200,L,coffee,1,0,0
5,Fさん,200,L,tea,0,0,1
6,Gさん,150,M,tea,0,0,1
7,Hさん,200,L,coffee,1,0,0
8,Iさん,100,S,cola,0,1,0
9,Jさん,200,L,tea,0,0,1


Unnamed: 0,user,price,size,drink,cider,coffee,cola,tea
0,Kさん,200,L,cola,0,0,1,0
1,Lさん,100,S,tea,0,0,0,1
2,Mさん,150,M,coffee,0,1,0,0
3,Nさん,150,LL,cider,1,0,0,0


In [13]:
# OneHotEncoder fitted on train only; handle_unknown='ignore' encodes categories
# unseen during fit (here 'cider') as an all-zero row instead of raising.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back on older releases.
try:
    oenc = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:
    oenc = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
# fit() returns the encoder itself; do not bind it to `train_df`, which would
# replace the training DataFrame of that name with an encoder object.
oenc.fit(train[['drink']])
train_oenc = oenc.transform(train[['drink']])
test_oenc = oenc.transform(test[['drink']])

# Label the columns from the fitted categories rather than a hand-typed list.
display(pd.DataFrame(train_oenc, columns=oenc.categories_[0]))
display(pd.DataFrame(test_oenc, columns=oenc.categories_[0]))

Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,1,0,0
8,0,1,0
9,0,0,1


Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,0


In [14]:
# LabelBinarizer fitted on the training labels only. The unseen test label
# ('cider') comes out as an all-zero row, as the second table shows.
# fit and transform now both receive the 1-D Series (previously fit got a Series
# while transform got a one-column DataFrame).
lbnr = LabelBinarizer()
lbnr.fit(train['drink'])
# Column labels come from the fitted classes so they always match the output order.
display(pd.DataFrame(lbnr.transform(train['drink']), columns=lbnr.classes_))
display(pd.DataFrame(lbnr.transform(test['drink']), columns=lbnr.classes_))

Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,1,0,0
8,0,1,0
9,0,0,1


Unnamed: 0,coffee,cola,tea
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,0


In [15]:
# Adding columns for categories that never appear in the training data is
# pointless, so you normally would not fit on the test set like this.

lbnr = LabelBinarizer()
lbnr.fit(test['drink'])
# Pass the 1-D Series to transform as well (consistent with fit) and take the
# column labels from the fitted classes instead of a hand-typed list.
display(pd.DataFrame(lbnr.transform(train['drink']), columns=lbnr.classes_))
display(pd.DataFrame(lbnr.transform(test['drink']), columns=lbnr.classes_))

Unnamed: 0,cider,coffee,cola,tea
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,1,0,0
5,0,0,0,1
6,0,0,0,1
7,0,1,0,0
8,0,0,1,0
9,0,0,0,1


Unnamed: 0,cider,coffee,cola,tea
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,1,0,0,0


In [16]:
# This raises an error: LabelBinarizer handles a single label vector and rejects
# a two-column input. The failure is the demonstration, so catch and print it
# rather than letting it abort a "Restart & Run All".
try:
    pd.DataFrame(lbnr.fit_transform(train[['drink', 'size']]), columns=['coffee', 'cola', 'tea', 'L', 'M', 'S'])
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Multioutput target data is not supported with label binarization

In [15]:
# OneHotEncoder, unlike LabelBinarizer, encodes several columns in one call.
# Refit the existing encoder on both columns first, then label the output from
# the fitted per-feature categories (drink categories, then size categories)
# so the labels cannot get out of step with the encoder's column order.
drink_size_oenc = oenc.fit_transform(train[['drink', 'size']])
pd.DataFrame(
    drink_size_oenc,
    columns=[category for feature_categories in oenc.categories_ for category in feature_categories],
)

Unnamed: 0,coffee,cola,tea,L,M,S
0,0,1,0,0,0,1
1,0,0,1,0,1,0
2,1,0,0,1,0,0
3,0,0,1,0,0,1
4,1,0,0,1,0,0
5,0,0,1,1,0,0
6,0,0,1,0,1,0
7,1,0,0,1,0,0
8,0,1,0,0,0,1
9,0,0,1,1,0,0


## factorize

In [16]:
# pd.factorize returns a (codes, uniques) tuple; codes are assigned in order of
# first appearance (cola=0, tea=1, coffee=2 for this data).
ft_array, ft_index = pd.factorize(df['drink'])
df_ft = pd.DataFrame({'ft_drink': ft_array})
# Concatenate the integer codes with the original DataFrame.
df_factrize = pd.concat([df, df_ft], axis='columns')
df_factrize

Unnamed: 0,user,price,size,drink,ft_drink
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,1
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,1
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,1


In [17]:
# Step by step: show the raw tuple returned by factorize, then its two members
# (the integer codes, followed by the index of unique values), one per line.
ft_tuple = pd.factorize(df['drink'])
print(ft_tuple, ft_tuple[0], ft_tuple[1], sep='\n')

(array([0, 1, 2, 1, 2, 1, 1, 2, 0, 1], dtype=int64), Index(['cola', 'tea', 'coffee'], dtype='object'))
[0 1 2 1 2 1 1 2 0 1]
Index(['cola', 'tea', 'coffee'], dtype='object')


In [18]:
# Same call, unpacked directly into the codes array and the uniques index.
ft_array, ft_index = pd.factorize(df['drink'])
print(ft_array, ft_index, sep='\n')

[0 1 2 1 2 1 1 2 0 1]
Index(['cola', 'tea', 'coffee'], dtype='object')


In [19]:
# Turn the codes array into a one-column frame named `ft_drink`.
df_ft = pd.DataFrame({'ft_drink': ft_array})
df_ft

Unnamed: 0,ft_drink
0,0
1,1
2,2
3,1
4,2
5,1
6,1
7,2
8,0
9,1


In [20]:
# Place the factorized codes next to the original columns.
df_factrize = pd.concat([df, df_ft], axis='columns')
df_factrize

Unnamed: 0,user,price,size,drink,ft_drink
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,1
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,1
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,1


## LabelEncoder

In [21]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode `drink` with LabelEncoder. For this data the codes follow the
# sorted class order (coffee=0, cola=1, tea=2), unlike factorize above.
lenc = LabelEncoder()
lenc_vec = lenc.fit_transform(df['drink'])
df_le = pd.DataFrame({'le_drink': lenc_vec})
# Concatenate the codes with the original DataFrame.
df_lenc = pd.concat([df, df_le], axis='columns')
df_lenc

Unnamed: 0,user,price,size,drink,le_drink
0,Aさん,100,S,cola,1
1,Bさん,150,M,tea,2
2,Cさん,200,L,coffee,0
3,Dさん,100,S,tea,2
4,Eさん,200,L,coffee,0
5,Fさん,200,L,tea,2
6,Gさん,150,M,tea,2
7,Hさん,200,L,coffee,0
8,Iさん,100,S,cola,1
9,Jさん,200,L,tea,2


In [22]:
# Step by step: fit the encoder (fit returns the encoder itself), then encode
# the same column into an integer array.
lenc = LabelEncoder().fit(df['drink'])
lenc_vec = lenc.transform(df['drink'])
lenc_vec

array([1, 2, 0, 2, 0, 2, 2, 0, 1, 2])

In [23]:
# Wrap the encoded labels in a one-column frame named `le_drink`.
df_le = pd.DataFrame({'le_drink': lenc_vec})

In [24]:
# Place the LabelEncoder codes next to the original columns.
df_lenc = pd.concat([df, df_le], axis='columns')
df_lenc

Unnamed: 0,user,price,size,drink,le_drink
0,Aさん,100,S,cola,1
1,Bさん,150,M,tea,2
2,Cさん,200,L,coffee,0
3,Dさん,100,S,tea,2
4,Eさん,200,L,coffee,0
5,Fさん,200,L,tea,2
6,Gさん,150,M,tea,2
7,Hさん,200,L,coffee,0
8,Iさん,100,S,cola,1
9,Jさん,200,L,tea,2


## 違い

In [25]:
# Factorize train and test independently. Each call numbers the values in the
# order they first appear in *that* frame, so the codes of the two frames agree
# here only because both happen to list cola, tea, coffee in the same order.
train_ft, idx = pd.factorize(train['drink'])
test_ft, idx = pd.factorize(test['drink'])

train_df = pd.concat([train, pd.DataFrame({'ft_drink': train_ft})], axis='columns')
test_df = pd.concat([test, pd.DataFrame({'ft_drink': test_ft})], axis='columns')

display(train_df, test_df)

Unnamed: 0,user,price,size,drink,ft_drink
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,1
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,1
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,1


Unnamed: 0,user,price,size,drink,ft_drink
0,Kさん,200,L,cola,0
1,Lさん,100,S,tea,1
2,Mさん,150,M,coffee,2
3,Nさん,150,LL,cider,3


In [26]:
# Stack train on top of test with a fresh 0..n-1 index, then factorize the
# combined column once so both parts share a single code table.
all_df = pd.concat([train, test], axis=0, ignore_index=True)
all_ft, idx = pd.factorize(all_df['drink'])
train_test_df = pd.concat([all_df, pd.DataFrame({'ft_drink': all_ft})], axis='columns')
train_test_df

Unnamed: 0,user,price,size,drink,ft_drink
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,1
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,1
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,1


In [27]:
# LabelEncoder fitted on the full list of known drinks, including 'cider' which
# only occurs in test, so that both frames can be transformed with one encoder.
lenc = LabelEncoder()
# fit() returns the encoder itself; do not bind it to `train_df` (a DataFrame name).
lenc.fit(['cola', 'tea', 'coffee', 'cider'])
# LabelEncoder works on a 1-D label vector, so pass the Series rather than a
# one-column DataFrame.
train_lenc = lenc.transform(train['drink'])
test_lenc = lenc.transform(test['drink'])

train_df = pd.concat([train, pd.DataFrame(train_lenc, columns=['le_drink'])], axis=1)
test_df = pd.concat([test, pd.DataFrame(test_lenc, columns=['le_drink'])], axis=1)

display(train_df)
display(test_df)

Unnamed: 0,user,price,size,drink,le_drink
0,Aさん,100,S,cola,2
1,Bさん,150,M,tea,3
2,Cさん,200,L,coffee,1
3,Dさん,100,S,tea,3
4,Eさん,200,L,coffee,1
5,Fさん,200,L,tea,3
6,Gさん,150,M,tea,3
7,Hさん,200,L,coffee,1
8,Iさん,100,S,cola,2
9,Jさん,200,L,tea,3


Unnamed: 0,user,price,size,drink,le_drink
0,Kさん,200,L,cola,2
1,Lさん,100,S,tea,3
2,Mさん,150,M,coffee,1
3,Nさん,150,LL,cider,0


# 順序尺度

In [28]:
# Ordinal encoding: each size is replaced by its position in the explicit
# S < M < L < LL ordering. A size missing from the list raises ValueError.
# NOTE: adds the `ordinal_size` column to `df` in place.
df['ordinal_size'] = df['size'].map(lambda size: ['S', 'M', 'L', 'LL'].index(size))
df

Unnamed: 0,user,price,size,drink,ordinal_size
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,0
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,2
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,2


In [17]:
# Same ordinal encoding expressed as a size -> rank lookup table
# ({'S': 0, 'M': 1, 'L': 2, 'LL': 3}) handed to Series.map.
df['ordinal_size'] = df['size'].map(dict(zip(['S', 'M', 'L', 'LL'], range(4))))
df

Unnamed: 0,user,price,size,drink,ordinal_size
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,0
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,2
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,2


In [30]:
# Ordinal codes must follow the semantic order of the sizes. Deriving the list
# from `all_df['size'].unique()` ranks sizes by order of first appearance in the
# data, which is S, M, L, LL only by coincidence; spell the order out instead.
unique_size_list = ['S', 'M', 'L', 'LL']

# list.index raises ValueError for a size that is not in the ordering.
# NOTE: adds the `ordinal_size` column to `train` and `test` in place.
train['ordinal_size'] = train['size'].apply(lambda x: unique_size_list.index(x))
test['ordinal_size'] = test['size'].apply(lambda x: unique_size_list.index(x))

display(train)
display(test)

Unnamed: 0,user,price,size,drink,ordinal_size
0,Aさん,100,S,cola,0
1,Bさん,150,M,tea,1
2,Cさん,200,L,coffee,2
3,Dさん,100,S,tea,0
4,Eさん,200,L,coffee,2
5,Fさん,200,L,tea,2
6,Gさん,150,M,tea,1
7,Hさん,200,L,coffee,2
8,Iさん,100,S,cola,0
9,Jさん,200,L,tea,2


Unnamed: 0,user,price,size,drink,ordinal_size
0,Kさん,200,L,cola,2
1,Lさん,100,S,tea,0
2,Mさん,150,M,coffee,1
3,Nさん,150,LL,cider,3


In [31]:
# factorize on `size`: codes follow order of first appearance, which for this
# data happens to coincide with the ordinal ranking (S=0, M=1, L=2).
ft_array, ft_index = pd.factorize(df['size'])
df_ft = pd.DataFrame({'ft_size': ft_array})
# Concatenate the codes with the original DataFrame.
df_factrize = pd.concat([df, df_ft], axis='columns')
df_factrize

Unnamed: 0,user,price,size,drink,ordinal_size,ft_size
0,Aさん,100,S,cola,0,0
1,Bさん,150,M,tea,1,1
2,Cさん,200,L,coffee,2,2
3,Dさん,100,S,tea,0,0
4,Eさん,200,L,coffee,2,2
5,Fさん,200,L,tea,2,2
6,Gさん,150,M,tea,1,1
7,Hさん,200,L,coffee,2,2
8,Iさん,100,S,cola,0,0
9,Jさん,200,L,tea,2,2


In [32]:
# LabelEncoder on `size`: for this data the codes come out as L=0, M=1, S=2,
# i.e. they do not reflect the S < M < L ranking.
lenc = LabelEncoder()
lenc_vec = lenc.fit_transform(df['size'])
df_le = pd.DataFrame({'le_size': lenc_vec})
# Concatenate the codes with the original DataFrame.
df_lenc = pd.concat([df, df_le], axis='columns')
df_lenc

Unnamed: 0,user,price,size,drink,ordinal_size,le_size
0,Aさん,100,S,cola,0,2
1,Bさん,150,M,tea,1,1
2,Cさん,200,L,coffee,2,0
3,Dさん,100,S,tea,0,2
4,Eさん,200,L,coffee,2,0
5,Fさん,200,L,tea,2,0
6,Gさん,150,M,tea,1,1
7,Hさん,200,L,coffee,2,0
8,Iさん,100,S,cola,0,2
9,Jさん,200,L,tea,2,0
