### LabelEncoder
* sklearn.preprocessing.LabelEncoder
* fit(y) 
* fit_transform(y)

Fit label encoder and return encoded labels.

* get_params([deep])

Get parameters for this estimator.

* inverse_transform(y)

Transform labels back to original encoding.

* set_params(**params)

Set the parameters of this estimator.

* transform(y)

Transform labels to normalized encoding.

In [2]:
from sklearn.preprocessing import LabelEncoder
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']
# LabelEncoder를 객체로 생성한 후, fit()과 transform()으로 label 인코딩 수행
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값: ', labels)

인코딩 변환값:  [0 1 4 5 3 3 2 2]


In [3]:
print('인코딩 클래스: ', encoder.classes_)
print('디코딩 원본 값: ', encoder.inverse_transform([4,5,2,0,1,1,3,3]))

인코딩 클래스:  ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']
디코딩 원본 값:  ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


### titanic_train.csv 전처리

In [29]:
import pandas as pd

# df = pd.read_csv('C:/apps/ml/datasets/titanic_train.csv')
df = pd.read_csv('./datasets/titanic_train.csv')

In [30]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
df.drop(columns = ['PassengerId','Pclass','Name','SibSp', 'Parch','Ticket','Fare','Cabin'],inplace=True)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   Embarked  889 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [33]:
df['Age'].isnull().sum()

177

In [34]:
df['Age']  = df['Age'].fillna(df['Age'].mean() )

In [35]:
df['Age'].isnull().sum()

0

In [36]:
df['Embarked'] = df['Embarked'].fillna('S')

In [37]:
df['Embarked'].isnull().sum()

0

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   Embarked  891 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [39]:
# Sex , Embarked encoding 
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])
df['Embarked'] = encoder.fit_transform(df['Embarked'])

In [40]:
df

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,1,22.000000,2
1,1,0,38.000000,0
2,1,0,26.000000,2
3,1,0,35.000000,2
4,0,1,35.000000,2
...,...,...,...,...
886,0,1,27.000000,2
887,1,0,19.000000,2
888,0,0,29.699118,2
889,1,1,26.000000,0


### OneHotEncoder

In [60]:
from sklearn.preprocessing import OneHotEncoder
df = pd.read_csv('./datasets/titanic_train.csv')
one_encoder = OneHotEncoder()
trans_data = one_encoder.fit_transform([df['Sex']])
trans_data.toarray()

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [53]:
import numpy as np 
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

one_encoder = OneHotEncoder()
# items = items.reshape(-1,1)
# np.array(items).reshape(-1,1)
items = np.array(items).reshape(-1,1)
data = one_encoder.fit_transform(items)

In [57]:
data.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

### pandas.get_dummies
* pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [61]:
import pandas as pd 
df = pd.read_csv('./datasets/titanic_train.csv')
df.drop(columns = ['PassengerId','Pclass','Name','SibSp', 'Parch','Ticket','Fare','Cabin'],inplace=True)
df['Age']  = df['Age'].fillna(df['Age'].mean() )
df['Embarked'] = df['Embarked'].fillna('S')





In [64]:
data = df.loc[:, 'Embarked']

In [65]:
data

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [66]:
dummy_embark = pd.get_dummies(data)

In [68]:
dummy_data = pd.get_dummies(df[['Sex','Embarked']])

In [69]:
dummy_data

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
886,0,1,0,0,1
887,1,0,0,0,1
888,1,0,0,0,1
889,0,1,1,0,0


In [70]:
dumy_df = pd.concat([df, dummy_data], axis=1)

In [73]:
dumy_df.drop(columns=['Sex','Embarked' ] , inplace=True)

In [74]:
dumy_df

Unnamed: 0,Survived,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.000000,0,1,0,0,1
1,1,38.000000,1,0,1,0,0
2,1,26.000000,1,0,0,0,1
3,1,35.000000,1,0,0,0,1
4,0,35.000000,0,1,0,0,1
...,...,...,...,...,...,...,...
886,0,27.000000,0,1,0,0,1
887,1,19.000000,1,0,0,0,1
888,0,29.699118,1,0,0,0,1
889,1,26.000000,0,1,1,0,0
