# sklearn preprocessing

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Data Set (titanic)

In [3]:
# Location of the Titanic passenger table used in this notebook.
TITANIC_URL = 'https://bit.ly/fc-ml-titanic'
df = pd.read_csv(TITANIC_URL)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Set 나누기

In [6]:
# Columns used as model inputs (features) and as the prediction target (label).
feature_list = ['Pclass', 'Sex', 'Age', 'Fare']
label_list = ['Survived']
# Preview the first rows of each column group, features first.
for column_group in (feature_list, label_list):
    print(df[column_group].head())

   Pclass     Sex   Age     Fare
0       3    male  22.0   7.2500
1       1  female  38.0  71.2833
2       3  female  26.0   7.9250
3       1  female  35.0  53.1000
4       3    male  35.0   8.0500
   Survived
0         0
1         1
2         1
3         1
4         0


In [8]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible. train_test_split is imported with the other sklearn
# imports at the top of the notebook rather than in the middle of the analysis.
x_train, x_test, y_train, y_test = train_test_split(
    df[feature_list],
    df[label_list],
    test_size=0.2,
    random_state=30,
    shuffle=True,
)
# Report the resulting (rows, columns) of each piece.
print("train", x_train.shape, y_train.shape)
print("test", x_test.shape, y_test.shape)

train (712, 4) (712, 1)
test (179, 4) (179, 1)


## 전처리 (결측치)

In [9]:
# Summarize every column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# Count the missing values in the Age column only.
df['Age'].isnull().sum()

np.int64(177)

### 수치형 데이터에 대한 결측치 처리

In [14]:
# Preview Age statistics with missing values replaced by 0.
# The filled Series is not assigned back, so df itself is unchanged.
df['Age'].fillna(0).describe()

count    891.000000
mean      23.799293
std       17.596074
min        0.000000
25%        6.000000
50%       24.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [17]:
# Preview Age statistics with missing values replaced by the column mean.
# The filled Series is not assigned back, so df itself is unchanged.
df['Age'].fillna(df['Age'].mean()).describe()

count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64

#### imputer : 2개 이상의 컬럼을 한 번에 처리할 때

In [21]:
# Impute both numeric columns in one pass using each column's mean.
num_cols = ['Age', 'Pclass']
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[num_cols])
result = imputer.transform(df[num_cols])
# transform() hands back a plain NumPy array rather than a DataFrame.
print(type(result), result.shape)
result

<class 'numpy.ndarray'> (891, 2)


array([[22.        ,  3.        ],
       [38.        ,  1.        ],
       [26.        ,  3.        ],
       ...,
       [29.69911765,  3.        ],
       [26.        ,  1.        ],
       [32.        ,  3.        ]])

In [22]:
# Write the imputed values back into df, then confirm no missing values remain.
# NOTE(review): `result` is a float ndarray, so Pclass presumably becomes float64
# after this assignment — confirm if the integer dtype matters downstream.
df[['Age', 'Pclass']] = result
df[['Age', 'Pclass']].isnull().sum()

Age       0
Pclass    0
dtype: int64

In [23]:
# Summary statistics of the two columns after mean imputation.
df[['Age', 'Pclass']].describe()

Unnamed: 0,Age,Pclass
count,891.0,891.0
mean,29.699118,2.308642
std,13.002015,0.836071
min,0.42,1.0
25%,22.0,2.0
50%,29.699118,3.0
75%,35.0,3.0
max,80.0,3.0


#### fit_transform() 함수는 fit()과 transform()을 한 번에 수행

In [24]:
# Reload the raw data so Age contains its missing values again, then verify the counts.
df = pd.read_csv('https://bit.ly/fc-ml-titanic')
df[['Age', 'Pclass']].isnull().sum()

Age       177
Pclass      0
dtype: int64

In [25]:
# fit_transform() learns each column's median and fills the gaps in a single call.
age_pclass = ['Age', 'Pclass']
imputer = SimpleImputer(strategy='median')
result = imputer.fit_transform(df[age_pclass])
df[age_pclass] = result
# Verify that nothing is missing any more.
df[age_pclass].isnull().sum()

Age       0
Pclass    0
dtype: int64

In [26]:
# Summary statistics of the two columns after median imputation.
df[['Age', 'Pclass']].describe()

Unnamed: 0,Age,Pclass
count,891.0,891.0
mean,29.361582,2.308642
std,13.019697,0.836071
min,0.42,1.0
25%,22.0,2.0
50%,28.0,3.0
75%,35.0,3.0
max,80.0,3.0


### Categorical Column Data에 대한 결측치 처리

In [30]:
# Reload the raw data and keep an untouched copy (df0) that still contains NaN;
# df0 is used further down to show how LabelEncoder treats missing values.
df = pd.read_csv('https://bit.ly/fc-ml-titanic')
df0 = df.copy()
# Display the per-column missing counts so the cell reproduces its recorded output.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [31]:
# Preview Embarked with missing values filled by 'S'.
# The filled Series is not assigned back, so df itself is unchanged.
df['Embarked'].fillna('S')

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [32]:
# Fill missing categorical values with each column's most frequent value.
categorical_cols = ['Embarked', 'Cabin']
imputer = SimpleImputer(strategy='most_frequent')
result = imputer.fit_transform(df[categorical_cols])
df[categorical_cols] = result
# Verify that both columns are now complete.
df[categorical_cols].isnull().sum()

Embarked    0
Cabin       0
dtype: int64

## Label Encoding : 문자(Categorical)를 수치(numerical)로 변환

In [34]:
# Re-inspect dtypes and non-null counts after imputing Embarked and Cabin.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### 학습을 위해 모든 문자로 된 데이터는 수치로 변환

In [35]:
def convert(data):
    """Encode a sex label as an integer.

    Returns 1 for 'male', 0 for 'female', and None for any other value.
    """
    if data == 'male':
        return 1
    if data == 'female':
        return 0
    # Anything that is neither label falls through to None.
    return None
        
# Count how many passengers fall into each Sex category.
df['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [36]:
# Apply convert() to every Sex value to preview the numeric encoding (not stored in df).
df['Sex'].apply(convert)

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int64

In [38]:
# Learn the Sex categories first, then store their integer codes in a new column.
le = LabelEncoder()
le.fit(df['Sex'])
df['Sex_num'] = le.transform(df['Sex'])
df['Sex_num'].value_counts()

Sex_num
1    577
0    314
Name: count, dtype: int64

In [39]:
# Categories learned by the encoder; a label's position in this array is its integer code.
le.classes_

array(['female', 'male'], dtype=object)

In [40]:
# Map integer codes back to their original string labels.
le.inverse_transform([0,1,1,0,0,1,1])

array(['female', 'male', 'male', 'female', 'female', 'male', 'male'],
      dtype=object)

#### NaN 값이 포함되어 있다면, LabelEncoder가 NaN도 하나의 Category로 인식하여 라벨링을 진행

In [44]:
# df0 is the untouched copy of the raw data: report how many Embarked values
# are missing and list the distinct values (NaN included).
print(df0['Embarked'].isna().sum())
print(df0['Embarked'].unique())

2
['S' 'C' 'Q' nan]


In [46]:
# Use a dedicated encoder instead of re-fitting the shared `le` (fitted on Sex
# earlier); re-fitting it would make the earlier le.classes_ and
# le.inverse_transform cells return Embarked labels when re-run.
le_with_nan = LabelEncoder()
df0["le_Embarked"] = le_with_nan.fit_transform(df0["Embarked"])
print(df0["le_Embarked"].unique())
# Rows whose Embarked is missing get a code of their own (3 in the recorded output).
df0[df0["Embarked"].isna()]

[2 0 1 3]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,le_Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,3
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,3


#### 결측치를 처리하고 라벨링을 했을 경우

In [47]:
# Fill missing ports with 'S' before encoding so NaN does not become a class of its own.
df['Embarked'] = df['Embarked'].fillna('S')
# Use a dedicated encoder rather than re-fitting the shared `le`, so cells that
# inspect `le` keep giving the same answer when re-run.
le_embarked = LabelEncoder()
df['le_Embarked'] = le_embarked.fit_transform(df['Embarked'])
df['le_Embarked'].unique()

array([2, 0, 1])

## One Hot Encoding 

In [49]:
# Reload the raw data for the one-hot encoding examples.
df = pd.read_csv('https://bit.ly/fc-ml-titanic')

In [50]:
# Frequency of each embarkation port; rows with a missing port are not counted.
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [51]:
# Fill the missing Embarked values with 'S' and re-check the counts.
df['Embarked'] = df['Embarked'].fillna('S')
df['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [52]:
# Encode Embarked as integer codes with a one-off LabelEncoder, then count each code.
df['Embarked_num'] = LabelEncoder().fit_transform(df['Embarked'])
df['Embarked_num'].value_counts()

Embarked_num
2    646
0    168
1     77
Name: count, dtype: int64

In [53]:
# First six encoded ports, selected explicitly by position.
df['Embarked_num'].iloc[:6]

0    2
1    0
2    2
3    2
4    2
5    1
Name: Embarked_num, dtype: int64

In [54]:
# One-hot encode the first six codes, asking get_dummies for integer columns directly.
pd.get_dummies(df['Embarked_num'][:6], dtype=int)

Unnamed: 0,0,1,2
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
5,0,1,0


In [60]:
# One-hot encode the first six embarkation codes.
oh = pd.get_dummies(df['Embarked_num'][:6])
print(oh)
# Rename by code rather than by position: assigning a fixed three-item list to
# oh.columns fails with a length mismatch whenever the slice lacks one of the codes.
# The code-to-port mapping follows the value counts shown earlier (0=C, 1=Q, 2=S).
code_to_port = {0: 'C', 1: 'Q', 2: 'S'}
oh = oh.rename(columns=code_to_port)
print(oh)

       0      1      2
0  False  False   True
1   True  False  False
2  False  False   True
3  False  False   True
4  False  False   True
5  False   True  False
       C      Q      S
0  False  False   True
1   True  False  False
2  False  False   True
3  False  False   True
4  False  False   True
5  False   True  False


In [61]:
# Show the one-hot table as 0/1 integers instead of booleans.
oh.astype(int)

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
5,0,1,0


## 정규화 (Normalization)

In [62]:
# Toy data: two columns on different scales (naver 2..10, netflix 1..5).
movie = {
    'naver': list(range(2, 11, 2)),
    'netflix': list(range(1, 6)),
}
df = pd.DataFrame(data=movie)
df

Unnamed: 0,naver,netflix
0,2,1
1,4,2
2,6,3
3,8,4
4,10,5


In [63]:
# Rescale each column so its minimum maps to 0 and its maximum to 1.
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(df)
minmaxmovie = minmaxscaler.transform(df)
# The scaler returns a bare array, so wrap it with the column names again for display.
pd.DataFrame(minmaxmovie, columns=['naver', 'netflix'])

Unnamed: 0,naver,netflix
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


## 표준화 (Standardization)
- 평균이 0, 표준편차가 1인 데이터로 변환

In [65]:
# Integers 0..8 followed by one extreme value (1000) in the last position.
x = np.append(np.arange(9), 1000)
x.mean(), x.std()

(np.float64(103.6), np.float64(298.8100399919654))

In [68]:
# Standardize x; the scaler expects a 2-D input, hence the single-column reshape.
stds = StandardScaler()
column = x.reshape(-1, 1)
scaled = stds.fit_transform(column)
# After scaling, the mean should be ~0 and the standard deviation 1.
print(round(scaled.mean(), 2), scaled.std())

0.0 1.0
