In [22]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

In [2]:
# Toy frame with one ordinal column whose labels have a natural order.
data = pd.DataFrame(
    dict(Score=['Low', 'Low', 'Medium', 'Medium', 'High'])
)

# Lookup table: ordinal label -> integer rank.
scale_mapper = dict(Low=1, Medium=2, High=3)

In [3]:
# Convert the ordinal labels to their integer ranks.
#
# `Series.map` is used instead of `Series.replace`: replacing strings with ints
# on an object column relies on implicit downcasting, which recent pandas
# versions deprecate. `map` builds the integer Series directly. Any label that
# is missing from `scale_mapper` would become NaN rather than pass through.
#
# The result gets its own name so that `data` stays the raw DataFrame and this
# cell can be re-run without error (overwriting `data` with a Series made a
# second run fail on `data["Score"]`).
score_encoded = data["Score"].map(scale_mapper)
print(score_encoded)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64


In [4]:
# Toy feature matrix: one ordinal label and one count per row.
# NumPy stores mixed str/int rows under a single string dtype, so the counts
# end up as text ('10', '50', '3').
features_array = np.array([
    ['Low', 10],
    ['High', 50],
    ['Medium', 3],
])

In [5]:
# Fit an ordinal encoder on the string feature matrix and inspect the
# categories it learned for each column.
ordinal_encoder = OrdinalEncoder()
# The encoded matrix returned here is discarded; only the fitted state is used.
ordinal_encoder.fit_transform(features_array)
ordinal_encoder_data = ordinal_encoder.categories_

# NOTE(review): the recorded output lists the categories in sorted string order
# ('High', 'Low', 'Medium' and '10', '3', '50'), not in semantic order.
# Presumably an explicit `categories=` argument is needed if Low < Medium < High
# must be preserved; confirm against the OrdinalEncoder documentation.
print(ordinal_encoder_data)

[array(['High', 'Low', 'Medium'], dtype='<U11'), array(['10', '3', '50'], dtype='<U11')]


# 특성 딕셔너리 인코딩

In [6]:
# One dict per sample: keys are colour names, values are counts.
# A colour that does not occur in a sample is simply absent from its dict.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=1, Yellow=2),
]

In [10]:
# Turn the list of per-sample dicts into a numeric feature matrix.
# With sparse=False the printed result is a plain 2-D array; in the recorded
# output a key that is missing from a sample shows up as 0 in its column.
dictVectorizer = DictVectorizer(sparse=False)
features = dictVectorizer.fit_transform(data_dict)
print(features)

[[4. 2. 0.]
 [3. 4. 0.]
 [0. 1. 2.]
 [0. 1. 2.]]


In [12]:
# Column labels learned by the fitted vectorizer, in the same order as the
# columns of `features` (recorded output: Blue, Red, Yellow).
feature_names = dictVectorizer.get_feature_names_out()
print(feature_names)

['Blue' 'Red' 'Yellow']


In [13]:
# Wrap the vectorized matrix in a DataFrame labelled with the learned
# feature names so each column is identifiable.
dict_data = pd.DataFrame(data=features, columns=feature_names)
print(dict_data)

   Blue  Red  Yellow
0   4.0  2.0     0.0
1   3.0  4.0     0.0
2   0.0  1.0     2.0
3   0.0  1.0     2.0


# 누락된 클래스 값 대처하기 (KNN)

In [15]:
# Fully observed rows: column 0 holds the class label (0 or 1),
# columns 1-2 hold the numeric features.
x = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.20, -1.15],
])

# Rows whose class label (column 0) is missing.
x_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

In [16]:
# Build the classifier that will be used to predict the missing class labels.
clf = KNeighborsClassifier(3, weights='distance') # k=3: predict from the 3 nearest neighbours, weighted via weights='distance'

# Show the split used for training: feature columns (1 onward) and the class column (0).
print(x[:,1:])
print(x[:,0])

[[ 2.1   1.45]
 [ 1.18  1.33]
 [ 1.22  1.27]
 [-0.2  -1.15]]
[0. 1. 0. 1.]


In [21]:
# Fit on the fully observed rows, then predict the class of the rows whose
# label is missing.
train_model = clf.fit(x[:,1:], x[:,0]) # train: features are columns 1 onward, target is column 0
imputed_values = train_model.predict(x_with_nan[:,1:]) # predict the missing class labels
print(imputed_values)

[0. 1.]


In [19]:
# Put the predicted class labels in as column 0 of the formerly incomplete
# rows, then stack those rows on top of the fully observed ones.
x_with_imputed = np.hstack(
    [imputed_values[:, np.newaxis], x_with_nan[:, 1:]]
)
data = np.vstack([x_with_imputed, x])

print(data)

[[ 0.    0.87  1.31]
 [ 1.   -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]


# 누락된 클래스 값 대처하기 (자주 등장하는 값)

In [23]:
# Combine the rows with missing labels and the fully observed rows into a
# single matrix (incomplete rows first).
x_complete = np.vstack([x_with_nan, x])
print(x_complete)

[[  nan  0.87  1.31]
 [  nan -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]


In [24]:
# Fill the missing class labels with the most frequent value of the column.
# NOTE(review): among the observed rows classes 0 and 1 occur twice each; the
# recorded output fills both NaNs with 0, so the imputer presumably picks the
# smaller value on a tie. Confirm against the SimpleImputer documentation.
imputer = SimpleImputer(strategy='most_frequent')  # strategy options: most_frequent = most frequent value, mean = average, median = median
data_imputer = imputer.fit_transform(x_complete)
print(data_imputer)

[[ 0.    0.87  1.31]
 [ 0.   -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]
