In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

# Toy dataset: three features whose raw value ranges differ by orders of magnitude.
data = {
    "Height": [160, 170, 180, 190],
    "Weight": [55, 65, 75, 85],
    "Income": [1000000, 2000000, 3000000, 4000000]
}
df = pd.DataFrame(data)
print("원본 데이터\n", df)


def _as_frame(values):
    """Wrap a scaled array in a DataFrame that reuses df's column labels."""
    return pd.DataFrame(values, columns=df.columns)


# Standardization: centre each column on its mean and divide by its standard deviation.
scaler = StandardScaler().fit(df)
standard_scaled = scaler.transform(df)
print("\nStandardScaler 결과\n", _as_frame(standard_scaled))

# Min-max normalization: map each column onto the [0, 1] interval.
scaler = MinMaxScaler().fit(df)
minmax_scaled = scaler.transform(df)
print("\nMinMaxScaler 결과\n", _as_frame(minmax_scaled))

# MaxAbs scaling: divide each column by its largest absolute value.
scaler = MaxAbsScaler().fit(df)
maxabs_scaled = scaler.transform(df)
print("\nMaxAbsScaler 결과\n", _as_frame(maxabs_scaled))

# Robust scaling: centre on the median and scale by the interquartile range.
scaler = RobustScaler().fit(df)
robust_scaled = scaler.transform(df)
print("\nRobustScaler 결과\n", _as_frame(robust_scaled))


원본 데이터
    Height  Weight   Income
0     160      55  1000000
1     170      65  2000000
2     180      75  3000000
3     190      85  4000000

StandardScaler 결과
      Height    Weight    Income
0 -1.341641 -1.341641 -1.341641
1 -0.447214 -0.447214 -0.447214
2  0.447214  0.447214  0.447214
3  1.341641  1.341641  1.341641

MinMaxScaler 결과
      Height    Weight    Income
0  0.000000  0.000000  0.000000
1  0.333333  0.333333  0.333333
2  0.666667  0.666667  0.666667
3  1.000000  1.000000  1.000000

MaxAbsScaler 결과
      Height    Weight  Income
0  0.842105  0.647059    0.25
1  0.894737  0.764706    0.50
2  0.947368  0.882353    0.75
3  1.000000  1.000000    1.00

RobustScaler 결과
      Height    Weight    Income
0 -1.000000 -1.000000 -1.000000
1 -0.333333 -0.333333 -0.333333
2  0.333333  0.333333  0.333333
3  1.000000  1.000000  1.000000


In [4]:
# Exercise 1: min-max scale a small two-column sample.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

# Learn the per-column min/max first, then apply the transform as a separate step.
scaler = MinMaxScaler().fit(data)
minMaxScaler_output = scaler.transform(data)
print(minMaxScaler_output)

[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [1.   1.  ]]


In [7]:
# Exercise: compare StandardScaler and RobustScaler on data that contains an outlier.
# The outlier is the -102 in the second column of the first row.
data = [
    [-1, -102],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

# Mean / standard-deviation based scaling.
scaler = StandardScaler().fit(data)
standard_output = scaler.transform(data)
print(f"StandardScaler: \n {standard_output}")

# Median / interquartile-range based scaling.
scaler = RobustScaler().fit(data)
robust_output = scaler.transform(data)
print(f"RobustScaler: \n {robust_output}")

StandardScaler: 
 [[-1.18321596 -1.72537713]
 [-0.50709255  0.46686675]
 [ 0.16903085  0.54806097]
 [ 1.52127766  0.71044941]]
RobustScaler: 
 [[-0.85714286 -3.33333333]
 [-0.28571429 -0.06060606]
 [ 0.28571429  0.06060606]
 [ 1.42857143  0.3030303 ]]


In [None]:
# KNN 알고리즘에서 스케일링을 적용하지 않으면 어떤 문제가 생길까요?
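# 너무 분산되어 있는(값의 범위가 큰) 피처가 있는 데이터셋은 거리 계산이 그 피처에 좌우되어, 범위가 작은 피처는 이웃 선택에 거의 반영되지 않습니다. 그 결과 KNN의 예측 정확도가 떨어질 수 있습니다.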

In [16]:
import pandas as pd 
import seaborn as sns 
# Load the "penguins" example dataset through seaborn and show it as the cell output.
# The displayed frame still contains rows with missing measurements (e.g. index 3).
data = sns.load_dataset("penguins")
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [17]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the penguins example dataset and drop every row that has a missing value.
penguins = sns.load_dataset("penguins").dropna()

# Features (X): the four numeric measurement columns.
feature_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
X = penguins[feature_columns]

# Target (y): the species column encoded as integer class labels.
le = LabelEncoder().fit(penguins["species"])
y = le.transform(penguins["species"])

# Integer code assigned to each class name, in the encoder's class order.
class_codes = le.transform(le.classes_)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("클래스 매핑:", dict(zip(le.classes_, class_codes)))


X shape: (333, 4)
y shape: (333,)
클래스 매핑: {'Adelie': np.int64(0), 'Chinstrap': np.int64(1), 'Gentoo': np.int64(2)}


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build a 3-nearest-neighbours classifier and train it on the unscaled features.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict the held-out rows and report the share of correct predictions.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'KNN 모델 정확도: {accuracy:.2f}')


KNN 모델 정확도: 0.76


In [24]:
# Split the raw features first so the test rows stay unseen during preprocessing.
# The same test_size and random_state as the unscaled run select the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then reuse those statistics for the
# test rows. Fitting on the full X would leak the test set's mean and variance
# into preprocessing and make the reported accuracy optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled copy of the full feature matrix, kept under its original name in case
# other cells read it; it now uses the training-set statistics.
X2 = scaler.transform(X)

# Build and train the KNN model (K=3) on the scaled training features.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Report accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f'KNN 모델 정확도 (Scaler 사용): {accuracy:.2f}')


KNN 모델 정확도 (Scaler 사용): 1.00


In [None]:
a b c

a1 b1 c1 y1
a2 b2 c2 y2
a3 b3 c3 y3
a: 0-1
b: 100-1000
c: ~10000000000
(a-a1)^2 + (b-b1)^2 + (c-c1)^2 = 0.1 + 300 + 10000000000
(a-a2)^2 + (b-b2)^2 + (c-c2)^2 = 0.9 + 600 + 10000000000
(a-a3)^2 + (b-b3)^2 + (c-c3)^2 = 0.3 + 100 + 10000000000

a: 0-1
b: 0-1
c: 0-1

1 0.1 + 0.2 + 0.4 = 0.7
2 0.0 + 0.2 + 0.5 = 0.7
3 0.3 + 0.1 + 0.6 = 1.0
