In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# One feature column with five samples.
feature = np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
# Min-max scaling: the smallest value maps to 0 and the largest to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range = (0,1))
# fit_transform learns the min/max and rescales in a single call.
scaled_feature = minmax_scale.fit_transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [3]:
# Fit and scale only the first three samples; the min/max come from this subset alone.
preprocessing.MinMaxScaler().fit_transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [6]:
# Fit and scale only the last two samples; with two points they map to exactly 0 and 1.
preprocessing.MinMaxScaler().fit_transform(feature[3:])

array([[0.],
       [1.]])

In [7]:
# Learn the min/max from the first three samples only.
scaler = preprocessing.MinMaxScaler().fit(feature[:3])
# Apply that fitted scaling to the remaining samples; they lie beyond the fitted
# range, so the results fall outside [0, 1] (see the output).
scaler.transform(feature[3:])

array([[1.2],
       [2.8]])

# 특성을 표준화하기

In [8]:
# One feature column whose last value (9000.9) is far from the rest.
x = np.array([[-1000.1],[-200.2],[500.5],[600.6],[9000.9]])

# StandardScaler rescales the feature to zero mean and unit standard deviation.
# Note: this rebinds `scaler`, which previously held a MinMaxScaler.
scaler = preprocessing.StandardScaler()

standardized = scaler.fit_transform(x)

standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [10]:
# Sanity check of the standardized feature: the mean is rounded to hide
# floating-point noise; the standard deviation should print as 1.0.
print("평균 :",round(standardized.mean()))
print("표준편차 :",standardized.std())

평균 : 0
표준편차 : 1.0


In [11]:
# RobustScaler centers on the median and scales by the interquartile range,
# so the median sample (500.5) maps to 0 in the output.
robust_scaler = preprocessing.RobustScaler()

robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

RobustScaler: 중앙값과 사분위 범위(IQR)를 사용해 스케일을 조정하므로, 데이터에 이상치가 많을 때 그 영향을 최소화하기 위해 사용한다.

In [12]:
# QuantileTransformer defaults to n_quantiles=1000. With only five samples that
# exceeds n_samples and triggers a warning, so cap it at the number of samples.
# Each sample is mapped to its quantile rank in [0, 1].
preprocessing.QuantileTransformer(n_quantiles = len(x)).fit_transform(x)

  % (self.n_quantiles, n_samples))


array([[0.  ],
       [0.25],
       [0.5 ],
       [0.75],
       [1.  ]])

# 정규화하기

In [13]:
from sklearn.preprocessing import Normalizer

# Five samples with two features each.
features = np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.4],[10.9,3.3]])

# Normalizer rescales each sample (row) to unit norm; 'l2' is the Euclidean norm.
normalizer = Normalizer(norm = 'l2')

# transform is called directly, without a preceding fit.
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [14]:
# Same L2 normalization as the previous cell, this time keeping the result.
features_l2_norm = Normalizer(norm = 'l2').transform(features)

features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [16]:
# L1 (Manhattan) norm: each row is divided by the sum of its absolute values.
feature_l1_norm = Normalizer(norm = 'l1').transform(features)

feature_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [19]:
# After L1 normalization the values of a sample add up to 1; check the first row.
print("첫 번째 샘플값의 합 :", feature_l1_norm[0].sum())

첫 번째 샘플값의 합 : 1.0


Normalizer는 학습할 파라미터가 없으므로 fit 메서드를 호출하지 않고 바로 transform을 사용할 수 있다.

In [20]:
# 'max' norm: each row is divided by its largest value, so that value becomes 1.
Normalizer(norm = 'max').transform(features)

array([[1.        , 1.        ],
       [0.32352941, 1.        ],
       [0.07425743, 1.        ],
       [0.04738372, 1.        ],
       [1.        , 0.30275229]])

# 다항 특성과 교차항 특성 생성하기

In [23]:
from sklearn.preprocessing import PolynomialFeatures

# Three identical samples with two features (x0=2, x1=3).
features = np.array([[2,3],[2,3],[2,3]])

# degree=2 adds the squared and pairwise-product terms; include_bias=False
# leaves out the constant column of ones.
polynomial_interaction = PolynomialFeatures(degree = 2, include_bias = False)

polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [24]:
# interaction_only=True keeps the original features and their product (x0*x1)
# but drops the pure powers (x0^2, x1^2).
interaction = PolynomialFeatures(degree = 2, include_bias = False, interaction_only = True)

interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [25]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new method and fall back to the
# old one on older releases; list() keeps the plain-list display.
if hasattr(polynomial_interaction, "get_feature_names_out"):
    feature_names = list(polynomial_interaction.get_feature_names_out())
else:
    feature_names = polynomial_interaction.get_feature_names()

feature_names

['x0', 'x1', 'x0^2', 'x0 x1', 'x1^2']

# 특성 변환하기

In [26]:
from sklearn.preprocessing import FunctionTransformer

# Three identical two-feature samples used by the transformer examples.
features = np.array([
    [2, 3],
    [2, 3],
    [2, 3],
])

def add_ten(x):
    """Return ``x`` plus ten (element-wise when ``x`` is an array or DataFrame)."""
    increment = 10
    return x + increment

# Wrap the plain function so it can be used like any scikit-learn transformer.
ten_transformer = FunctionTransformer(add_ten)

# transform is called directly, without a preceding fit.
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [27]:
import pandas as pd

# The same data as a DataFrame with named columns.
df = pd.DataFrame(features, columns = ['feature_1','feature_2'])

# pandas' apply gives the same result as the FunctionTransformer above.
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [28]:
# With validate=False the 1-D array is passed to add_ten as-is and a 1-D result
# comes back (see the output).
FunctionTransformer(add_ten,validate = False).transform(np.array([1,2,3]))

array([11, 12, 13])

In [29]:
from sklearn.compose import ColumnTransformer

def add_hundred(x):
    """Return ``x`` plus one hundred (element-wise when ``x`` is an array or DataFrame)."""
    increment = 100
    return x + increment

# Apply a different function to each column: add_ten to 'feature_1' and
# add_hundred to 'feature_2'. Each entry is (name, transformer, columns).
ct = ColumnTransformer([("add_ten",FunctionTransformer(add_ten,validate = True),['feature_1']),
                       ("add_hundred",FunctionTransformer(add_hundred, validate = True), ['feature_2'])])

In [30]:
# Run both column-wise transformers; the result is a NumPy array (see the output).
ct.fit_transform(df)

array([[ 12, 103],
       [ 12, 103],
       [ 12, 103]])

# 이상치 감지하기

In [31]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Simulate ten 2-D points drawn around a single center.
features,_ = make_blobs(n_samples = 10, n_features = 2, centers =1, random_state =1)

# Turn the first sample into an obvious outlier.
features[0,0] = 10000
features[0,1] = 10000

# contamination: the expected proportion of outliers in the data.
# random_state pins the randomized robust-covariance fit so the cell gives the
# same result on every run.
outlier_detector = EllipticEnvelope(contamination = .1, random_state = 1)

outlier_detector.fit(features)

# predict labels outliers as -1 and inliers as 1 (see the output).
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [32]:
# Take the first column of the blob data as a 1-D feature.
# NOTE(review): this rebinds `feature`, which an earlier cell used for the
# min-max scaling example.
feature = features[:,0]

def indices_of_outliers(x, iqr_multiplier=1.5):
    """Return the indices of values lying outside the IQR fences.

    A value is an outlier when it is below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect (list, NumPy array, ...).
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where``: one index array per dimension of ``x``.
    """
    # Accept plain sequences as well as arrays; comparisons below need an ndarray.
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * iqr_multiplier)
    upper_bound = q3 + (iqr * iqr_multiplier)
    return np.where((x > upper_bound) | (x < lower_bound))

indices_of_outliers(feature)

(array([0], dtype=int64),)

# 이상치 다루기

In [33]:
# Small housing table; the last row is deliberately extreme in every column.
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

In [34]:
# Option 1: drop the outlier by keeping only rows with fewer than 20 bathrooms.
houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [35]:
# Option 2: keep every row and flag it instead (0 = fewer than 20 bathrooms, 1 = otherwise).
houses['Outlier'] = np.where(houses['Bathrooms'] < 20,0,1)

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [36]:
# Option 3: log-transform the feature to dampen the extreme value.
# np.log is vectorized, so the whole column is passed at once instead of
# looping over it in Python.
houses['Log_Of_Square_Feet'] = np.log(houses['Square_Feet'])

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


# 특성 이산화하기

In [37]:
from sklearn.preprocessing import Binarizer

# One feature column of ages.
age = np.array([[6],[12],[20],[36],[65]])

# threshold is passed by keyword: scikit-learn 1.0+ rejects it as a positional
# argument. Values greater than the threshold become 1, all others 0.
binarizer = Binarizer(threshold = 18)

binarizer.fit_transform(age)



array([[0],
       [0],
       [1],
       [1],
       [1]])

In [38]:
# Assign each age the index of its bin. By default the left edge of a bin is
# inclusive, so 20 lands in bin 1 (see the output).
np.digitize(age,bins = [20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [39]:
# right=True makes the right edge of a bin inclusive instead, so 20 now falls
# in bin 0 (see the output).
np.digitize(age,bins = [20,30,64], right = True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [40]:
# A single bin edge splits the values in two; for this data the result matches
# the Binarizer output with threshold 18.
np.digitize(age,bins = [18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

In [43]:
from sklearn.preprocessing import KBinsDiscretizer

# Four quantile-based bins, returned as ordinal bin indices.
kb = KBinsDiscretizer(n_bins = 4, encode = 'ordinal', strategy = 'quantile')
kb.fit_transform(age)

array([[0.],
       [1.],
       [2.],
       [3.],
       [3.]])

In [45]:
# Same quantile bins, but returned as a dense one-hot matrix (one column per bin).
kb = KBinsDiscretizer(n_bins = 4, encode = 'onehot-dense', strategy = 'quantile')
kb.fit_transform(age)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [46]:
# 'uniform' uses four equal-width bins over the value range instead of quantiles.
kb = KBinsDiscretizer(n_bins = 4, encode = 'onehot-dense', strategy = 'uniform')
kb.fit_transform(age)

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [48]:
# Inspect the bin edges learned by the most recently fitted discretizer
# (one array of edges per feature).
kb.bin_edges_

array([array([ 6.  , 20.75, 35.5 , 50.25, 65.  ])], dtype=object)

# 군집으로 샘플을 그룹으로 묶기

In [49]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Simulate 50 two-feature samples drawn around three centers.
features,_ = make_blobs(n_samples = 50, n_features = 2, centers = 3, random_state = 1)
dataframe = pd.DataFrame(features, columns = ['feature_1','feature_2'])

# n_init is set explicitly because its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3); pinning it keeps the
# clustering identical across versions.
clusterer = KMeans(n_clusters = 3, n_init = 10, random_state = 0)
clusterer.fit(features)

# Use the predicted cluster membership as a new categorical feature.
dataframe['group'] = clusterer.predict(features)

dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


# 누락된 값을 가진 샘플을 삭제하기

In [50]:
# Five samples with two features; the last sample is missing its first value.
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55.5],
])

# Keep only the rows in which no value is NaN.
features[~np.any(np.isnan(features), axis = 1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [51]:
# The pandas equivalent: dropna() returns a copy without the rows that contain
# missing values; `dataframe` itself is left unchanged.
dataframe = pd.DataFrame(features, columns = ['feature_1','feature_2'])
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


# 누락된 값 채우기

In [57]:
# NOTE(review): the recorded run of this cell failed on the fancyimpute import
# (TensorFlow/Keras incompatibility in that environment) - confirm the
# installed versions before relying on it.
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
# make_blobs lives in sklearn.datasets, not in sklearn.preprocessing.
from sklearn.datasets import make_blobs

# Simulate 1000 two-feature samples.
features,_ = make_blobs(n_samples = 1000, n_features = 2, random_state= 1)

ImportError: cannot import name 'get_registered_name' from 'tensorflow.python.keras.utils.generic_utils' (C:\Users\jlee0\anaconda3\lib\site-packages\tensorflow\python\keras\utils\generic_utils.py)

In [None]:
# Standardize first so both features contribute on the same scale.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Remember the true value, then knock it out to simulate a missing entry.
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Fill the missing entry using the 5 nearest neighbours.
features_knn_imputed = KNN(k=5, verbose = 0).fit_transform(standardized_features)

# Compare the real value with the imputed one.
print('실제 값 :',true_value)
print('대체된 값 :',features_knn_imputed[0,0])

In [None]:
# Imputer was removed from sklearn.preprocessing in scikit-learn 0.22.
# SimpleImputer replaces it; it always imputes column-wise, so there is no
# `axis` argument.
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy = 'mean')

# Impute the array that actually contains the NaN (standardized_features).
# `features` has no missing value and is on a different scale than true_value,
# so imputing it would make the comparison below meaningless.
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

# Compare the real value with the mean-imputed one.
print('실제 값 :',true_value)
print('대체된 값 :',features_mean_imputed[0,0])