# Transformation Pipelines
- github colab : https://homl.info/colab3

In [8]:
import matplotlib
import matplotlib.pyplot
import numpy
import pathlib
import pandas
import sklearn
import sklearn.base
import sklearn.compose
import sklearn.cluster
import sklearn.impute
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics.pairwise
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.utils.validation
import tarfile
import urllib

def ch2_load_housing_data():
    """Download (if needed), unpack and load the housing dataset.

    The tarball is fetched to ``datasets/housing.tgz`` when that file does not
    exist. It is unpacked into ``datasets/`` right after a download, and also
    whenever the extracted CSV is missing (e.g. the tarball was placed there
    by hand or a previous extraction was removed).

    Returns:
        pandas.DataFrame: contents of ``datasets/housing/housing.csv``.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    datasets_dir = pathlib.Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    downloaded = False
    if not tarball_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        downloaded = True

    # Always unpack a fresh download; otherwise only when the CSV is absent.
    if downloaded or not csv_path.is_file():
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pandas.read_csv(csv_path)

def matplotlib_to_imagefile(output_dir, filename, imgext="png", tight_layout=True, resolution=300):
    """Save the current matplotlib figure to ``output_dir/<filename>.<imgext>``.

    Args:
        output_dir: Directory to write into; joined with the file name using
            the ``/`` operator (a ``pathlib.Path`` in this file).
        filename: File name without extension.
        imgext: Used both as the file extension and as the ``format`` passed
            to ``savefig``.
        tight_layout: When true, ``tight_layout()`` is applied before saving.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    pyplot = matplotlib.pyplot
    target = output_dir / "{}.{}".format(filename, imgext)
    if tight_layout:
        pyplot.tight_layout()
    pyplot.savefig(target, format=imgext, dpi=resolution)
    
def stratified_sampling_income_category(input_dataframe):
    """Split the data 80/20, stratified on a binned ``median_income``.

    A temporary ``income_cat`` column (labels 1-5, cut at 1.5, 3.0, 4.5 and
    6.0) is added to a copy of the input, used as the stratification key for
    ``train_test_split`` (``random_state=42``), and removed again from both
    resulting sets. The caller's DataFrame is not modified.

    Args:
        input_dataframe: DataFrame containing a ``median_income`` column.

    Returns:
        tuple: ``(train, test)`` DataFrames without the ``income_cat`` column.
    """
    working = input_dataframe.copy()
    income_bins = [0., 1.5, 3.0, 4.5, 6., numpy.inf]
    income_labels = [1, 2, 3, 4, 5]
    working["income_cat"] = pandas.cut(
        working["median_income"], bins=income_bins, labels=income_labels
    )

    train_part, test_part = sklearn.model_selection.train_test_split(
        working,
        test_size=0.2,
        stratify=working['income_cat'],
        random_state=42,
    )

    # The helper column was only needed for stratification.
    return (
        train_part.drop(columns='income_cat'),
        test_part.drop(columns='income_cat'),
    )
    
# Directory where generated images are saved
output_dir = pathlib.Path("images", "end_to_end_project")
output_dir.mkdir(parents=True, exist_ok=True)
print(f'output_dir : {output_dir}')

input_dataframe = ch2_load_housing_data()
train, test = stratified_sampling_income_category(input_dataframe)

# Separate the features (predictor) from the target (label)
predictor = train.drop(columns='median_house_value')
label = train['median_house_value'].copy()

# Fill missing values in the numeric columns with each column's median,
# then wrap the resulting array back into a DataFrame
predictor_numtype = predictor.select_dtypes(include=[numpy.number])
imputer = sklearn.impute.SimpleImputer(strategy='median')
X = imputer.fit_transform(predictor_numtype)
predictor_numtype = pandas.DataFrame(
    data=X,
    index=predictor_numtype.index,
    columns=predictor_numtype.columns,
)

# One-hot encode the categorical column
onehot = sklearn.preprocessing.OneHotEncoder()
predictor_category = predictor[['ocean_proximity']]
predictor_category_1hot = onehot.fit_transform(predictor_category)

output_dir : images\end_to_end_project


In [18]:
# With this setting, scikit-learn estimators are rendered as interactive diagrams
sklearn.set_config(display='diagram')

num_pipeline = sklearn.pipeline.Pipeline(steps=[
    ('impute', sklearn.impute.SimpleImputer(strategy='median')),
    ('standardize', sklearn.preprocessing.StandardScaler()),
])

# Individual steps can be accessed by name like this
print(num_pipeline['impute'], num_pipeline['standardize'], sep='\n')

num_pipeline

SimpleImputer(strategy='median')
StandardScaler()


In [20]:
# If you do not want to name the steps yourself, make_pipeline builds the
# same pipeline from the estimators alone

num_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'),
    sklearn.preprocessing.StandardScaler(),
)

# The steps get auto-generated names (here 'simpleimputer' and
# 'standardscaler'), and those names can be used for access as well
print(num_pipeline['simpleimputer'], num_pipeline['standardscaler'], sep='\n')

num_pipeline

SimpleImputer(strategy='median')
StandardScaler()


In [11]:
# Passing data through the Pipeline calls fit_transform on each stage in turn

predictor_numtype_transformed = num_pipeline.fit_transform(predictor_numtype)
predictor_numtype_transformed

array([[-1.42303652,  1.0136059 ,  1.86111875, ...,  0.13746004,
         1.39481249, -0.93649149],
       [ 0.59639445, -0.702103  ,  0.90762971, ..., -0.69377062,
        -0.37348471,  1.17194198],
       [-1.2030985 ,  1.27611874,  0.35142777, ..., -0.78876841,
        -0.77572662, -0.75978881],
       ...,
       [ 1.25620853, -1.42870103, -1.23772062, ...,  1.26829911,
         0.67913534,  0.1010487 ],
       [ 0.58639727, -0.73960483,  0.66925745, ...,  0.27356264,
         0.88286825,  0.14539615],
       [-1.41803793,  0.94797769,  1.22545939, ..., -0.67915557,
        -0.75221898, -0.31034135]])

In [12]:
# To get a DataFrame back from the transformed array, reuse the original
# column labels and row index

predictor_numtype_trans_df1 = pandas.DataFrame(
    data=predictor_numtype_transformed,
    index=predictor_numtype.index,
    columns=predictor_numtype.columns,
)
predictor_numtype_trans_df1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
13096,-1.423037,1.013606,1.861119,0.311912,1.368167,0.137460,1.394812,-0.936491
14973,0.596394,-0.702103,0.907630,-0.308620,-0.435925,-0.693771,-0.373485,1.171942
3785,-1.203098,1.276119,0.351428,-0.712240,-0.760709,-0.788768,-0.775727,-0.759789
14689,1.231216,-0.884924,-0.919891,0.702262,0.742306,0.383175,0.731375,-0.850281
20507,0.711362,-0.875549,0.589800,0.790125,1.595753,0.444376,1.755263,-0.180365
...,...,...,...,...,...,...,...,...
14207,0.586397,-0.833359,0.987087,-0.184147,0.140152,-0.445315,0.060101,0.444041
13105,0.131525,0.319822,-0.443146,0.139847,0.128298,-0.005950,0.083608,-0.685630
19301,1.256209,-1.428701,-1.237721,0.586026,0.562134,1.268299,0.679135,0.101049
19121,0.586397,-0.739605,0.669257,0.522417,0.794461,0.273563,0.882868,0.145396


In [13]:
# This is how the book does it: the column names come from the pipeline's
# get_feature_names_out(). predictor_numtype_trans_df1 was built the way I
# had done it before; the two results appear to be identical

predictor_numtype_trans_df2 = pandas.DataFrame(
    data=predictor_numtype_transformed,
    index=predictor_numtype.index,
    columns=num_pipeline.get_feature_names_out(),
)
predictor_numtype_trans_df2

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
13096,-1.423037,1.013606,1.861119,0.311912,1.368167,0.137460,1.394812,-0.936491
14973,0.596394,-0.702103,0.907630,-0.308620,-0.435925,-0.693771,-0.373485,1.171942
3785,-1.203098,1.276119,0.351428,-0.712240,-0.760709,-0.788768,-0.775727,-0.759789
14689,1.231216,-0.884924,-0.919891,0.702262,0.742306,0.383175,0.731375,-0.850281
20507,0.711362,-0.875549,0.589800,0.790125,1.595753,0.444376,1.755263,-0.180365
...,...,...,...,...,...,...,...,...
14207,0.586397,-0.833359,0.987087,-0.184147,0.140152,-0.445315,0.060101,0.444041
13105,0.131525,0.319822,-0.443146,0.139847,0.128298,-0.005950,0.083608,-0.685630
19301,1.256209,-1.428701,-1.237721,0.586026,0.562134,1.268299,0.679135,0.101049
19121,0.586397,-0.739605,0.669257,0.522417,0.794461,0.273563,0.882868,0.145396


In [17]:
# The two differ slightly in type (a pandas Index vs. a NumPy ndarray),
# but they appear to hold the same column names
print(type(predictor_numtype.columns))
print(predictor_numtype.columns)
print(type(num_pipeline.get_feature_names_out()))
print(num_pipeline.get_feature_names_out())

<class 'pandas.core.indexes.base.Index'>
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')
<class 'numpy.ndarray'>
['longitude' 'latitude' 'housing_median_age' 'total_rooms'
 'total_bedrooms' 'population' 'households' 'median_income']


### ColumnTransformer
- 앞서 number인 부분과 category인 부분을 아예 matrix를 나눠서 처리하고 있었다
- ColumnTransformer를 사용하면 이것들을 묶어서 한꺼번에 처리할 수 있다

In [22]:
num_attribs = [
    "longitude", "latitude", "housing_median_age", "total_rooms",
    "total_bedrooms", "population", "households", "median_income",
]
cat_attribs = ["ocean_proximity"]

cat_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='most_frequent'),
    sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'),
)

# As seen earlier (p4_imputer_and_category_encoding.ipynb), OneHotEncoder
# returns a sparse matrix, while num_pipeline returns a dense matrix
# (NumPy array).
# When the output types are mixed like this, ColumnTransformer estimates the
# density of the final matrix: if it exceeds a threshold
# (default: sparse_threshold=0.3) a dense matrix is returned, otherwise a
# sparse matrix.
# The book's example returns a dense matrix. The 0.2 I expected is the density
# of the one-hot block alone (1 non-zero out of 5 category columns); the
# threshold presumably applies to the whole stacked output, where the 8 dense
# numeric columns count in full, i.e. about (8 + 1) / 13 ~ 0.69, which is
# above 0.3 -- TODO confirm against the ColumnTransformer documentation.

preprocessing = sklearn.compose.ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

In [23]:
# Listing every column by hand like that is tedious.
# Combining make_column_transformer with make_column_selector makes it easier:
# the columns are picked by dtype instead of by name

numeric_columns = sklearn.compose.make_column_selector(dtype_include=numpy.number)
object_columns = sklearn.compose.make_column_selector(dtype_include=object)

preprocessing = sklearn.compose.make_column_transformer(
    (num_pipeline, numeric_columns),
    (cat_pipeline, object_columns),
)

# With this in place, the whole fit_transform can be done in one call

predictor_transformed = preprocessing.fit_transform(predictor)

## Transformer 종합
### 지금까지 해온 것들을 하나의 Transformer로 만들기
- Numerical feature에서 NA/null은 median으로 impute한다. ML에서 NA/null은 있는 채로 놔두면 안 된다
- Categorical feature는 one-hot encoding을 쓸 것이다. ML에서 모든 것은 숫자로 표현되어야 한다
- 어떤 것들은 그냥 쓰기보다는 ratio로 볼 것이다 (bedrooms_ratio, rooms_per_house, people_per_house)
- Cluster similarity feature를 사용해서 latitude와 longitude를 대체한다
- Long tail을 가진 것들은 Log로 대체해서 가능한 bell-shaped로 만든다
- 모든 numerical feature는 최종적으로 standardize 해서 사용할 것이다

In [42]:
def column_ratio(X):
    """Return column 0 divided element-wise by column 1, as a single column.

    Indexing with a one-element list keeps the column axis, so for an
    ``N x 2`` array the result has shape ``N x 1``.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Used as the ``feature_names_out`` callable of a ``FunctionTransformer``.
    Why a list with a single entry: ``column_ratio`` produces one output
    column, so one name is needed. When the transformer runs inside a
    ``ColumnTransformer``, the transformer's own name is automatically put in
    front of the feature name, so the final names look like ``name__ratio``.

    Both arguments are ignored.
    """
    output_names = ['ratio']
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standardization.

    Each call returns a separate pipeline object.
    """
    steps = (
        sklearn.impute.SimpleImputer(strategy='median'),
        sklearn.preprocessing.FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        sklearn.preprocessing.StandardScaler(),
    )
    return sklearn.pipeline.make_pipeline(*steps)

class ClusterSimilarity(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Transformer mapping samples to RBF similarities to k-means centers.

    ``fit`` clusters the input with ``KMeans``; ``transform`` then computes
    the RBF kernel between each sample and every fitted cluster center, so
    the output has one column per cluster.

    Parameters:
        n_clusters: Number of clusters passed to ``KMeans``.
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: ``random_state`` passed to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` (optionally weighted) and store it as ``kmeans_``."""
        self.kmeans_ = sklearn.cluster.KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to each cluster center.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
        """
        # Fail with a clear "not fitted" error instead of an AttributeError
        # on the missing ``kmeans_`` attribute.
        sklearn.utils.validation.check_is_fitted(self)
        return sklearn.metrics.pairwise.rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster: ``"Cluster <i> similarity"``."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

#
# numpy.log takes an A x B matrix and returns an A x B matrix,
# so feeding it a 5-column matrix yields a 5-column matrix.
# To name the outputs, either pass a function that returns an array of
# length 5 (in the style of ratio_name above, but with 5 entries), or simply
# reuse the input names with 'one-to-one'.
#
log_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'),
    sklearn.preprocessing.FunctionTransformer(func=numpy.log, feature_names_out='one-to-one'),
    sklearn.preprocessing.StandardScaler(),
)

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

default_num_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(strategy='median'),
    sklearn.preprocessing.StandardScaler(),
)

preprocessing = sklearn.compose.ColumnTransformer(
    transformers=[
        # column_ratio takes an N x 2 matrix and returns an N x 1 matrix
        ('bedrooms', ratio_pipeline(), ['total_bedrooms', 'total_rooms']),
        ('rooms_per_house', ratio_pipeline(), ['total_rooms', 'households']),
        ('people_per_house', ratio_pipeline(), ['population', 'households']),
        # numpy.log keeps the shape: 5 input columns give 5 output columns
        ('log', log_pipeline, ['total_bedrooms', 'total_rooms', 'population','households','median_income']),
        ('geo', cluster_simil, ['latitude','longitude']),
        ('cat', cat_pipeline, sklearn.compose.make_column_selector(dtype_include=object)),
    ],
    # one column remaining: housing_median_age
    remainder=default_num_pipeline,
)

predictor_prepared = preprocessing.fit_transform(predictor)

print(predictor_prepared.shape)

preprocessing.get_feature_names_out()



(16512, 24)


array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)