<a href="https://colab.research.google.com/github/dschloe/inflern_ml/blob/main/Scikit_Learn_Pipeline_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 개요
- Scikit-Learn Pipeline 라이브러리를 활용하여 머신러닝을 개발한다. 

# 데이터 불러오기

In [12]:
import pandas as pd
import numpy as np
data = pd.read_csv('https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  rentals     731 non-null    int64  
dtypes: float64(4), int64(9), object(1)
memory usage: 80.1+ KB


# 데이터 추출

In [13]:
cols = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'rentals']
data = data[cols]
data['mnth'].value_counts()

1     62
3     62
5     62
7     62
8     62
10    62
12    62
4     60
6     60
9     60
11    60
2     57
Name: mnth, dtype: int64

# 데이터셋 분리

In [14]:
from sklearn.model_selection import train_test_split
X = data.drop('rentals',axis=1)
y = data['rentals']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Pipeline 구축

In [15]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('ordEncoder', OrdinalEncoder())
])

onehot_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('oheEncoder', OneHotEncoder())                                   
])

# 수치형 데이터 및 Categorical 데이터 컬럼 분리

numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
ordinal_features = ['holiday', 'weekday', 'workingday', 'weathersit']
onehot_features  = ['season', 'mnth']

# numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
# categorical_features = data.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns

preprocessor = ColumnTransformer(
   transformers=[
     ('numeric', numeric_transformer, numeric_features)
   , ('ord_categorical', ordinal_transformer, ordinal_features)
   , ('ohe_categorical', onehot_transformer, onehot_features)
])

# 모델 개발 

In [16]:
from sklearn.ensemble import RandomForestRegressor

pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', RandomForestRegressor())
           ])

rf_model = pipeline.fit(X_train, y_train)
print(rf_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('ord_categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ordEncoder',
                                                                   OrdinalEncoder())]),
                 

# 모델 평가 

In [17]:
from sklearn.metrics import r2_score
predictions = rf_model.predict(X_test)
print (r2_score(y_test, predictions))

0.7704600824862011


# 다중 모형 개발 시

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

pipe_rf = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('rf', RandomForestRegressor())
           ])

pipe_dt = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('dt', DecisionTreeRegressor())
           ])

regressors = [pipe_rf, pipe_dt]
model_names = ["RandomForest", "Decition Tree"]
for idx, regressor in enumerate(regressors):
  regressor.fit(X_train, y_train)
  predictions = regressor.predict(X_test)
  print(f'{model_names[idx]}: r2 score:{r2_score(predictions, y_test)}')

RandomForest: r2 score:0.7528476003335457
Decition Tree: r2 score:0.6433342088873556


- 좀 더 쉽게 하면 아래와 같이 할 수 있다. 

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

regressors = [
    RandomForestRegressor()
   ,DecisionTreeRegressor()
]

# regressors = [pipe_rf, pipe_dt]
for regressor in regressors:
    pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',regressor)
           ])
    model = pipeline.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(regressor)
    print(f'Model r2 score:{r2_score(predictions, y_test)}')

RandomForestRegressor()
Model r2 score:0.7408932278672407
DecisionTreeRegressor()
Model r2 score:0.6386493540856627


In [41]:
pipeline.get_params()

{'memory': None, 'preprocessor': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler', StandardScaler())]),
                                  ['temp', 'atemp', 'hum', 'windspeed']),
                                 ('ord_categorical',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='constant')),
                                                  ('ordEncoder',
                                                   OrdinalEncoder())]),
                                  ['holiday', 'weekday', 'workingday',
                                   'weathersit']),
                                 ('ohe_categorical',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='constant')),
                      