<a href="https://colab.research.google.com/github/ciepielajan/sklearn/blob/main/sklearn_Pipeline_ColumnTransformer_One_Hot_Encoder_Min_Max_Scaler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [2]:
# Download the abalone dataset. The CSV has no header row, so header=None
# makes pandas label the columns with integers.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/abalone.csv'
dataframe = read_csv(url, header=None)
dataframe.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


# przygotowanie danych do trenowania i trenowanie modelu

In [3]:
# The target is the last column; every other column is an input feature.
last_ix = dataframe.shape[1] - 1
X = dataframe.drop(columns=last_ix)
y = dataframe[last_ix]
print(X.shape, y.shape)

(4177, 8) (4177,)


In [4]:
# Feature columns keep the integer labels assigned when the CSV was read with header=None.
X.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [5]:
# Split the feature columns by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_ix = X.select_dtypes(
    include=['object', 'bool'],
).columns
numerical_ix = X.select_dtypes(
    include=['int64', 'float64'],
).columns

# Show which column labels landed in each group (numerical first, then categorical).
print("id kolumn wg podziału na kategoryczne i liczbowe:")
print(numerical_ix, categorical_ix)

id kolumn wg podziału na kategoryczne i liczbowe:
Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64') Int64Index([0], dtype='int64')


In [6]:
# Describe the preprocessing for each column group as (name, transformer, columns).
t = [
    # one-hot encode the categorical columns
    ('cat', OneHotEncoder(), categorical_ix),
    # rescale the numerical columns with MinMaxScaler
    ('num', MinMaxScaler(), numerical_ix),
]
# Only builds the transformer; nothing is fitted or transformed in this cell.
col_transform = ColumnTransformer(t)
col_transform

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=True),
                                 Int64Index([0], dtype='int64')),
                                ('num',
                                 MinMaxScaler(copy=True, feature_range=(0, 1)),
                                 Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64'))],
                  verbose=False)

In [7]:
# Regressor: support vector regression with an RBF kernel.
model = SVR(
    C=100,
    kernel='rbf',
    gamma='scale',
)
model

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [8]:
# Chain the column preprocessing ('prep') and the regressor ('m') into one
# estimator, so both are fitted and applied together.
pipeline = Pipeline([
    ('prep', col_transform),
    ('m', model),
])
pipeline

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  Int64Index([0], dtype='int64')),
                                                 ('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0

In [9]:
# Fit the preprocessing steps and the SVR on the full dataset in one call.
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  Int64Index([0], dtype='int64')),
                                                 ('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0

In [10]:
# Scored on the same X, y the pipeline was fitted on, so this is a training-set
# score and not an estimate of performance on unseen data.
pipeline.score(X, y)

0.5810459033343183

# zapisanie plików niezbędnych do zastosowania modelu na produkcji

In [11]:
import pickle

In [12]:
# Persist the whole fitted pipeline (preprocessing + model) to a single file.
with open('pipeline.pickle', mode='wb') as pickle_file:
    pickle.dump(pipeline, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Remove the in-memory pipeline so the following cells can only work with the
# copy restored from 'pipeline.pickle'.
del pipeline

# produkcja

`wczytanie plików`

In [14]:
# Restore the fitted pipeline from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source (here: the file written earlier in this notebook).
with open('pipeline.pickle', mode='rb') as pickle_file:
    pipeline = pickle.load(pickle_file)

pipeline

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  Int64Index([0], dtype='int64')),
                                                 ('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0

`predykcja na nowych danych`

In [15]:
# Simulate incoming production data: take rows at positions 5..9 and keep only
# the feature columns 0..7 (the target column 8 is left out).
# .copy() makes this an independent frame; without it the result is derived
# from a slice of `dataframe`, and adding a column to it later triggers
# pandas' SettingWithCopyWarning.
feature_columns = [0, 1, 2, 3, 4, 5, 6, 7]
dataframe_new = dataframe.iloc[5:10][feature_columns].copy()
dataframe_new

Unnamed: 0,0,1,2,3,4,5,6,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32


In [16]:
# Run the restored pipeline (preprocessing, then the regressor) on the new rows.
result = pipeline.predict(dataframe_new)
result

array([ 7.5371649 , 14.91287177, 11.43390762,  9.6712423 , 13.45737976])

In [17]:
# Attach the predictions as a new "8_predict" column.
# assign() returns a new frame instead of writing into the existing one, which
# avoids the SettingWithCopyWarning that in-place column assignment can raise
# on a frame derived from another frame, and keeps this cell safe to re-run.
dataframe_new = dataframe_new.assign(**{"8_predict": result})
dataframe_new

Unnamed: 0,0,1,2,3,4,5,6,7,8_predict
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,7.537165
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,14.912872
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,11.433908
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9.671242
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,13.45738
