# ML. Scikit-learn. Pipeline

#### Более подробную информацию по применению pipeline можно найти в ноутбуках Github Aleksander Milenkin либо в открытых лекциях OTUS на Youtube

In [1]:
import pandas as pd
import numpy as np

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
import sklearn
sklearn.set_config(display='diagram')

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_rows', 50)
pd.get_option('display.precision', 2)
pd.set_option('display.float_format',  '{:,}'.format)

In [5]:
file_path = '/content/drive/MyDrive/Datasets/data.csv'
data = pd.read_csv(file_path, sep=';')

In [6]:
data.head()

Unnamed: 0,city,manager,product,promo,prev_volume,percent,volume
0,c5,m2,pr7,0,13750,7,3465
1,c1,m4,pr8,1,25550,10,7678
2,c3,m1,pr7,1,12150,5,946
3,c1,m3,pr8,1,25350,15,6637
4,c5,m6,pr2,0,30100,4,4720


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city         146 non-null    object
 1   manager      146 non-null    object
 2   product      146 non-null    object
 3   promo        149 non-null    int64 
 4   prev_volume  149 non-null    int64 
 5   percent      149 non-null    int64 
 6   volume       149 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 8.3+ KB


In [8]:
threshold = 0.7
# Удаление столбцов с коэффициентом пропущенных значений выше порога
data = data[data.columns[data.isnull().mean() < threshold]]

# Удаление строк с коэффициентом отсутствия значений выше порога
data = data.loc[data.isnull().mean(axis=1) < threshold]

In [9]:
data = data.rename(columns={'volume': 'target'})

In [10]:
numerical_features = []
categorical_features = []

for i_col in data.columns:
    if is_numeric_dtype(data[i_col]):
      numerical_features.append(i_col)
    elif is_string_dtype(data[i_col]):
      categorical_features.append(i_col)

print(numerical_features)
print(categorical_features)

all_features = categorical_features + numerical_features
print("Все типы столбцов распознаны верно:", 
                                  len(all_features) == len(data.columns))

['promo', 'prev_volume', 'percent', 'target']
['city', 'manager', 'product']
Все типы столбцов распознаны верно: True


In [11]:
numerical_features.remove('target')

In [12]:
print(numerical_features)

['promo', 'prev_volume', 'percent']


In [13]:
# Удаление выбросов
# x = 3
# upper_lim = data['column'].mean () + data['column'].std () * x 
# lower_lim = data['column'].mean () - data['column'].std () * x
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

In [14]:
# Ограничение выбросов
for name_col in numerical_features:
  upper_lim = data[name_col].quantile(.95)
  lower_lim = data[name_col].quantile(.05)
  data.loc[(data[name_col] > upper_lim),name_col] = upper_lim
  data.loc[(data[name_col] < lower_lim),name_col] = lower_lim

In [15]:
data_train, data_test = train_test_split(data, test_size=0.2, shuffle=True,random_state=42)
print(data_train.shape, data_test.shape)

(119, 7) (30, 7)


In [16]:
data_train.head(7)

Unnamed: 0,city,manager,product,promo,prev_volume,percent,target
22,c5,m1,pr6,0,11650,2,155
15,c3,m4,pr5,1,59050,4,8404
65,c4,m2,pr6,1,7680,13,4836
11,c4,m3,,1,9600,12,4641
42,c1,m6,pr2,1,40100,6,3164
105,c5,,pr8,0,15150,5,1159
51,c3,m7,pr8,0,14150,11,6413


In [17]:
data_test.head(7)

Unnamed: 0,city,manager,product,promo,prev_volume,percent,target
73,c3,m7,pr4,0,79030,13,44759
18,c2,m3,pr5,1,57850,8,10296
117,c3,m7,pr4,0,79030,6,22463
78,c1,m4,pr10,0,21550,12,5560
76,c2,m7,pr1,0,18150,5,1706
31,c5,,pr2,0,30350,11,6911
64,c4,m2,pr7,1,9500,2,327


In [18]:
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_train_imp = imp.fit_transform(data_train[categorical_features])
data_train_imp[:10]

array([['c5', 'm1', 'pr6'],
       ['c3', 'm4', 'pr5'],
       ['c4', 'm2', 'pr6'],
       ['c4', 'm3', 'pr6'],
       ['c1', 'm6', 'pr2'],
       ['c5', 'm4', 'pr8'],
       ['c3', 'm7', 'pr8'],
       ['c4', 'm2', 'pr6'],
       ['c5', 'm6', 'pr2'],
       ['c4', 'm5', 'pr10']], dtype=object)

In [19]:
onehotencoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
data_train_ohe = onehotencoder.fit_transform(data_train_imp)

In [20]:
data_train_cat = pd.DataFrame(data_train_ohe,
                             columns=list(np.concatenate(onehotencoder.categories_, axis=0)))

In [21]:
data_train_cat.head()

Unnamed: 0,c1,c2,c3,c4,c5,m1,m2,m3,m4,m5,m6,m7,pr1,pr10,pr2,pr3,pr4,pr5,pr6,pr7,pr8,pr9
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
scaler = MinMaxScaler()
data_train_scaled = scaler.fit_transform(data_train[numerical_features])
data_train_scaled[:10]

array([[0.        , 0.05564121, 0.        ],
       [1.        , 0.71997197, 0.15384615],
       [1.        , 0.        , 0.84615385],
       [1.        , 0.0269096 , 0.76923077],
       [1.        , 0.45437982, 0.30769231],
       [0.        , 0.10469516, 0.23076923],
       [0.        , 0.09067975, 0.69230769],
       [1.        , 0.        , 0.23076923],
       [0.        , 0.31422565, 0.15384615],
       [0.        , 0.        , 0.61538462]])

In [23]:
data_train_num = pd.DataFrame(data_train_scaled, columns = list(scaler.feature_names_in_))

In [24]:
data_train_num.head()

Unnamed: 0,promo,prev_volume,percent
0,0.0,0.0556412053258584,0.0
1,1.0,0.7199719691660827,0.1538461538461538
2,1.0,0.0,0.8461538461538461
3,1.0,0.0269096005606166,0.7692307692307693
4,1.0,0.4543798177995795,0.3076923076923077


In [25]:
data_train_tramsformed = pd.concat([data_train_cat,
                                    data_train_num],axis=1)
data_train_tramsformed.head(7)

Unnamed: 0,c1,c2,c3,c4,c5,m1,m2,m3,m4,m5,m6,m7,pr1,pr10,pr2,pr3,pr4,pr5,pr6,pr7,pr8,pr9,promo,prev_volume,percent
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0556412053258584,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.7199719691660827,0.1538461538461538
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.8461538461538461
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0269096005606166,0.7692307692307693
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.4543798177995795,0.3076923076923077
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.1046951646811492,0.2307692307692307
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0906797477224947,0.6923076923076924


In [26]:
model_lr = LinearRegression()
model_lr.fit(data_train_tramsformed,data_train['target'])
y_pred_train = model_lr.predict(data_train_tramsformed)

In [27]:
print("MSE %.3f" % mean_squared_error(y_pred_train, data_train['target']))
print("MAE %.3f" % mean_absolute_error(y_pred_train,data_train['target']))
print("R2 Score %.3f" % r2_score(y_pred_train, data_train['target']))

MSE 14672464.921
MAE 2717.598
R2 Score 0.574


In [28]:
data_test_imp = imp.fit_transform(data_test[categorical_features])
data_test_ohe = onehotencoder.fit_transform(data_test_imp)
data_test_cat = pd.DataFrame(data_test_ohe,
                             columns=list(np.concatenate(onehotencoder.categories_, axis=0)))
data_test_scaled = scaler.fit_transform(data_test[numerical_features])
data_test_num = pd.DataFrame(data_test_scaled, columns = list(scaler.feature_names_in_))
data_test_tramsformed = pd.concat([data_test_cat,
                                    data_test_num],axis=1)

In [29]:
data_test_tramsformed.head(7)

Unnamed: 0,c1,c2,c3,c4,c5,m1,m2,m3,m4,m5,m6,m7,pr1,pr10,pr2,pr3,pr4,pr5,pr6,pr7,pr8,pr9,promo,prev_volume,percent
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.8461538461538461
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.7031534688156973,0.4615384615384615
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3076923076923077
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1943938332165382,0.7692307692307693
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1467414155571128,0.2307692307692307
5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317729502452698,0.6923076923076924
6,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0255080588647512,0.0


In [30]:
y_pred_test = model_lr.predict(data_test_tramsformed)

In [31]:
print("MSE %.3f" % mean_squared_error(y_pred_test, data_test['target']))
print("MAE %.3f" % mean_absolute_error(y_pred_test,data_test['target']))
print("R2 Score %.3f" % r2_score(y_pred_test, data_test['target']))

MSE 88022400.161
MAE 5632.391
R2 Score -0.516


## Pipeline

In [38]:
numerical_transformer = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("onehot", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

data_transformer = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features),
])

preprocessor = Pipeline(steps=[("data_transformer", data_transformer)])

l_reg_pipline = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("l_reg", LinearRegression())])

l_reg_pipline

In [33]:
all_features

['city', 'manager', 'product', 'promo', 'prev_volume', 'percent', 'target']

In [34]:
all_features.remove('target')

In [39]:
l_reg_pipline.fit(data_train[all_features], data_train['target'])

In [40]:
y_pred_test = l_reg_pipline.predict(data_test[all_features])

In [37]:
print("MSE %.3f" % mean_squared_error(y_pred_test, data_test['target']))
print("MAE %.3f" % mean_absolute_error(y_pred_test,data_test['target']))
print("R2 Score %.3f" % r2_score(y_pred_test, data_test['target']))

MSE 87653945.852
MAE 5568.482
R2 Score -0.517
