In [1]:
# импортирование основных библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# отключение назойливых варнингов
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.externals import joblib

In [3]:
red = pd.read_csv('winequality-red.csv', sep=';')
white = pd.read_csv('winequality-white.csv', sep=';')

red['color'] = 'red'
white['color'] = 'white'

df = pd.concat([red, white])
df.index = pd.RangeIndex(len(df))

# выполним dummy кодирование цвета
map_ = {'red': 0, 'white': 1}
df['color'] = df['color'].map(map_)

logs = ['residual sugar', 'free sulfur dioxide', 'sulphates', 'chlorides', 'fixed acidity',
        'volatile acidity']
sqrts = ['citric acid']
df[logs] = df[logs].apply(np.log)
df[sqrts] = df[sqrts].apply(np.sqrt)

scaler = StandardScaler()
X = df.drop(columns='quality').values
y = df['quality'].values
X, y = shuffle(X, y)

X_scaled = scaler.fit_transform(X)

In [4]:
params = {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 450}
gb = GradientBoostingRegressor(**params)

In [5]:
def evaluate_model(model, X, y):
    X, y = shuffle(X, y)
    print(cross_val_score(model, X, y, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    predict = np.round(predict)
    print('accuracy = ', accuracy_score(y_test, predict))
    return y_test, predict

In [6]:
evaluate_model(gb, X_scaled, y)

[-0.40553532 -0.39701766 -0.39119114 -0.3813505  -0.40018966]
accuracy =  0.6712820512820513


(array([5, 5, 6, ..., 5, 5, 7], dtype=int64),
 array([5., 6., 6., ..., 5., 6., 5.]))

реализуем метрику для mae для округлённых значений

In [7]:
def round_mae(model, X, y):
    pred = model.predict(X)
    pred = np.round(pred)
    return mean_absolute_error(y, pred)

In [8]:
cross_val_score(gb, X_scaled, y, scoring=round_mae, cv=5, n_jobs=-1)

array([0.35769231, 0.35384615, 0.34180139, 0.36720554, 0.36104696])

In [9]:
kf = KFold(n_splits=5)
pred_ = []
y_true_ = []
 
KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in tqdm_notebook(kf.split(X_scaled)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gb.fit(X_train, y_train)
    pred = gb.predict(X_test)
    pred = np.round(pred)
    
    pred_.extend(pred)
    y_true_.extend(y_test)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
pred_ = np.array(pred_).reshape(-1)
y_true_ = np.array(y_true_).reshape(-1)

In [11]:
(4434+1856)/len(pred_)

0.9681391411420656

In [12]:
np.unique(np.abs(pred_ - y_true_), return_counts=True)

(array([0., 1., 2., 3., 4.]),
 array([4417, 1889,  171,   19,    1], dtype=int64))

По этой метрике видно, что сколько раз на сколько мы ошиибаемся. В 96% случаев мы либо не ошибаемся, либо огибаемся на 1 уровень. Остальные 4% процента когда модель ошибается более чем на 1 значение по шкале.

Сохраняем модели

In [15]:
gb.fit(X_scaled, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=9, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=450, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
scaler_filename = 'scaler'
joblib.dump(scaler, scaler_filename) 

['scaler']

In [17]:
model_filename = 'model'
joblib.dump(gb, model_filename)

['model']

In [18]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'color'],
      dtype='object')