# Метрики и валидация

Датасеты скачивать по ссылке: https://disk.yandex.ru/d/cwL3Ka4ECyQwpw

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import yaml

from matplotlib import pyplot as plt
from sklearn.linear_model import (
    ElasticNet,
    ElasticNetCV,
    Lasso,
    LassoCV,
    LinearRegression,
    LogisticRegression,
    Ridge,
    RidgeCV,
)
from sklearn.metrics import (
    RocCurveDisplay,
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    precision_recall_curve,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier

from houses_data_engineering import prepare_houses_data
from mushrooms_data_engineering import prepare_mushrooms_data
from wdbc_data_engineering import prepare_wdbc_data


# Seed passed as random_state to every stochastic step in this notebook.
SEED = 314159
# Fraction of rows kept for training; the remainder is the hold-out test set.
TRAIN_TEST_SPLIT = 0.80


# Dataset locations are read from the YAML config one directory up.
# The encoding is pinned so that non-ASCII paths decode identically on every
# platform instead of depending on the locale default.
with open("../config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

### Данные

Опустим этап инженерии данных: все три датасета уже были рассмотрены на предыдущих занятиях. Все необходимые трансформации данных вынесены в отдельные .py модули.

#### House prices

In [None]:
# Load the raw house-price table, split off the target and drop the row id.
df_houses_init = pd.read_csv(cfg["house_prices"]["train_dataset"])
y_houses = df_houses_init.pop("SalePrice")
df_houses_init.drop(columns="Id", inplace=True)

# Hold out the test part before the data-preparation step runs.
houses_split = train_test_split(
    df_houses_init, y_houses, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED
)
df_houses_train_init, df_houses_test, y_houses_train, y_houses_test = houses_split

# The transformations themselves live in a separate module; the call returns
# the prepared train/test features and targets.
(
    df_houses_train,
    df_houses_test,
    y_houses_train,
    y_houses_test,
) = prepare_houses_data(
    df_train=df_houses_train_init,
    df_test=df_houses_test,
    y_train=y_houses_train,
    y_test=y_houses_test,
)

#### WDBC

In [None]:
df_wdbc = pd.read_csv(cfg["wdbc"])

# Encode the diagnosis as 0 ("B") / 1 ("M") and keep it as a one-column frame.
# An explicit map + integer cast replaces DataFrame.replace, whose implicit
# object -> int downcasting is deprecated in pandas 2.2; an unexpected label
# now fails loudly at the cast instead of slipping through unchanged.
y_wdbc = df_wdbc["diagnosis"].map({"B": 0, "M": 1}).astype("int64").to_frame()
df_wdbc.drop(columns=["id", "Unnamed: 32", "diagnosis"], inplace=True)

# Hold out the test part before the data-preparation step runs.
df_wdbc_train, df_wdbc_test, y_wdbc_train, y_wdbc_test = train_test_split(
    df_wdbc, y_wdbc, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED
)

# The feature transformations live in a separate module.
df_wdbc_train, df_wdbc_test = prepare_wdbc_data(
    df_train=df_wdbc_train, df_test=df_wdbc_test
)

#### Mushrooms

In [None]:
df_mushrooms = pd.read_csv(cfg["mushrooms"])

# Encode the class as 0 ("e") / 1 ("p") and keep it as a one-column frame.
# An explicit map + integer cast replaces DataFrame.replace, whose implicit
# object -> int downcasting is deprecated in pandas 2.2; an unexpected label
# now fails loudly at the cast instead of slipping through unchanged.
y_mushrooms = df_mushrooms["class"].map({"e": 0, "p": 1}).astype("int64").to_frame()
df_mushrooms.drop(columns=["class"], inplace=True)

# Hold out the test part before the data-preparation step runs.
(
    df_mushrooms_train,
    df_mushrooms_test,
    y_mushrooms_train,
    y_mushrooms_test,
) = train_test_split(
    df_mushrooms, y_mushrooms, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED
)

# The feature transformations live in a separate module.
df_mushrooms_train, df_mushrooms_test = prepare_mushrooms_data(
    df_train=df_mushrooms_train, df_test=df_mushrooms_test
)

### Моделирование и анализ

Для начала рассмотрим API sklearn для проведения кросс-валидации.

Гайд по схемам кросс-валидации, реализованным в sklearn: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

API: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection

Основные функции для проведения кросс-валидации: `cross_validate`, `cross_val_predict` и `cross_val_score`.

Важно отметить аргумент `cv`: это может быть как целое количество фолдов (по сути _K_), один из рассмотренных CV-splitter'ов, так и кастомный итератор, который возвращает индексы для тренировочной и тестовой части разбиения.

In [None]:
# Repeated 5-fold CV (3 repeats) of a plain linear regression, collecting two
# metrics at once. The splitter is seeded so reruns produce the same folds,
# matching how SEED is used elsewhere in the notebook.
linreg_cv_res = cross_validate(
    estimator=LinearRegression(),
    X=df_houses_train,
    y=y_houses_train,
    scoring=["r2", "neg_mean_absolute_percentage_error"],
    cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED),
)

`cross_validate` возвращает не только значения метрик (можно передавать целый список), но и время, затраченное на обучение и вычисление метрик.

In [None]:
# Show the result returned by cross_validate above.
linreg_cv_res

In [None]:
# Out-of-fold predictions of a linear regression; no cv argument is passed,
# so the library default splitting is used.
linreg_cv_pred_res = cross_val_predict(
    LinearRegression(), df_houses_train, y_houses_train
)

Как уже было отмечено, для k-fold CV каждое наблюдение попадает в отложенную часть лишь раз. `cross_val_predict` возвращает эти предсказания.

In [None]:
# Show the shape of the cross_val_predict output together with the values.
linreg_cv_pred_res.shape, linreg_cv_pred_res

In [None]:
# R2 of a linear regression on each of 5 folds.
linreg_cv_score_res = cross_val_score(
    LinearRegression(),
    df_houses_train,
    y_houses_train,
    scoring="r2",
    cv=5,
)

`cross_val_score` возвращает значения (лишь одной) метрики на каждом фолде кросс-валидации.

In [None]:
# Show the per-fold R2 scores computed above.
linreg_cv_score_res

Что касается применения кросс-валидации для подбора гиперпараметров моделей, то в sklearn реализованы:
- `GridSearchCV` -- все наборы из декартова произведения значений для каждого параметра.
- `ParameterGrid` -- задание набора параметров.
- `RandomizedSearchCV` -- сэмплирование параметров с учетом ограничения на количество вариантов.

Пример: 
```python
from scipy.stats import uniform
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
```

`RandomizedSearchCV` практически всегда предпочтительнее `GridSearchCV`. (привести пример с квадратом)

За более сложными способами поиска гиперпараметров вроде байесовской оптимизации придется идти в другие библиотеки: hyperopt, optuna, etc.

In [None]:
# Exhaustive search over the Lasso regularisation strength, scored by R2.
lasso_gs_cv = GridSearchCV(
    Lasso(random_state=SEED),
    {"alpha": [0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 10.0]},
    scoring="r2",
    n_jobs=4,
)
# Warnings raised in this process during the fit are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    lasso_gs_cv.fit(X=df_houses_train, y=y_houses_train)

In [None]:
# Show the full cross-validation results recorded by the grid search.
lasso_gs_cv.cv_results_

In [None]:
# Show the winning alpha and the cross-validated R2 it achieved.
lasso_gs_cv.best_params_, lasso_gs_cv.best_score_

В sklearn большинство линейных моделей имеют CV-версии, которые по своей сути являются синтаксическим сахаром для запуска `GridSearchCV`.

In [None]:
# Baseline models with fixed hyperparameters.
linreg = LinearRegression()
ridge_001 = Ridge(alpha=0.01, random_state=SEED)
lasso_1 = Lasso(alpha=1.0, random_state=SEED)
elastic_net_1_1 = ElasticNet(alpha=1.0, l1_ratio=1.0, random_state=SEED)

# The same model families with built-in cross-validation. The splitter is
# seeded so the selected regularisation is reproducible between runs; with an
# integer random_state every split() call yields the same folds, so a single
# instance can be shared by all three estimators.
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED)
lasso_cv = LassoCV(cv=repeated_cv, random_state=SEED)
ridge_cv = RidgeCV(cv=repeated_cv)
elastic_net_cv = ElasticNetCV(cv=repeated_cv, random_state=SEED, max_iter=10000)
# After the first run, add l1_ratio=1.0, alphas=[0.01, 0.1, 1.0]

# Name -> estimator mapping; insertion order defines the order of later tables.
models = {
    "linreg": linreg,
    "ridge_001": ridge_001,
    "ridge_cv": ridge_cv,
    "lasso_1": lasso_1,
    "lasso_cv": lasso_cv,
    "elastic_net_1_1": elastic_net_1_1,
    "elastic_net_cv": elastic_net_cv,
}

# Warnings raised while fitting are suppressed for the whole loop.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for model in models.values():
        model.fit(df_houses_train, y_houses_train)

In [None]:
# Test-set predictions of every fitted model, keyed by model name.
y_houses_preds = {
    name: fitted.predict(df_houses_test) for name, fitted in models.items()
}

# One row per model: its name followed by RMSE, MAE, R2 and MAPE on the test set.
data = [
    [
        name,
        np.sqrt(mean_squared_error(y_true=y_houses_test, y_pred=preds)),
        mean_absolute_error(y_true=y_houses_test, y_pred=preds),
        r2_score(y_true=y_houses_test, y_pred=preds),
        mean_absolute_percentage_error(y_true=y_houses_test, y_pred=preds),
    ]
    for name, preds in y_houses_preds.items()
]

metric_columns = ["model_name", "RMSE", "MAE", "R2", "MAPE"]
df_res = pd.DataFrame(data, columns=metric_columns)

In [None]:
# Rank the models by test-set R2, best first.
df_res.sort_values(by="R2", ascending=False)

#### WDBC

In [None]:
# Randomised hyperparameter search for kNN on WDBC, scored by F1.
# error_score="raise" makes a failing candidate abort the search.
knn_wdbc_cv = RandomizedSearchCV(
    KNeighborsClassifier(),
    {
        "n_neighbors": [1, 3, 5, 7, 10, 15, 20, 25, 50],
        "metric": ["euclidean"],
        "weights": ["uniform", "distance"],
    },
    scoring="f1",
    error_score="raise",
    random_state=SEED,
    n_jobs=4,
)
# Plain arrays are passed: features as a 2-D array, the target flattened to 1-D.
knn_wdbc_cv.fit(df_wdbc_train.values, y_wdbc_train.values.ravel())

(knn_wdbc_cv.best_params_, knn_wdbc_cv.best_score_)

In [None]:
# F1 on the hold-out set for the kNN selected by the search.
knn_wdbc_cv_test_pred = knn_wdbc_cv.predict(df_wdbc_test.values)
print("kNN:", f1_score(y_true=y_wdbc_test, y_pred=knn_wdbc_cv_test_pred))

Для сравнения — набор гиперпараметров, который использовался на прошлом занятии.

In [None]:
# Reference kNN with hand-picked hyperparameters, evaluated by F1 on the test set.
knn_wdbc = KNeighborsClassifier(
    n_neighbors=5,
    weights="uniform",
    algorithm="brute",
    metric="euclidean",
)
knn_wdbc.fit(df_wdbc_train.values, y_wdbc_train.values.ravel())
y_wdbc_knn_pred = knn_wdbc.predict(df_wdbc_test.values)
wdbc_knn_f1 = f1_score(y_true=y_wdbc_test, y_pred=y_wdbc_knn_pred)
print("kNN:", wdbc_knn_f1)

In [None]:
# Randomised search over elastic-net logistic regression on WDBC, scored by F1.
# error_score="raise" makes a failing candidate abort the search.
logreg_wdbc_cv = RandomizedSearchCV(
    LogisticRegression(random_state=SEED),
    {
        "penalty": ["elasticnet"],
        "C": [0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 10.0],
        "l1_ratio": [0.0, 0.001, 0.01, 0.1, 1.0],
        "solver": ["saga"],
    },
    scoring="f1",
    error_score="raise",
    random_state=SEED,
    n_jobs=4,
)
# Plain arrays are passed: features as a 2-D array, the target flattened to 1-D.
logreg_wdbc_cv.fit(df_wdbc_train.values, y_wdbc_train.values.ravel())

(logreg_wdbc_cv.best_params_, logreg_wdbc_cv.best_score_)

In [None]:
# F1 on the hold-out set for the logistic regression selected by the search.
logreg_wdbc_cv_test_pred = logreg_wdbc_cv.predict(df_wdbc_test.values)
print("logreg:", f1_score(y_true=y_wdbc_test, y_pred=logreg_wdbc_cv_test_pred))

In [None]:
# Reference logistic regression with hand-picked hyperparameters.
# random_state is set because the saga solver shuffles the data; without it
# the reported F1 can differ between runs.
logreg_wdbc = LogisticRegression(
    C=1.0, penalty="elasticnet", l1_ratio=0.1, solver="saga", random_state=SEED
)
logreg_wdbc.fit(X=df_wdbc_train, y=y_wdbc_train.values.reshape(-1))

# NOTE(review): the original cell carried the remark
# "AttributeError: 'Flags' object has no attribute 'c_contiguous'" at this
# point -- presumably an error seen when predicting on a DataFrame; confirm.
y_wdbc_logreg_pred = logreg_wdbc.predict(df_wdbc_test)

print("logreg:", f1_score(y_true=y_wdbc_test, y_pred=y_wdbc_logreg_pred))

#### Mushrooms

In [None]:
# Randomised hyperparameter search for kNN on the mushrooms data, scored by F1.
# error_score="raise" makes a failing candidate abort the search.
knn_mushrooms_cv = RandomizedSearchCV(
    KNeighborsClassifier(),
    {
        "n_neighbors": [1, 3, 5, 7, 10, 15, 20, 25, 50],
        "metric": ["euclidean"],
        "weights": ["uniform", "distance"],
    },
    scoring="f1",
    error_score="raise",
    random_state=SEED,
    n_jobs=4,
)
# Plain arrays are passed: features as a 2-D array, the target flattened to 1-D.
knn_mushrooms_cv.fit(df_mushrooms_train.values, y_mushrooms_train.values.ravel())

(knn_mushrooms_cv.best_params_, knn_mushrooms_cv.best_score_)

In [None]:
# F1 on the hold-out set for the kNN selected by the search.
knn_mushrooms_cv_test_pred = knn_mushrooms_cv.predict(X=df_mushrooms_test.values)
print("kNN:", f1_score(y_true=y_mushrooms_test, y_pred=knn_mushrooms_cv_test_pred))

In [None]:
# Reference kNN with hand-picked hyperparameters (cosine distance, brute force).
knn_mushrooms = KNeighborsClassifier(
    n_neighbors=5, metric="cosine", algorithm="brute", weights="uniform",
)
# Fit on the plain array, matching what predict() receives below; fitting on
# the DataFrame and predicting on .values mixes inputs with and without
# feature names.
knn_mushrooms.fit(
    X=df_mushrooms_train.values, y=y_mushrooms_train.values.reshape(-1)
)
y_mushrooms_knn_pred = knn_mushrooms.predict(X=df_mushrooms_test.values)
print("kNN:", f1_score(y_true=y_mushrooms_test, y_pred=y_mushrooms_knn_pred))

In [None]:
# Randomised search over elastic-net logistic regression on the mushrooms
# data, scored by F1. error_score="raise" makes a failing candidate abort
# the search.
logreg_mushrooms_cv = RandomizedSearchCV(
    LogisticRegression(random_state=SEED),
    {
        "penalty": ["elasticnet"],
        "C": [0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 10.0],
        "l1_ratio": [0.0, 0.001, 0.01, 0.1, 1.0],
        "solver": ["saga"],
    },
    scoring="f1",
    error_score="raise",
    random_state=SEED,
    n_jobs=4,
)
# Plain arrays are passed: features as a 2-D array, the target flattened to 1-D.
logreg_mushrooms_cv.fit(
    df_mushrooms_train.values, y_mushrooms_train.values.ravel()
)

(logreg_mushrooms_cv.best_params_, logreg_mushrooms_cv.best_score_)

In [None]:
# F1 on the hold-out set for the logistic regression selected by the search.
logreg_mushrooms_cv_test_pred = logreg_mushrooms_cv.predict(df_mushrooms_test.values)
print(
    "logreg:",
    f1_score(y_true=y_mushrooms_test, y_pred=logreg_mushrooms_cv_test_pred),
)

In [None]:
# Reference logistic regression with hand-picked hyperparameters.
logreg_mushrooms = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    C=1.0,
    l1_ratio=0.1,
    random_state=SEED,
)
logreg_mushrooms.fit(df_mushrooms_train, y_mushrooms_train.values.ravel())

# NOTE(review): the original cell carried the remark
# "AttributeError: 'Flags' object has no attribute 'c_contiguous'" at this
# point -- presumably an error seen when predicting on a DataFrame; confirm.
y_mushrooms_pred = logreg_mushrooms.predict(df_mushrooms_test)
mushrooms_logreg_f1 = f1_score(y_true=y_mushrooms_test, y_pred=y_mushrooms_pred)
print("logreg:", mushrooms_logreg_f1)

#### Метрики классификации

In [None]:
# Hard labels and class probabilities on the WDBC hold-out set, both taken
# from the logistic regression selected by the randomised search.
y_logreg_wdbc_pred = logreg_wdbc_cv.predict(X=df_wdbc_test.values)
y_logreg_wdbc_pred_probas = logreg_wdbc_cv.predict_proba(X=df_wdbc_test.values)

In [None]:
# Confusion matrix on the hold-out set (rows: true class, columns: predicted).
wdbc_confusion = confusion_matrix(y_wdbc_test, y_logreg_wdbc_pred)
print(wdbc_confusion)

In [None]:
# Per-class precision / recall / F1 summary on the hold-out set.
wdbc_report = classification_report(y_wdbc_test, y_logreg_wdbc_pred)
print(wdbc_report)

In [None]:
# Individual classification metrics on the WDBC hold-out set.
print("accuracy:", accuracy_score(y_true=y_wdbc_test, y_pred=y_logreg_wdbc_pred))
print(
    "balanced accuracy:",
    balanced_accuracy_score(y_true=y_wdbc_test, y_pred=y_logreg_wdbc_pred),
)
print("precision:", precision_score(y_true=y_wdbc_test, y_pred=y_logreg_wdbc_pred))
# The recall line previously printed balanced accuracy under the "recall" label.
print("recall:", recall_score(y_true=y_wdbc_test, y_pred=y_logreg_wdbc_pred))
print("f1:", f1_score(y_true=y_wdbc_test, y_pred=y_logreg_wdbc_pred))
# ROC AUC is computed from the predicted probability of the positive class.
print(
    "roc auc:",
    roc_auc_score(y_true=y_wdbc_test, y_score=y_logreg_wdbc_pred_probas[:, 1]),
)

In [None]:
# ROC curve from the positive-class probabilities, with the chance-level line.
positive_class_scores = y_logreg_wdbc_pred_probas[:, 1]
RocCurveDisplay.from_predictions(
    y_wdbc_test,
    positive_class_scores,
    plot_chance_level=True,
    color="darkorange",
)
# Equal axis scaling plus explicit axis labels and legend on the current axes.
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()