# Импорт библиотек

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Данные

In [3]:
# Read the dataset from a CSV file given by a relative path.
path = "winequality-red.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


Все переменные числовые, пропущенных (null) значений нет.

In [5]:
# Number of rows per value of the target column, most frequent first.
df["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

Таргетная переменная принимает шесть значений.  
Распределение по классам неравномерное.

# Обучение моделей

Выделим таргетную переменную и признаки.  

In [6]:
# Target vector: the `quality` column; feature matrix: every other column.
y = df["quality"]
X = df.drop(columns=["quality"])

Разобьём данные на трейн и тест, учитывая неравномерное распределение по классам.  

In [7]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in both parts; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=0
)

In [8]:
def test_model(model, X, y):
    """Score an already fitted classifier on one dataset.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X
        Feature matrix passed to both prediction methods.
    y
        True class labels corresponding to the rows of ``X``.

    Returns
    -------
    tuple
        ``(accuracy, f1, roc_auc)``: accuracy, weighted-average F1 and
        one-vs-one multiclass ROC AUC.
    """
    y_pred = model.predict(X)
    y_score = model.predict_proba(X)

    # predict_proba returns one column per class seen during fit. Handing those
    # classes to roc_auc_score via `labels` keeps it from raising when `y`
    # happens to miss a rare class (e.g. a small hold-out set). When the model
    # has no `classes_`, None reproduces the previous call exactly.
    class_labels = getattr(model, "classes_", None)

    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average="weighted")
    roc_auc = roc_auc_score(y, y_score, multi_class="ovo", labels=class_labels)

    return accuracy, f1, roc_auc

In [9]:
# Baseline classifiers with library-default hyper-parameters. A fixed
# random_state is passed to every estimator except KNeighborsClassifier,
# which is constructed without arguments.
# NOTE(review): scikit-learn documents that the "sag" solver converges quickly
# only on features of similar scale; X is used unscaled here and warnings are
# silenced at the top of the notebook — confirm the fit actually converges.
lr_model = LogisticRegression(solver="sag", random_state=0)
rfc_model = RandomForestClassifier(random_state=0)
dtc_model = DecisionTreeClassifier(random_state=0)
gbc_model = GradientBoostingClassifier(random_state=0)
knn_model = KNeighborsClassifier()

In [10]:
# Empty results table: one row per (metric, data split) pair, no columns yet.
# Columns are added later, one per evaluated model.
metrics = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [["Accuracy", "F1", "ROCAUC"], ["train", "test"]]
    ),
)

In [11]:
# Fit every baseline on the training part and record its scores on both parts.
for model in [lr_model, rfc_model, dtc_model, gbc_model, knn_model]:
    model.fit(X_train, y_train)

    # Each call returns (accuracy, f1, roc_auc) for the given data part.
    train_scores = test_model(model, X_train, y_train)
    test_scores = test_model(model, X_test, y_test)

    # Key the column by the estimator's class name rather than by the estimator
    # object itself: re-running this cell then overwrites the same column
    # instead of appending a new one.
    # Rows are ordered (metric, split), so train/test values are interleaved:
    # acc_train, acc_test, f1_train, f1_test, rocauc_train, rocauc_test.
    metrics[type(model).__name__] = [
        score
        for train_test_pair in zip(train_scores, test_scores)
        for score in train_test_pair
    ]

In [12]:
# Label every column with the class name of the estimator it belongs to.
# Deriving the names from the existing column keys (instead of a hand-written
# list) keeps the labels in sync with the order in which the models were
# evaluated. Keys that are already strings are kept unchanged, so the cell can
# be re-run safely.
metrics.columns = [
    column if isinstance(column, str) else type(column).__name__
    for column in metrics.columns
]

In [13]:
# Display the table of train/test scores, one column per model.
metrics

Unnamed: 0,Unnamed: 1,LogisticRegression,RandomForestClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier
Accuracy,train,0.520719,1.0,1.0,0.888194,0.658327
Accuracy,test,0.509375,0.7125,0.603125,0.675,0.475
F1,train,0.473491,1.0,1.0,0.888276,0.643695
F1,test,0.464213,0.691064,0.59971,0.660774,0.458865
ROCAUC,train,0.636827,1.0,1.0,0.994049,0.903822
ROCAUC,test,0.587933,0.838521,0.57898,0.686573,0.588438


# Вывод

У всех моделей, кроме LogisticRegression и, в некоторой степени, KNeighborsClassifier, наблюдается переобучение: метрики на train существенно выше, чем на test.  
Самое лучшее качество на test у модели RandomForestClassifier.  
<br>
Для дальнейших шагов выберем две модели, у которых есть потенциал:    
- RandomForestClassifier — лучшее качество на test, нужно избежать переобучения;
- LogisticRegression — метрики на train и test отличаются незначительно, нужно повышать качество.