In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report, accuracy_score
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Константы
my_seed = 10

# Загрузка данных
url = "https://raw.githubusercontent.com/ania607/ML/refs/heads/main/data/winequality-red_for_lab.csv"
DF_raw = pd.read_csv(url)


print("Размерность данных:", DF_raw.shape)
print("\nПервые 5 строк:")
print(DF_raw.head())
print("\nИнформация о данных:")
print(DF_raw.info())
print("\nСтатистика:")
print(DF_raw.describe())


print("\nРаспределение целевой переменной (quality):")
print(DF_raw['quality'].value_counts().sort_index())

Размерность данных: (1599, 13)

Первые 5 строк:
   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Y  
0      9.4   

In [2]:
X = DF_raw.iloc[:, :-1]
y = DF_raw.iloc[:, -1]
if len(np.unique(y)) > 5:
    y = (y > 5).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=my_seed,
    stratify=y  # сохраняем пропорции классов
)

print(f"\nОбучающая выборка: {X_train.shape[0]}")
print(f"Тестовая выборка: {X_test.shape[0]}")
print(f"Распределение классов в обучающей выборке:\n{y_train.value_counts()}")


Обучающая выборка: 1359
Тестовая выборка: 240
Распределение классов в обучающей выборке:
Y
1    727
0    632
Name: count, dtype: int64


In [3]:
pipe_svm = make_pipeline(
    StandardScaler(),
    PCA(),
    SVC(random_state=my_seed)
)

param_grid_svm = [
    {
        'pca__n_components': [2, 3, 5, 7, None],
        'svc__C': [0.1, 1, 10, 100],
        'svc__kernel': ['linear', 'rbf'],
        'svc__gamma': ['scale', 'auto']
    }
]

kfold = KFold(n_splits=5, random_state=my_seed, shuffle=True)
grid_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid=param_grid_svm,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)
tic = time.time()
grid_svm.fit(X_train, y_train)
toc = time.time()
print(f"Лучшая точность SVM: {grid_svm.best_score_:.3f}")

Лучшая точность SVM: 1.000


In [4]:
pipe_rf = make_pipeline(
    StandardScaler(),
    PCA(),
    RandomForestClassifier(random_state=my_seed)
)
param_grid_rf = {
    'pca__n_components': [2, 3, 5, 7, None],
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],
    'randomforestclassifier__min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)
tic = time.time()
grid_rf.fit(X_train, y_train)
toc = time.time()
print(f"Лучшая точность Random Forest: {grid_rf.best_score_:.3f}")

Лучшая точность Random Forest: 0.986


In [5]:
models = {
    'SVM с PCA': grid_svm,
    'Random Forest с PCA': grid_rf
}

best_model_name = None
best_score = 0

for name, model in models.items():
    score = model.best_score_
    print(f"{name}: {score:.3f}")

    if score > best_score:
        best_score = score
        best_model_name = name
        best_model = model

print(f"\nЛучшая модель: {best_model_name} с точностью {best_score:.3f}")

SVM с PCA: 1.000
Random Forest с PCA: 0.986

Лучшая модель: SVM с PCA с точностью 1.000


In [6]:
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00       128

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240

