# Задание

Решите задачу классификации по вашему варианту из предыдущей работы следующими тремя методами:
- метод опорных векторов
- дерево принятия решений
- случайный лес

При решении задачи подберите с использованием кросс-проверки следующие параметры алгоритмов:
-  параметр C для SVM
- глубину дерева для дерева принятия решений
- количество оценивателей для случайного леса

Как отработали методы для вашей задачи?
Почему получились такие результаты?

# Теоретические разделы и контрольные вопросы для подготовки к защите

1. Понятие отступа классификатора
1. Максимизация отступа и её отражение в функционале ошибки
1. Функционал ошибки метода опорных векторов и его отличия от логистической регрессии
1. Регулировка размера отступа в SVM
1. Математическое обоснование SVM с точки зрения векторных операций и границы решения
1. Ядра SVM
1. Классификация с использованием опорных точек
1. Факторы, влияющие на выбор ядра и его параметров
1. Деревья принятия решений и основные термины
1. Способ работы готового дерева для решения задач регрессии и классификации
1. Этапы построения дерева
1. Критерии ветвления (помимо основ нужно знать популярные критерии и использующие их методы — это очень широкий вопрос, в рамках которого могут попросить рассказать о каком-то критерии в отдельности)
1. Правила остановки
1. Оптимизация уже построенного дерева
1. Недообучение и переобучение деревьев
1. Композиции алгоритмов и предпосылки
1. Метод простого голосования
1. Бэггинг
1. Случайный лес
1. Стэкинг
1. Бустинг
1. Градиентный бустинг: формальная постановка задачи, этапы работы
1. Переобучение градиентного бустинга и методы борьбы с ним
1. Градиентный бустинг в задачах регрессии и классификации

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pandas as pd

# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')
# Switch matplotlib plots to seaborn's default theme.
sns.set()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the red-wine quality table; the file uses ';' as its field separator.
csv_path = '/content/drive/MyDrive/Academic/ML/lab5/data/winequality-red.csv'
data = pd.read_csv(csv_path, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Turn the numeric quality score into a two-class label.
# pd.cut uses right-closed intervals by default, so scores in (2, 6.5]
# are labelled 'bad' and scores in (6.5, 8] are labelled 'good'.
group_names = ['bad', 'good']
bins = (2, 6.5, 8)
quality_scores = data['quality']
data['quality'] = pd.cut(quality_scores, bins=bins, labels=group_names)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,bad
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


In [None]:
# Encode the textual class labels as integers: 'bad' -> 0, 'good' -> 1.
label_codes = {'bad': 0, 'good': 1}
data['quality'] = data['quality'].map(label_codes)
data.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,0


In [None]:
# The target is the binary quality label; the features are every other column.
y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows for the final evaluation. The fixed seed keeps the
# split reproducible. The split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# SVM с использованием GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter grid for the SVM: regularisation strength C and kernel type.
# GridSearchCV evaluates every (C, kernel) combination.
svm_params = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}

svm_model = SVC(random_state=42)

# 5-fold cross-validation on the training split only; the test split is kept
# aside for the final evaluation below.
# NOTE(review): the features are passed to SVC unscaled, and in the recorded
# run the selected model predicts only class 0. Consider wrapping SVC in a
# Pipeline with StandardScaler and re-running the search.
svm_grid = GridSearchCV(svm_model, svm_params, cv=5)
svm_grid.fit(X_train, y_train)

best_svm_params = svm_grid.best_params_
print(f'Best SVM Parameters: {best_svm_params}')

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the whole of X_train with the best parameters. Fitting it again
# would only repeat that work.
best_svm_model = svm_grid.best_estimator_

# Evaluate the selected model on the held-out test split.
svm_predictions = best_svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f'SVM Accuracy: {svm_accuracy}')
print('Classification Report:\n', classification_report(y_test, svm_predictions))


Best SVM Parameters: {'C': 0.1, 'kernel': 'linear'}
SVM Accuracy: 0.853125
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       273
           1       0.00      0.00      0.00        47

    accuracy                           0.85       320
   macro avg       0.43      0.50      0.46       320
weighted avg       0.73      0.85      0.79       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Дерево принятия решений с использованием GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths; None lets the tree grow without a depth limit.
dt_params = {'max_depth': [None, 5, 10, 15, 20]}

dt_model = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation on the training split selects the depth.
dt_grid = GridSearchCV(dt_model, dt_params, cv=5)
dt_grid.fit(X_train, y_train)

best_dt_params = dt_grid.best_params_
print(f'Best Decision Tree Parameters: {best_dt_params}')

# GridSearchCV uses refit=True by default, so best_estimator_ is already
# trained on the whole of X_train with the best depth. No extra fit() is needed.
best_dt_model = dt_grid.best_estimator_

# Evaluate the selected tree on the held-out test split.
dt_predictions = best_dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print(f'Decision Tree Accuracy: {dt_accuracy}')
print('Classification Report:\n', classification_report(y_test, dt_predictions))

Best Decision Tree Parameters: {'max_depth': 5}
Decision Tree Accuracy: 0.86875
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.93       273
           1       0.59      0.34      0.43        47

    accuracy                           0.87       320
   macro avg       0.74      0.65      0.68       320
weighted avg       0.85      0.87      0.85       320



# Случайный лес с использованием GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid over the number of trees and the per-tree depth limit
# (None means no depth limit).
rf_params = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]}

rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validation on the training split over all 12 combinations.
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
rf_grid.fit(X_train, y_train)

best_rf_params = rf_grid.best_params_
print(f'Best Random Forest Parameters: {best_rf_params}')

# GridSearchCV uses refit=True by default, so best_estimator_ is already
# trained on the whole of X_train with the best parameters. Re-fitting the
# forest here would only duplicate that training.
best_rf_model = rf_grid.best_estimator_

# Evaluate the selected forest on the held-out test split.
rf_predictions = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f'Random Forest Accuracy: {rf_accuracy}')
print('Classification Report:\n', classification_report(y_test, rf_predictions))

Best Random Forest Parameters: {'max_depth': None, 'n_estimators': 150}
Random Forest Accuracy: 0.896875
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       273
           1       0.71      0.51      0.59        47

    accuracy                           0.90       320
   macro avg       0.81      0.74      0.77       320
weighted avg       0.89      0.90      0.89       320

