# Оценка качества вина
## Крюков Дмитрий Алексеевич

### 0. Описание задачи

Имеется датасет, содержащий физико-химические показатели некоторых вин. По имеющимся данным требуется предсказать оценку качества вина, выставленную специалистами.

### 1. Загрузка данных

In [44]:
import numpy as np              
import matplotlib.pyplot as plt
%matplotlib inline 
import pandas as pd
import seaborn as sns
import sklearn

In [45]:
# Path to the wine-quality CSV, relative to the notebook's working directory.
data_path = "./winequalityN.csv"
# Load the whole file into a DataFrame; the cleaned table is derived from it later.
data_raw = pd.read_csv(data_path)

### 2. Визуализация данных

In [46]:
# (rows, columns) of the raw table.
data_raw.shape

(6497, 13)

Датасет содержит 6497 записей, 13 признаков

In [47]:
# Preview the first five rows of the raw table.
data_raw.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Значение признаков таблицы

1. type
Тип вина (белое или красное)

2. fixed acidity  
Фиксированная кислотность вина, обусловленная наличием неиспаряемых кислот  

3. volatile acidity  
Количество испаряемых кислот в вине  

4. citric acid  
Количество лимонной кислоты в вине  

5. residual sugar  
Количество сахара, оставшегося после прекращения брожения  

6. chlorides  
Количество соли в вине  

7. free sulfur dioxide  
Свободная форма SO2, находится в равновесии между молекулярной формой и формой бисульфит-иона  

8. total sulfur dioxide  
Количество свободных и связанных форм SO2  

9. density  
Плотность  

10. pH  
Описывает, насколько кислое или щелочное вино по шкале от 0 до 14  

11. sulphates  
Содержание сульфатов  

12. alcohol  
Процентное содержание алкоголя в вине  

13. quality  
Выходная оценка качества вина

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_raw.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6487.0,6489.0,6494.0,6495.0,6495.0,6497.0,6497.0,6497.0,6488.0,6493.0,6497.0,6497.0
mean,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378
std,1.29675,0.164649,0.145265,4.758125,0.035036,17.7494,56.521855,0.002999,0.160748,0.148814,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


* Количество значений для разных столбцов различно: в таблице имеются отсутствующие значения
* Среднее качество вина 5.8, стандартное отклонение 0.87. Большая часть значений находится в диапазоне от 5 до 7
* Стандартное отклонение параметра плотность равно 0.002999, значения отличаются несущественно

### 3. Обработка пропущенных значений

In [49]:
# Number of missing values in each column.
data_raw.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

В наборе данных имеются отсутствующие значения, однако их количество крайне мало. Удаление строк с отсутствующими значениями практически не повлияет на конечную эффективность модели, так что наиболее разумно будет отбросить их

In [50]:
# Drop every row that has at least one missing value; data_raw itself is left unchanged.
data_nonan = data_raw.dropna()

In [51]:
# (rows, columns) after dropping the incomplete rows.
data_nonan.shape

(6463, 13)

Отброшено 34 значения

In [52]:
# Sanity check: every per-column missing-value count should now be zero.
data_nonan.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### 4. Обработка категориальных признаков

В наборе данных имеется всего один категориальный признак - тип вина.
Данный бинарный признак заменим на числовой

In [53]:
# Work on a copy so the cleaned table (data_nonan) keeps the original string labels.
data = data_nonan.copy()
# factorize returns (integer codes, unique labels); codes are assigned in order of
# first appearance, so here 'white' becomes 0 and 'red' becomes 1.
type_codes, _type_labels = pd.factorize(data_nonan['type'])
data['type'] = type_codes

In [54]:
# Show the table with the 'type' column encoded as integers.
data

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6491,1,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
6492,1,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6494,1,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,1,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


### 5. Нормализация

Диапазон значений данных довольно широк, следовательно, имеет смысл нормализовать значения признаков

Воспользуемся приведением к отрезку $[0,1]$

In [65]:
# Min-max scale the continuous features to [0, 1]. The already-binary 'type'
# column and the target 'quality' are excluded from scaling.
# NOTE(review): min/max are computed over the whole table, before the
# train/test split, so test rows influence the scaling — consider fitting the
# scaler on the training part only.
to_normalize = data.drop(columns=['type', 'quality'])
feature_min = to_normalize.min()
feature_range = to_normalize.max() - feature_min
normalized_data = (to_normalize - feature_min) / feature_range

In [66]:
# Show the scaled feature table.
normalized_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.264463,0.126667,0.216867,0.308282,0.059801,0.152778,0.377880,0.267785,0.217054,0.129213,0.115942
1,0.206612,0.146667,0.204819,0.015337,0.066445,0.045139,0.290323,0.132832,0.449612,0.151685,0.217391
2,0.355372,0.133333,0.240964,0.096626,0.068106,0.100694,0.209677,0.154039,0.418605,0.123596,0.304348
3,0.280992,0.100000,0.192771,0.121166,0.081395,0.159722,0.414747,0.163678,0.364341,0.101124,0.275362
4,0.280992,0.100000,0.192771,0.121166,0.081395,0.159722,0.414747,0.163678,0.364341,0.101124,0.275362
...,...,...,...,...,...,...,...,...,...,...,...
6491,0.247934,0.360000,0.048193,0.019939,0.098007,0.093750,0.073733,0.181222,0.542636,0.337079,0.217391
6492,0.198347,0.346667,0.048193,0.021472,0.134551,0.107639,0.087558,0.150183,0.565891,0.202247,0.362319
6494,0.206612,0.286667,0.078313,0.026074,0.111296,0.097222,0.078341,0.166377,0.542636,0.297753,0.434783
6495,0.173554,0.376667,0.072289,0.021472,0.109635,0.107639,0.087558,0.161172,0.658915,0.275281,0.318841


In [67]:
# Summary statistics of the scaled features; each column should have min 0 and max 1.
normalized_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0,6463.0
mean,0.282459,0.173059,0.192023,0.074294,0.078166,0.102489,0.252752,0.146283,0.386304,0.174803,0.361279
std,0.107266,0.109759,0.087501,0.072958,0.058265,0.061663,0.130246,0.057865,0.124535,0.083659,0.172917
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.214876,0.1,0.150602,0.018405,0.048173,0.055556,0.163594,0.100636,0.302326,0.117978,0.217391
50%,0.264463,0.14,0.186747,0.03681,0.063123,0.097222,0.258065,0.14999,0.379845,0.162921,0.333333
75%,0.322314,0.213333,0.23494,0.115031,0.093023,0.138889,0.345622,0.190669,0.465116,0.213483,0.478261
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 6. Разбиение данных на обучающую и тестовую выборки

In [83]:
# Preview: scaled features with the encoded 'type' column appended (result is not stored here).
normalized_data.join(data['type'])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0.264463,0.126667,0.216867,0.308282,0.059801,0.152778,0.377880,0.267785,0.217054,0.129213,0.115942,0
1,0.206612,0.146667,0.204819,0.015337,0.066445,0.045139,0.290323,0.132832,0.449612,0.151685,0.217391,0
2,0.355372,0.133333,0.240964,0.096626,0.068106,0.100694,0.209677,0.154039,0.418605,0.123596,0.304348,0
3,0.280992,0.100000,0.192771,0.121166,0.081395,0.159722,0.414747,0.163678,0.364341,0.101124,0.275362,0
4,0.280992,0.100000,0.192771,0.121166,0.081395,0.159722,0.414747,0.163678,0.364341,0.101124,0.275362,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6491,0.247934,0.360000,0.048193,0.019939,0.098007,0.093750,0.073733,0.181222,0.542636,0.337079,0.217391,1
6492,0.198347,0.346667,0.048193,0.021472,0.134551,0.107639,0.087558,0.150183,0.565891,0.202247,0.362319,1
6494,0.206612,0.286667,0.078313,0.026074,0.111296,0.097222,0.078341,0.166377,0.542636,0.297753,0.434783,1
6495,0.173554,0.376667,0.072289,0.021472,0.109635,0.107639,0.087558,0.161172,0.658915,0.275281,0.318841,1


In [85]:
# Feature matrix: the scaled columns plus the encoded 'type' column, aligned on the row index.
X = normalized_data.join(data['type'])
# Target vector: the quality grade of each wine.
y = data['quality']

In [86]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 69)

In [87]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((4847, 12), (1616, 12))

### 7. Классификатор ближайших соседей

In [136]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 4; each quality grade is treated as a separate class.
knn = KNeighborsClassifier(n_neighbors = 4)
# Fit on the training split only; as the last expression, the fitted estimator is echoed by the cell.
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

### 8. Ошибки на обучающей и тестовой выборках.

In [137]:
# Misclassification rate = share of samples whose predicted grade differs
# from the true grade, computed separately for the two splits.
train_pred = knn.predict(X_train)
test_pred = knn.predict(X_test)

err_train = np.mean(y_train != train_pred)
err_test = np.mean(y_test != test_pred)

print(err_train, err_test)

0.2820301217247782 0.41398514851485146


In [146]:
from sklearn.metrics import confusion_matrix

# Take the label set from the full target instead of hard-coding '3'..'9':
# passing it via labels= fixes the matrix shape, so the DataFrame index/columns
# always match even if a grade is absent from the test split and predictions.
quality_labels = np.unique(y)
label_names = [str(label) for label in quality_labels]

# Rows are the true grades, columns are the predicted grades.
conf_matrix_baseline = pd.DataFrame(
    confusion_matrix(y_test, knn.predict(X_test), labels=quality_labels),
    index=label_names,
    columns=label_names,
)
display(conf_matrix_baseline)

Unnamed: 0,3,4,5,6,7,8,9
3,0,2,7,3,0,0,0
4,0,8,27,19,2,0,0
5,0,8,380,133,8,3,0
6,1,6,183,434,64,3,0
7,0,0,19,137,121,4,0
8,0,0,2,22,15,4,0
9,0,0,0,1,0,0,0


### 9. Общие выводы
Точность модели при классификации на 7 классов (оценки от 3 до 9) составляет около 0.6. Классификатор чаще всего ошибается при классификации значений качества 5, 6, 7. Это является ожидаемым результатом, так как оценка качества вина производилась людьми и не является в абсолютной степени объективной. Попадание объекта в одну из двух соседних категорий часто может зависеть от внешних факторов и не может быть учтено в модели. Также по матрице ошибок видно, что при объединении категорий качества вина в три категории (например: 1-4, 5-7, 8-9) можно повысить частоту угадываний до 0.8-0.9. Также имеет смысл рассмотреть задачу как задачу регрессии.