# 從白葡萄酒所含化學物質的多寡來區分白葡萄酒的品質

In [125]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
mpl.rc('font', family='Noto Sans CJK TC')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [126]:
# UCI "Wine Quality" data set (white wine); the file is semicolon-delimited.
wine_csv_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine = pd.read_csv(wine_csv_url, sep=';')

In [127]:
# Preview the first 10 rows to check that the columns were parsed correctly.
wine.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


# 檢查wine的資料是否有缺值

In [128]:
# One boolean per column: True if that column contains at least one missing value.
wine.isnull().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

# 定x為化學物質的資料，y為品質的資料

In [129]:
# Features: every column except the target. Dropping 'quality' by name avoids
# the positional magic number 11 and stays correct if columns are added or
# reordered upstream.
x = wine.drop('quality', axis=1).values
# Target: the quality score of each wine.
y = wine['quality'].values

# 將資料分成70%訓練資料，30%測試資料

In [130]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=666,
)

# 開始做線性迴歸

In [131]:
# Linear regression model from scikit-learn, constructed with its default settings.
regr = LinearRegression()

In [132]:
# Fit the regression on the training split; fit() returns the estimator itself,
# whose repr is shown as the cell output.
regr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# 預測結果不盡理想(決定係數 R² 僅約 0.2884；score 回傳的是 R²，並非分類準確率)

初步推測可能是由於資料分布並未接近一條直線(特徵與品質之間的關係並非線性)，亦或是x的部分特徵彼此之間具有相關性(共線性)

In [133]:
# Predict a continuous quality score for every row of the test split.
y_pred = regr.predict(x_test)

In [134]:
# Evaluate on the held-out test split. For a scikit-learn regressor, score()
# returns the coefficient of determination R^2, not a classification accuracy.
regr.score(x_test,y_test)

0.28841354216738946

# 由於品質是類別型(factor)的資料，將預測出來的y四捨五入到整數位，以此表示其被分類到的品質

In [135]:
# Round each predicted score to the nearest whole number so it can be read as a
# quality class (np.round sends exact halves to the nearest even value).
y_pred = np.round(y_pred)

In [136]:
# Cast the rounded predictions to integer class labels in one vectorized step.
# The previous comprehension looped with the name `x`, which shadows the feature
# matrix `x` (and would overwrite it under Python 2 list-comprehension scoping).
y_pred = np.asarray(y_pred).astype(int)

In [137]:
# Make sure y_pred is a NumPy ndarray (no copy is made if it already is one).
y_pred = np.asarray(y_pred)

# 以表格的方式呈現分類的結果，其中品質的取值為0~10的整數

首先，定義一個取得index的函數

In [138]:
def get_index(num, xarray):
    """Return the positions in ``xarray`` whose element equals ``num``.

    Parameters
    ----------
    num : value compared against each element with ``==``.
    xarray : iterable of values (list, NumPy array, generator, ...).

    Returns
    -------
    list of int
        Zero-based positions of the matching elements, in ascending order;
        an empty list when nothing matches.
    """
    # enumerate() pairs each element with its position directly, so the input
    # only needs to be iterable (no len() required).
    return [position for position, value in enumerate(xarray) if num == value]

In [139]:
# Confusion table: wine_table[i, j] counts the test samples whose actual quality
# is i and whose predicted quality is j (quality scores are integers 0..10).
n_levels = 11
actual = np.asarray(y_test)
predicted = np.asarray(y_pred)
# Keep only pairs that fall inside the table. As with a per-cell equality scan,
# out-of-range values are simply not counted (instead of raising an IndexError
# or wrapping around through negative indexing).
inside = (
    (actual >= 0) & (actual < n_levels)
    & (predicted >= 0) & (predicted < n_levels)
)
wine_table = np.zeros((n_levels, n_levels))
# One pass over the data. np.add.at accumulates repeated (i, j) pairs, which a
# plain fancy-indexed `+=` would collapse into a single increment.
np.add.at(
    wine_table,
    (actual[inside].astype(int), predicted[inside].astype(int)),
    1,
)

# 縱軸是實際的品質，橫軸是被分類的品質

由表中可知有264筆資料實際品質為5，而被分類為品質6

In [140]:
# Wrap the count table in a DataFrame; column labels 0..10 are the predicted
# quality, and the default row index 0..10 is the actual quality.
dfwine = pd.DataFrame(data=wine_table, columns=range(11))

In [141]:
# Display the confusion table (rows: actual quality, columns: predicted quality).
dfwine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,3.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,27.0,26.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,175.0,264.0,5.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,90.0,493.0,66.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,7.0,186.0,64.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,1.0,34.0,22.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# 由於實際品質僅取值於3~9，我們只截取表格中品質3~9的部分來呈現分類結果

In [142]:
# Keep only positions 3..9 on both axes (the end of the slice is exclusive).
quality_window = slice(3, 10)
dfwine_mark = dfwine.iloc[quality_window, quality_window]

In [143]:
# Display the 3..9 sub-table of the confusion table.
dfwine_mark

Unnamed: 0,3,4,5,6,7,8,9
3,0.0,1.0,3.0,4.0,0.0,0.0,0.0
4,0.0,1.0,27.0,26.0,0.0,0.0,0.0
5,0.0,0.0,175.0,264.0,5.0,0.0,0.0
6,0.0,0.0,90.0,493.0,66.0,0.0,0.0
7,0.0,0.0,7.0,186.0,64.0,0.0,0.0
8,0.0,0.0,1.0,34.0,22.0,0.0,0.0
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# 計算對角線數值之和(分對的個數)除以總個數

# 此分法的準確率有49.864%

In [144]:
# Accuracy = correctly classified samples (the diagonal of the table) divided by
# the number of test samples. Dividing by len(y_test) rather than by the sum of
# the 3..9 sub-table keeps any sample predicted outside that window in the
# denominator, and removes the hard-coded row count 7.
float(np.trace(dfwine_mark.values)) / len(y_test)

0.49863945578231295