In [2]:
import pandas as pd
import numpy as np
 
wine = pd.read_csv("winequality-red.csv", sep=";")
wine.head

<bound method NDFrame.head of       fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
5               7.4             0.660         0.00             1.8      0.075   
6               7.9             0.600         0.06             1.6      0.069   
7               7.3             0.650         0.00             1.2      0.065   
8               7.8             0.580         0.02             2.0      0.073   
9               7.5             0.500         0.36             6.1      0.071   
10              6.7             0.580         0.08             1.8      0.097  

In [8]:
# http://pythondatascience.plavox.info/scikit-learn/%E7%B7%9A%E5%BD%A2%E5%9B%9E%E5%B8%B0
# sklearn.linear_model.LinearRegression クラスを読み込み
from sklearn import linear_model
clf = linear_model.LinearRegression()
 
# 説明変数に "density (濃度)" を利用
X = wine.loc[:, ['density']].as_matrix()
 
# 目的変数に "alcohol (アルコール度数)" を利用
Y = wine['alcohol'].as_matrix()
 
# 予測モデルを作成
clf.fit(X, Y)
 
# 回帰係数
print(clf.coef_)
 
# 切片 (誤差)
print(clf.intercept_)
 
# 決定係数
print(clf.score(X, Y))


# res [alcohol] = -280.16382307 × [density] + 289.675343383

[-280.16382307]
289.675343383
0.246194364397


In [9]:
# matplotlib パッケージを読み込み
import matplotlib.pyplot as plt
 
# 散布図
plt.scatter(X, Y)
 
# 回帰直線
plt.plot(X, clf.predict(X))

[<matplotlib.lines.Line2D at 0x7f5994214fd0>]

In [10]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
 
# 説明変数に "quality (品質スコア以外すべて)" を利用
wine_except_quality = wine.drop("quality", axis=1)
X = wine_except_quality.as_matrix()
 
# 目的変数に "quality (品質スコア)" を利用
Y = wine['quality'].as_matrix()
 
# 予測モデルを作成
clf.fit(X, Y)
 
# 偏回帰係数
print(pd.DataFrame({"Name":wine_except_quality.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 (誤差)
print(clf.intercept_)

    Coefficients                  Name
7     -17.881164               density
4      -1.874225             chlorides
1      -1.083590      volatile acidity
8      -0.413653                    pH
2      -0.182564           citric acid
6      -0.003265  total sulfur dioxide
5       0.004361   free sulfur dioxide
3       0.016331        residual sugar
0       0.024991         fixed acidity
10      0.276198               alcohol
9       0.916334             sulphates
21.9652084495


In [12]:
#[quality] = -17.881164 × [density] + -1.874225 × [chlorides] +
#            -1.083590 × [volatile acidity] + -0.413653 × [pH] + 
#            -0.182564 × [citric acid] + -0.003265 × [total sulfur dioxide] + 
#            0.004361 × [free sulfur dioxide] + 0.016331 × [residual sugar] + 
#            0.024991 × [fixed acidity] + 0.276198 × [alcohol] + 
#            0.916334 × [sulphates] + 21.9652084495

In [13]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
 
# データフレームの各列を正規化
wine2 = wine.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
wine2.head()
 
# 説明変数に "quality (品質スコア以外すべて)" を利用
wine2_except_quality = wine2.drop("quality", axis=1)
X = wine2_except_quality.as_matrix()
 
# 目的変数に "quality (品質スコア)" を利用
Y = wine2['quality'].as_matrix()
 
# 予測モデルを作成
clf.fit(X, Y)
 
# 偏回帰係数
print(pd.DataFrame({"Name":wine2_except_quality.columns,
                    "Coefficients":np.abs(clf.coef_)}).sort_values(by='Coefficients') )
 
# 切片 (誤差)
print(clf.intercept_)

    Coefficients                  Name
2       0.036513           citric acid
3       0.047687        residual sugar
7       0.048708               density
0       0.056479         fixed acidity
5       0.061931   free sulfur dioxide
8       0.105068                    pH
6       0.184775  total sulfur dioxide
4       0.224532             chlorides
9       0.306056             sulphates
1       0.316408      volatile acidity
10      0.359057               alcohol
1.27015586565e-15
