In [89]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [90]:
# Read the wine-quality CSV; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv('winequality-red.csv')
# Empty frame with 'Test_MSE' / 'Train_MSE' columns. Nothing in the visible
# cells fills it in.
MSE_data = pd.DataFrame(dict(Test_MSE=[], Train_MSE=[]))

In [91]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [92]:
# Features: every column except the target and the 'Unnamed: 0' column.
# The preview of `data` shows 'Unnamed: 0' holding 0, 1, 2, ... -- it looks
# like the row index that was written out with the CSV, not a measurement.
# Positional slicing (`iloc[:, :-1]`) kept it, so the row number (and, in the
# polynomial cells, its powers) was being used as a predictor.
# `errors='ignore'` keeps this cell working if the CSV is later read with
# `index_col=0` and the column no longer exists.
X = data.drop(columns=['Unnamed: 0', 'quality'], errors='ignore')
# Target: the quality rating.
y = data['quality']

In [93]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [94]:
# Baseline: ordinary least squares on the untransformed features.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [95]:
# Report the model's score (R^2 for LinearRegression) on the held-out split
# first, then on the training split.
print(
    "Test Score: " + str(lin_reg.score(X_test, y_test)),
    "Train Score: " + str(lin_reg.score(X_train, y_train)),
    sep="\n",
)

Test Score: 0.40318034127962266
Train Score: 0.347992619352986


## 2-degree polynomial

In [96]:
# Degree-2 design matrix: the columns of X followed by their element-wise
# squares, each squared column named '<feature>_2'. No cross terms are added.
X_2 = pd.concat([X, X.pow(2).add_suffix('_2')], axis=1)

In [97]:
# Preview the first five rows of the degree-2 feature frame.
X_2.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,volatile acidity_2,citric acid_2,residual sugar_2,chlorides_2,free sulfur dioxide_2,total sulfur dioxide_2,density_2,pH_2,sulphates_2,alcohol_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.49,0.0,3.61,0.005776,121.0,1156.0,0.995605,12.3201,0.3136,88.36
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0.7744,0.0,6.76,0.009604,625.0,4489.0,0.99361,10.24,0.4624,96.04
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0.5776,0.0016,5.29,0.008464,225.0,2916.0,0.994009,10.6276,0.4225,96.04
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0.0784,0.3136,3.61,0.005625,289.0,3600.0,0.996004,9.9856,0.3364,96.04
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.49,0.0,3.61,0.005776,121.0,1156.0,0.995605,12.3201,0.3136,88.36


In [98]:
# Re-split using the degree-2 features; same 20% hold-out and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_2,
    y,
    test_size=0.2,
    random_state=42,
)

In [99]:
# Linear model fitted on the degree-2 feature frame.
poly_2 = LinearRegression()
poly_2.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [100]:
# Score of the degree-2 model: held-out split first, then training split.
print(
    "Test Score: " + str(poly_2.score(X_test, y_test)),
    "Train Score: " + str(poly_2.score(X_train, y_train)),
    sep="\n",
)

Test Score: 0.4175903681315146
Train Score: 0.3763887043535318


## 3-degree polynomial

In [101]:
# Degree-3 design matrix: everything in X_2 plus the element-wise cubes of
# the columns of X, named '<feature>_3'.
X_3 = pd.concat([X_2, X.pow(3).add_suffix('_3')], axis=1)

In [102]:
# Preview the first five rows of the degree-3 feature frame.
X_3.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,volatile acidity_3,citric acid_3,residual sugar_3,chlorides_3,free sulfur dioxide_3,total sulfur dioxide_3,density_3,pH_3,sulphates_3,alcohol_3
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.343,0.0,6.859,0.000439,1331.0,39304.0,0.993415,43.243551,0.175616,830.584
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0.681472,0.0,17.576,0.000941,15625.0,300763.0,0.990431,32.768,0.314432,941.192
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0.438976,6.4e-05,12.167,0.000779,3375.0,157464.0,0.991027,34.645976,0.274625,941.192
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0.021952,0.175616,6.859,0.000422,4913.0,216000.0,0.994012,31.554496,0.195112,941.192
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.343,0.0,6.859,0.000439,1331.0,39304.0,0.993415,43.243551,0.175616,830.584


In [103]:
# Re-split using the degree-3 features; 20% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_3,
    y,
    test_size=0.2,
    random_state=42,
)

In [104]:
# Linear model fitted on the degree-3 feature frame.
poly_3 = LinearRegression()
poly_3.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [105]:
# Score of the degree-3 model: held-out split first, then training split.
print(
    "Test Score: " + str(poly_3.score(X_test, y_test)),
    "Train Score: " + str(poly_3.score(X_train, y_train)),
    sep="\n",
)

Test Score: 0.43096154576735046
Train Score: 0.3949504674976797


## 4-degree polynomial

In [106]:
# Degree-4 design matrix: everything in X_3 plus the element-wise fourth
# powers of the columns of X, named '<feature>_4'.
X_4 = pd.concat([X_3, X.pow(4).add_suffix('_4')], axis=1)

In [107]:
# Re-split using the degree-4 features; 20% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_4,
    y,
    test_size=0.2,
    random_state=42,
)

In [108]:
# Linear model fitted on the degree-4 feature frame.
poly_4 = LinearRegression()
poly_4.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [109]:
# Score of the degree-4 model: held-out split first, then training split.
print(
    "Test Score: " + str(poly_4.score(X_test, y_test)),
    "Train Score: " + str(poly_4.score(X_train, y_train)),
    sep="\n",
)

Test Score: 0.39076818893469145
Train Score: 0.4048879875469389


## 5-degree polynomial

In [110]:
# Degree-5 design matrix: everything in X_4 plus the element-wise fifth
# powers of the columns of X, named '<feature>_5'.
X_5 = pd.concat([X_4, X.pow(5).add_suffix('_5')], axis=1)

In [111]:
# Re-split using the degree-5 features; 20% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_5,
    y,
    test_size=0.2,
    random_state=42,
)

In [112]:
# Linear model fitted on the degree-5 feature frame.
poly_5 = LinearRegression()
poly_5.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [113]:
# Score of the degree-5 model: held-out split first, then training split.
print(
    "Test Score: " + str(poly_5.score(X_test, y_test)),
    "Train Score: " + str(poly_5.score(X_train, y_train)),
    sep="\n",
)

Test Score: 0.35250440319645027
Train Score: 0.40762837434729515


## 6-degree polynomial

In [114]:
# Degree-6 design matrix: everything in X_5 plus the element-wise sixth
# powers of the columns of X, named '<feature>_6'.
X_6 = pd.concat([X_5, X.pow(6).add_suffix('_6')], axis=1)

In [115]:
# Re-split using the degree-6 features; 20% hold-out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_6,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
# Linear model fitted on the degree-6 feature frame.
poly_6 = LinearRegression()
poly_6.fit(X_train, y_train);  # trailing ';' stops the notebook echoing the estimator

In [117]:
# Score of the degree-6 model: held-out split first, then training split.
print(
    "Test Score: " + str(poly_6.score(X_test, y_test)),
    "Train Score: " + str(poly_6.score(X_train, y_train)),
    sep="\n",
)

Test Score: -0.15389040738297233
Train Score: 0.4086997031773927


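As the degree of the polynomial increases, the score on the training data keeps increasing. On the testing data, the score increases at first but starts to decrease at the fourth-degree polynomial and becomes negative at the sixth degree, which indicates that the higher-degree models are overfitting the training data.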