# 🍷 Wine Quality Prediction using Multiple Linear Regression
## Goal: Predict wine quality (score 0–10) based on physicochemical features.


"""
The Wine Quality dataset used in this project is based on:

P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis (2009).
"Modeling wine preferences by data mining from physicochemical properties".
Decision Support Systems, 47(4):547-553.
https://doi.org/10.1016/j.dss.2009.05.016

Data obtained from the UCI Machine Learning Repository:
https://archive.ics.uci.edu/ml/datasets/wine+quality
"""


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import display

# Emit a confirmation line once every import above has succeeded.
print("Importations complete.")

Importations complete.


## Load Data

In [3]:
# White-wine samples: the file is semicolon-delimited rather than comma-delimited.
df_wqw = pd.read_csv('winequality-white.csv', delimiter=';')
# Quick schema check: column names, dtypes and non-null counts.
df_wqw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [4]:
# Red-wine samples: the file is semicolon-delimited rather than comma-delimited.
df_wqr = pd.read_csv('winequality-red.csv', delimiter=';')
# Quick schema check: column names, dtypes and non-null counts.
df_wqr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [9]:
# Peek at 10 random white-wine rows. The fixed random_state makes the same
# rows appear on every Restart & Run All, so the displayed table is reproducible.
df_wqw.sample(10, random_state=42)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1231,7.9,0.41,0.37,4.5,0.03,40.0,114.0,0.992,3.17,0.54,12.4,7
3566,7.6,0.38,0.28,4.2,0.029,7.0,112.0,0.9906,3.0,0.41,12.6,6
3625,7.7,0.32,0.61,11.8,0.041,66.0,188.0,0.99794,3.0,0.54,9.3,5
4716,5.5,0.315,0.38,2.6,0.033,10.0,69.0,0.9909,3.12,0.59,10.8,6
4198,7.9,0.36,0.53,12.9,0.049,63.0,139.0,0.99792,2.94,0.45,9.1,5
3024,7.0,0.13,0.37,12.85,0.042,36.0,105.0,0.99581,3.05,0.55,10.7,6
2631,6.7,0.28,0.28,2.4,0.012,36.0,100.0,0.99064,3.26,0.39,11.7,7
1537,8.1,0.28,0.49,1.0,0.04,32.0,148.0,0.9936,3.13,0.41,10.0,6
3214,5.2,0.335,0.2,1.7,0.033,17.0,74.0,0.99002,3.34,0.48,12.3,6
4691,6.9,0.19,0.31,19.25,0.043,38.0,167.0,0.99954,2.93,0.52,9.1,7


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# of the white-wine frame, including the `quality` target.
df_wqw.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0
