### Import files

In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [2]:
#url = 'https://github.com/gian793/ML/blob/main/Projects/08%20-%20Wine%20Quality/winequality-red.csv'
url_red = 'winequality-red.csv'
url_wht = 'winequality-white.csv'

df  = pd.read_csv(url_red, sep = ';')
dfw = pd.read_csv(url_wht, sep = ';')

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Split data

df_train = df.sample(frac=0.7, random_state=0)
df_valid = df.drop(df_train.index)              # Drop all indexes used in df_train

print(df_train.shape)
print(df_valid.shape)

(1119, 12)
(480, 12)


In [5]:
# Create a list from existing data
col = list(df_train.columns)
print(col)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [6]:
df_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1109,10.8,0.47,0.43,2.1,0.171,27.0,66.0,0.9982,3.17,0.76,10.8,6
1032,8.1,0.82,0.0,4.1,0.095,5.0,14.0,0.99854,3.36,0.53,9.6,5
1002,9.1,0.29,0.33,2.05,0.063,13.0,27.0,0.99516,3.26,0.84,11.7,7
487,10.2,0.645,0.36,1.8,0.053,5.0,14.0,0.9982,3.17,0.42,10.0,6
979,12.2,0.45,0.49,1.4,0.075,3.0,6.0,0.9969,3.13,0.63,10.4,5


In [7]:
tot_train_nan = df_train.isnull().sum()
print(tot_train_nan)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [8]:
tot_valid_nan = df_valid.isnull().sum()
print(tot_valid_nan)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [9]:
max_ = df.max(axis=0)
min_ = df.min(axis=0)
df_train = (df_train - min_) / (max_ - min_)
df_valid = (df_valid - min_) / (max_ - min_)

In [10]:
df_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1109,0.548673,0.239726,0.43,0.082192,0.265442,0.366197,0.212014,0.596916,0.338583,0.257485,0.369231,0.6
1032,0.309735,0.479452,0.0,0.219178,0.138564,0.056338,0.028269,0.62188,0.488189,0.11976,0.184615,0.4
1002,0.39823,0.116438,0.33,0.078767,0.085142,0.169014,0.074205,0.373715,0.409449,0.305389,0.507692,0.8
487,0.495575,0.359589,0.36,0.061644,0.068447,0.056338,0.028269,0.596916,0.338583,0.053892,0.246154,0.6
979,0.672566,0.226027,0.49,0.034247,0.105175,0.028169,0.0,0.501468,0.307087,0.179641,0.307692,0.4


In [11]:
df_valid.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846,0.4
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385,0.6
7,0.238938,0.363014,0.0,0.020548,0.088481,0.197183,0.053004,0.332599,0.511811,0.083832,0.246154,0.8
11,0.256637,0.260274,0.36,0.356164,0.098497,0.225352,0.339223,0.567548,0.480315,0.281437,0.323077,0.4
23,0.345133,0.253425,0.11,0.09589,0.1202,0.112676,0.215548,0.494126,0.338583,0.11976,0.153846,0.4


In [12]:
df_train_Y = df_train.quality
df_train_X = df_train.drop(['quality'], axis=1)

df_valid_Y = df_valid.quality
df_valid_X = df_valid.drop(['quality'], axis=1)

df_test_Y = dfw.quality
df_test_X = dfw.drop(['quality'], axis=1)

In [13]:
df_train_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1109,0.548673,0.239726,0.43,0.082192,0.265442,0.366197,0.212014,0.596916,0.338583,0.257485,0.369231
1032,0.309735,0.479452,0.0,0.219178,0.138564,0.056338,0.028269,0.62188,0.488189,0.11976,0.184615
1002,0.39823,0.116438,0.33,0.078767,0.085142,0.169014,0.074205,0.373715,0.409449,0.305389,0.507692
487,0.495575,0.359589,0.36,0.061644,0.068447,0.056338,0.028269,0.596916,0.338583,0.053892,0.246154
979,0.672566,0.226027,0.49,0.034247,0.105175,0.028169,0.0,0.501468,0.307087,0.179641,0.307692


In [14]:
df_train_Y.head()

1109    0.6
1032    0.4
1002    0.8
487     0.6
979     0.4
Name: quality, dtype: float64

In [21]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

my_model = DecisionTreeRegressor(max_leaf_nodes=12, random_state=1)

# Fit model
my_model.fit(df_train_X, df_train_Y)

# Predict
preds_val = my_model.predict(df_valid_X)

mae = mean_absolute_error(df_valid_Y, preds_val)
print(mae)

0.10428836441437973


In [22]:
# preds_val.shape
len(df_valid_Y)

480

In [23]:
pred = np.around(preds_val, decimals=1, out=None)

correct_cnt = sum(df_valid_Y == pred)

#print(correct_cnt)
print("Percentage correct: " + str(100*correct_cnt/len(df_valid_X)))
#print(preds_val)

Percentage correct: 41.041666666666664
