### Using *raw* data
First I import the necessary libraries, then load the train and test data, remove the arbitrary index column "Unnamed: 0" from each, and view the first few observations of each set.

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
# Load the training split and drop the arbitrary "Unnamed: 0" index column in one chained expression.
train = pd.read_csv('data/red_wine_train.csv').drop(columns=['Unnamed: 0'])
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality,alcohol_higher,va_high
0,7.0,0.685,0.0,0.067,63.0,0.9979,0.81,9.9,5,0,1
1,8.6,0.685,0.1,0.092,12.0,0.99745,0.65,9.55,6,0,1
2,5.6,0.66,0.0,0.087,11.0,0.99378,0.63,12.8,7,1,1
3,7.7,0.51,0.28,0.087,54.0,0.998,0.74,9.2,5,0,0
4,8.7,0.31,0.46,0.059,25.0,0.9966,0.76,10.1,6,0,0


In [5]:
# Load the test split and drop the arbitrary "Unnamed: 0" index column in one chained expression.
test = pd.read_csv('data/red_wine_test.csv').drop(columns=['Unnamed: 0'])
test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality,alcohol_higher,va_high
0,12.6,0.31,0.72,0.072,29.0,0.9987,0.82,9.8,8,0,0
1,11.8,0.33,0.49,0.093,80.0,1.0002,0.76,10.7,7,0,0
2,7.1,0.875,0.05,0.082,14.0,0.99808,0.52,10.2,3,0,1
3,9.0,0.8,0.12,0.083,28.0,0.99836,0.65,10.4,6,0,1
4,7.9,0.69,0.21,0.08,141.0,0.9962,0.51,9.9,5,0,1


Next I split the train and test sets each into a dataframe containing only the attributes and a matching series containing only the quality column (the class label).

In [6]:
# Features (x_train / x_val): every column except the 'quality' label.
trainAttr, testAttr = (frame.drop(columns='quality') for frame in (train, test))

# Labels (y_train / y_val): the 'quality' column on its own.
trainClass, testClass = train['quality'], test['quality']

Here I create a variable holding the number of attribute columns, which is used as the input size when building the model. Then I build a single-layer model and print its summary. Finally, I use the .compile method to configure the optimizer, loss, and metrics used when training the model.

In [7]:
# One input node per attribute column; kept as a one-element list because it is passed as input_shape.
n_inputs = [len(trainAttr.columns)]

In [8]:
# Baseline network: a single Dense unit fed directly by the attribute columns.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=1, input_shape=n_inputs))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 11        
                                                                 
Total params: 11
Trainable params: 11
Non-trainable params: 0
_________________________________________________________________


In [11]:
# The model has a single output unit trained with mean absolute error, i.e. it
# predicts quality as a continuous value. Track MSE next to the MAE loss so the
# training log carries a second error-based measure.
# NOTE(review): 'accuracy' is a classification metric and is not informative
# for a continuous output; it is kept only so existing history keys do not change.
model.compile(optimizer='adam',
              loss='mae',
              metrics=['accuracy', 'mse'])

Now I am fitting the model.

In [12]:
# First training pass: five epochs on the training attributes and labels, no validation data.
model.fit(x=trainAttr, y=trainClass, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x195f5ca8d30>

In [14]:
# Second training pass on the same model object (it is not rebuilt, so training
# continues from the weights left by the previous fit). The test split is
# supplied as validation data so it is scored after every epoch.
model.fit(
    trainAttr,
    trainClass,
    validation_data=(testAttr, testClass),
    batch_size=1000,
    epochs=5,  # total number of epochs for this pass
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x195f71a80a0>

Now, to deal with the imbalanced data, we oversample: we find the number of observations in the largest class and then sample the other classes with replacement until each has that many observations. To do this we first divide the train dataframe into one dataframe per class value, and then sample from each of those.

In [22]:
# One sub-frame of `train` per quality score, 3 through 8.
qual3, qual4, qual5, qual6, qual7, qual8 = (
    train.loc[train["quality"] == score] for score in range(3, 9)
)

# Rows per class label, used to gauge the imbalance
# (counts noted for this data: 8, 42, 545, 510, 159, 15).
n_qual3, n_qual4, n_qual5, n_qual6, n_qual7, n_qual8 = map(
    len, (qual3, qual4, qual5, qual6, qual7, qual8)
)

# Size of the largest class (545, quality 5); the oversampling target.
n_max = max(n_qual3, n_qual4, n_qual5, n_qual6, n_qual7, n_qual8)


545

In [24]:
# Oversample every minority class with replacement up to n_max rows, so each
# class ends up with as many observations as quality 5 (left untouched here).
# A fixed seed makes the draw repeatable: without it the oversampled training
# set, and every model trained on it, changes on each run.
OVERSAMPLE_SEED = 42
qual3 = qual3.sample(n_max, replace=True, random_state=OVERSAMPLE_SEED)
qual4 = qual4.sample(n_max, replace=True, random_state=OVERSAMPLE_SEED)
qual6 = qual6.sample(n_max, replace=True, random_state=OVERSAMPLE_SEED)
qual7 = qual7.sample(n_max, replace=True, random_state=OVERSAMPLE_SEED)
qual8 = qual8.sample(n_max, replace=True, random_state=OVERSAMPLE_SEED)

In [27]:
# Stack the six per-class frames row-wise into one training frame. Original row
# labels are kept, so rows drawn more than once appear with repeated labels.
train_oversampled = pd.concat(
    objs=(qual3, qual4, qual5, qual6, qual7, qual8),
    axis=0,
)
train_oversampled

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality,alcohol_higher,va_high
1006,10.4,0.44,0.42,0.145,48.0,0.99832,0.86,9.90,3,0,0
660,11.6,0.58,0.66,0.074,47.0,1.00080,0.57,9.00,3,0,0
432,7.3,0.98,0.05,0.061,49.0,0.99705,0.55,9.70,3,0,1
480,6.7,0.76,0.02,0.078,12.0,0.99600,0.63,9.95,3,0,1
1217,10.4,0.61,0.49,0.200,16.0,0.99940,0.63,8.40,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...
931,11.3,0.62,0.67,0.086,19.0,0.99880,0.69,13.40,8,1,1
327,10.7,0.35,0.53,0.070,16.0,0.99720,0.65,11.00,8,0,0
521,9.4,0.30,0.56,0.080,17.0,0.99640,0.92,11.70,8,1,0
327,10.7,0.35,0.53,0.070,16.0,0.99720,0.65,11.00,8,0,0


In [32]:
# Rebuild the attribute/label split, this time from the oversampled training frame.
trainAttr = train_oversampled.drop('quality', axis=1)  # x_train: every column except the label
trainClass = train_oversampled['quality']              # y_train: raw quality scores

# The softmax layer below emits one probability per class, so the targets must be
# contiguous class indices (0..k-1) rather than the raw quality scores.
# To turn a predicted index back into a quality score, use qualityLevels[index].
qualityLevels = sorted(trainClass.unique())
qualityToIndex = {level: idx for idx, level in enumerate(qualityLevels)}
trainClassIdx = trainClass.map(qualityToIndex)

# One hidden ReLU layer followed by a softmax over the quality classes. The input
# width and the number of classes are taken from the data instead of relying on
# values computed in earlier cells.
modelOversampled = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation='relu',
                          input_shape=[trainAttr.shape[1]]),
    tf.keras.layers.Dense(units=len(qualityLevels), activation='softmax'),
])

modelOversampled.summary()

# Sparse categorical cross-entropy pairs integer class indices with a softmax
# output. The previous 'mae' loss compared the probability vector against the raw
# score, which is not a classification objective and left 'accuracy' meaningless.
modelOversampled.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
modelOversampled.fit(trainAttr, trainClassIdx, epochs=5)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                704       
                                                                 
 dense_7 (Dense)             (None, 6)                 390       
                                                                 
Total params: 1,094
Trainable params: 1,094
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x195f850f1c0>