In [1]:
import pandas as pd
import torch


In [2]:
import sys
import os

# The repository root is taken to be the parent of the current working
# directory (i.e. the notebook is run from a direct subdirectory of the repo).
# Change the relative hop below if the notebook lives somewhere else.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the root at the front of the import search path, but only once,
# so re-running this cell does not pile up duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [3]:
# Load the red-wine table; the file uses ';' as its field separator.
red_wine_df = pd.read_csv('../resources/data/wine-quality/winequality-red.csv', delimiter=';')

In [4]:
# Load the white-wine table; the file uses ';' as its field separator.
white_wine_df = pd.read_csv('../resources/data/wine-quality/winequality-white.csv', delimiter=';')

In [5]:
# Preview the red-wine frame (the rendering abbreviates the middle rows).
red_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [6]:
# Preview the white-wine frame (the rendering abbreviates the middle rows).
white_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [7]:
# Build the working frame. Only the white-wine table is passed to concat, so
# red_wine_df (loaded earlier) is not part of wine_df.
# NOTE(review): confirm that leaving the red-wine rows out is intentional.
wine_df = pd.concat([white_wine_df])

In [8]:
# Convert every column (features and the trailing quality column alike) into a
# single float32 NumPy matrix with one row per sample.
wine_arr = wine_df.to_numpy(dtype='float32')

In [9]:
# Sanity check: (number of samples, number of columns).
wine_arr.shape

(4898, 12)

In [10]:
# Shuffle the rows with a dedicated, seeded generator so that the row order --
# and therefore the train/validation split derived from it -- is identical on
# every Restart & Run All. A private generator is used so that the global
# PyTorch RNG state is left untouched.
SHUFFLE_SEED = 0
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)

wine_t = torch.from_numpy(wine_arr)
row_order = torch.randperm(wine_t.shape[0], generator=shuffle_generator)
wine_t_shuffle = wine_t[row_order]

# Show the first three shuffled rows.
wine_t_shuffle[0:3]

tensor([[8.0000e+00, 1.8000e-01, 3.7000e-01, 1.3000e+00, 4.0000e-02, 1.5000e+01,
         9.6000e+01, 9.9120e-01, 3.0600e+00, 6.1000e-01, 1.2100e+01, 6.0000e+00],
        [7.1000e+00, 2.1000e-01, 7.2000e-01, 1.6000e+00, 1.6700e-01, 6.5000e+01,
         1.2000e+02, 9.9324e-01, 2.9700e+00, 5.1000e-01, 9.2000e+00, 5.0000e+00],
        [7.0000e+00, 1.6000e-01, 7.3000e-01, 1.0000e+00, 1.3800e-01, 5.8000e+01,
         1.5000e+02, 9.9360e-01, 3.0800e+00, 3.0000e-01, 9.2000e+00, 5.0000e+00]])


In [11]:
# Every column except the last is an input feature.
X_origin = wine_t_shuffle[:, :-1]
# The last column is the quality score; cast it to int64 so it can be used as
# a class index.
t_origin = wine_t_shuffle[:, -1].long()
X_origin.shape, t_origin.shape

(torch.Size([4898, 11]), torch.Size([4898]))

In [12]:
# One-hot encode the integer quality labels into 10 columns (class indices
# 0-9): each row becomes a float vector with a single 1.0 at the label index.
t_oneshot = torch.nn.functional.one_hot(t_origin, num_classes=10).float()
t_oneshot[0:3]

tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [13]:
# Sanity check: (number of samples, number of one-hot columns).
t_oneshot.shape

torch.Size([4898, 10])

In [14]:
# Hold out the last `val_size` rows for validation and train on the rest.
val_size = 200
X_train_origin = X_origin[:-val_size]
X_test_origin = X_origin[-val_size:]
t_train = t_oneshot[:-val_size]
t_test = t_oneshot[-val_size:]

# Per-feature mean and variance are estimated on the training rows only, and
# the same statistics are then applied to the held-out rows.
X_mean = X_train_origin.mean(dim=0)
X_var = X_train_origin.var(dim=0)

# Standardise: subtract the mean, divide by the standard deviation.
X_train = (X_train_origin - X_mean) / X_var.sqrt()
X_test = (X_test_origin - X_mean) / X_var.sqrt()
X_train.shape, X_test.shape, t_train.shape, t_test.shape

(torch.Size([4698, 11]),
 torch.Size([200, 11]),
 torch.Size([4698, 10]),
 torch.Size([200, 10]))

In [15]:
from src.wine_quality.dataset import WineDataset
# Wrap the standardised features and one-hot targets in the project's
# WineDataset (defined in src/wine_quality/dataset; presumably yields
# (features, target) pairs -- verify against that module).
training_dataset = WineDataset(X_train, t_train)
# The held-out split (named *_test above) serves as the validation set.
validation_dataset = WineDataset(X_test, t_test)

In [16]:
# Both loaders serve mini-batches of 100 rows. Only the training loader
# reshuffles its data; the validation loader keeps a fixed order.
training_loader = torch.utils.data.DataLoader(
    dataset=training_dataset,
    shuffle=True,
    batch_size=100,
)
validation_loader = torch.utils.data.DataLoader(
    dataset=validation_dataset,
    shuffle=False,
    batch_size=100,
)

In [17]:
from src.wine_quality.models import WineModel
# Instantiate the project's WineModel (src/wine_quality/models) with its
# default constructor arguments.
model = WineModel()

In [18]:
# Cross-entropy criterion; the training and validation loops call it as
# loss_func(model_outputs, targets).
loss_func = torch.nn.CrossEntropyLoss()

In [19]:
# Adam over all of the model's parameters with a learning rate of 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [20]:
def train_one_epoch(epoch, training_dataloader, model, loss_func, optimizer):
    """Run one optimisation pass over ``training_dataloader``.

    For every batch the gradients are reset, the model output is scored with
    ``loss_func``, the loss is back-propagated and ``optimizer`` takes a step.

    Args:
        epoch: Index of the current epoch as supplied by the caller. It does
            not influence the computation; the parameter is kept so existing
            call sites keep working.
        training_dataloader: Iterable yielding ``(X_batch, t_batch)`` pairs.
        model: Callable mapping ``X_batch`` to the outputs handed to
            ``loss_func``.
        loss_func: Callable ``loss_func(outputs, t_batch)`` returning a scalar
            loss that supports ``backward()`` and ``item()``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.

    Returns:
        float: Mean of the per-batch loss values. Every batch carries the same
        weight, regardless of how many rows it contains.

    Raises:
        ValueError: If ``training_dataloader`` yields no batches (previously
            this surfaced as an obscure ``UnboundLocalError``).
    """
    training_loss = 0.0
    batch_count = 0

    for X_batch, t_batch in training_dataloader:
        # Gradients accumulate by default, so clear them before each batch.
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_func(outputs, t_batch)

        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float, so no graph is kept alive.
        training_loss += loss.item()
        batch_count += 1

    if batch_count == 0:
        raise ValueError('training_dataloader yielded no batches')

    return training_loss / batch_count
    

In [21]:
epoch_number = 0

EPOCHS = 600


for epoch in range(EPOCHS):
    # One-based counter, used only for reporting.
    epoch_number = epoch + 1

    # Switch the model to training mode (this affects layers such as dropout
    # and batch normalization), then do one pass over the training data.
    model.train(True)
    avg_loss = train_one_epoch(epoch, training_loader, model, loss_func, optimizer)

    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    running_vloss = 0.0
    vbatch_count = 0

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vinputs, vlabels in validation_loader:
            voutputs = model(vinputs)
            vloss = loss_func(voutputs, vlabels)
            # Accumulate as a Python float, matching how the training loss is
            # accumulated in train_one_epoch.
            running_vloss += vloss.item()
            vbatch_count += 1

    # Count batches explicitly instead of relying on a loop index that would
    # be undefined (or stale from an earlier run) if the loader were empty.
    avg_vloss = running_vloss / vbatch_count if vbatch_count else float('nan')

    # Report every 10th epoch to keep the log short.
    if epoch_number % 10 == 0:
        print('EPOCH {}:'.format(epoch_number), 'LOSS train {} valid {}'.format(avg_loss, avg_vloss))


    

EPOCH 10: LOSS train 1.2192184227578184 valid 1.1925630569458008
EPOCH 20: LOSS train 1.155239536407146 valid 1.1396484375
EPOCH 30: LOSS train 1.1197881635199203 valid 1.1259269714355469
EPOCH 40: LOSS train 1.104311835258565 valid 1.1199829578399658
EPOCH 50: LOSS train 1.1048440869818343 valid 1.111734390258789
EPOCH 60: LOSS train 1.0921297872320135 valid 1.1063809394836426
EPOCH 70: LOSS train 1.0943889326237617 valid 1.0977857112884521
EPOCH 80: LOSS train 1.0890324065025816 valid 1.0969563722610474
EPOCH 90: LOSS train 1.0848186497992658 valid 1.0889835357666016
EPOCH 100: LOSS train 1.0796928722807702 valid 1.0896824598312378
EPOCH 110: LOSS train 1.0748795740147854 valid 1.0835062265396118
EPOCH 120: LOSS train 1.0743773135733097 valid 1.0829788446426392
EPOCH 130: LOSS train 1.0683922729593642 valid 1.075941562652588
EPOCH 140: LOSS train 1.0708321447068072 valid 1.075361728668213
EPOCH 150: LOSS train 1.0600633240760642 valid 1.0750136375427246
EPOCH 160: LOSS train 1.066622