<a href="https://colab.research.google.com/github/gvigkannan/Model_ADay/blob/Keras/WineRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [129]:
## Sample Regression Notebook
## Adapted from: https://towardsdatascience.com/pytorch-tabular-regression-428e9c9ac93
## By Vignesh Kannan

## Loading Libraries


In [130]:
## Standard Library:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from time import time
import copy

In [131]:
## Torch Related
import torch, torchvision, torchsummary
import torch.nn as nn 
import torch.optim as optim 
from torch.utils.data import Dataset, DataLoader

In [180]:
## Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [181]:
## Hyperparams!
EPOCHS = 150
BATCH_SIZE = 32
LEARNING_RATE = 0.001


## Reading and Preprocessing

In [182]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv


--2021-04-17 00:24:52--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [application/x-httpd-php]
Saving to: ‘winequality-red.csv.6’


2021-04-17 00:24:52 (780 KB/s) - ‘winequality-red.csv.6’ saved [84199/84199]



In [183]:
df = pd.read_csv("/content/winequality-red.csv", sep = ';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [184]:
X = df.drop(columns = 'quality')
y = df['quality']

## Splitting into train, val, test
trainX, testX, trainY, testY = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
trainX, valX, trainY, valY = train_test_split(trainX, trainY, test_size = 0.2, stratify = trainY, random_state = 42)

## Normalizing Data
scaler = MinMaxScaler()
trainX = scaler.fit_transform(trainX)
valX = scaler.transform(valX)
testX = scaler.transform(testX)

trainX, trainY = np.array(trainX), np.array(trainY).astype(float)
testX, testY = np.array(testX), np.array(testY).astype(float)
valX, valY = np.array(valX), np.array(valY).astype(float)

In [185]:
def get_class_distribution(obj):
    count_dict = {
        "rating_3": 0,
        "rating_4": 0,
        "rating_5": 0,
        "rating_6": 0,
        "rating_7": 0,
        "rating_8": 0,
    }
    
    for i in obj:
        if i == 3: 
            count_dict['rating_3'] += 1
        elif i == 4: 
            count_dict['rating_4'] += 1
        elif i == 5: 
            count_dict['rating_5'] += 1
        elif i == 6: 
            count_dict['rating_6'] += 1
        elif i == 7: 
            count_dict['rating_7'] += 1  
        elif i == 8: 
            count_dict['rating_8'] += 1              
        else:
            print("Check classes.")
            
    return count_dict

In [186]:
## Initializing the dataset
class RegressionDataset(Dataset):

    def __init__(self, dataX, dataY):
      self.dataX = dataX
      self.dataY = dataY

    def __getitem__(self, index):
      return self.dataX[index], self.dataY[index]

    def __len__(self):
      return len(self.dataX)


In [187]:
train_dataset = RegressionDataset(torch.from_numpy(trainX).float(), 
                                  torch.from_numpy(trainY).float())

test_dataset = RegressionDataset(torch.from_numpy(testX).float(), 
                                  torch.from_numpy(testY).float())

val_dataset = RegressionDataset(torch.from_numpy(valX).float(), 
                                  torch.from_numpy(valY).float())



In [188]:
NUM_FEATURES = len(X.columns)

In [189]:
## Initialize Dataloader:
train_dl = DataLoader(dataset = train_dataset, 
                      batch_size = BATCH_SIZE, 
                      shuffle = True)

test_dl = DataLoader(dataset = test_dataset,
                     batch_size = 1)

val_dl = DataLoader(dataset = val_dataset,
                     batch_size = 1)

In [190]:
torch.from_numpy(trainX).float()

tensor([[0.3274, 0.5714, 0.2900,  ..., 0.6121, 0.1801, 0.4769],
        [0.5841, 0.4911, 0.5500,  ..., 0.3707, 0.1988, 0.1692],
        [0.3274, 0.8036, 0.0200,  ..., 0.6379, 0.0621, 0.4000],
        ...,
        [0.4956, 0.2589, 0.4300,  ..., 0.3621, 0.1739, 0.3692],
        [0.5929, 0.4464, 0.6700,  ..., 0.4138, 0.1863, 0.7692],
        [0.2124, 0.3750, 0.0000,  ..., 0.5603, 0.2795, 0.4615]])

In [191]:
next(iter(train_dl))

[tensor([[0.3363, 0.2232, 0.4300, 0.0966, 0.0851, 0.1642, 0.0426, 0.3987, 0.3707,
          0.2609, 0.4308],
         [0.2832, 0.6250, 0.2900, 0.2345, 0.1185, 0.2985, 0.2021, 0.4662, 0.3621,
          0.0870, 0.1538],
         [0.1770, 0.2411, 0.4900, 0.0552, 0.0968, 0.3284, 0.5035, 0.1564, 0.3276,
          0.0683, 0.4769],
         [0.3894, 0.5089, 0.0000, 0.1034, 0.1269, 0.2687, 0.1099, 0.6557, 0.5259,
          0.1304, 0.1385],
         [0.3717, 0.4286, 0.2900, 0.0897, 0.1436, 0.0597, 0.0284, 0.6410, 0.5345,
          0.0621, 0.1077],
         [0.4336, 0.4018, 0.2700, 0.0966, 0.1169, 0.3284, 0.4858, 0.5690, 0.4569,
          0.0994, 0.1538],
         [0.2212, 0.6741, 0.0500, 0.3310, 0.1169, 0.0299, 0.0248, 0.5881, 0.5690,
          0.0807, 0.2769],
         [0.2478, 0.5938, 0.1900, 0.2966, 0.1369, 0.2687, 0.3227, 0.5184, 0.3621,
          0.0807, 0.1846],
         [0.6283, 0.1429, 0.4700, 0.0552, 0.0701, 0.2388, 0.0887, 0.4985, 0.3534,
          0.1739, 0.3385],
         [0.2124, 0

## Model Related

In [192]:
class RegressionModel(nn.Module):
  def __init__(self, num_features):
    super(RegressionModel, self).__init__()

    self.layer1 = nn.Linear(num_features, 16)
    self.layer2 = nn.Linear(16, 32)
    self.layer3 = nn.Linear(32, 16)
    self.out = nn.Linear(16, 1)

    self.relu = nn.ReLU()

  def forward(self, inputs):
    x = self.relu(self.layer1(inputs))
    x = self.relu(self.layer2(x))
    x = self.relu(self.layer3(x))
    x = self.out(x)

    return x

  def predict(self, inputs):
    x = self.relu(self.layer1(inputs))
    x = self.relu(self.layer2(x))
    x = self.relu(self.layer3(x))
    x = self.out(x)     

In [193]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [194]:
model = RegressionModel(NUM_FEATURES).to(device)
print(model)
torchsummary.summary(model, (NUM_FEATURES,))

RegressionModel(
  (layer1): Linear(in_features=11, out_features=16, bias=True)
  (layer2): Linear(in_features=16, out_features=32, bias=True)
  (layer3): Linear(in_features=32, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 16]             192
              ReLU-2                   [-1, 16]               0
            Linear-3                   [-1, 32]             544
              ReLU-4                   [-1, 32]               0
            Linear-5                   [-1, 16]             528
              ReLU-6                   [-1, 16]               0
            Linear-7                    [-1, 1]              17
Total params: 1,281
Trainable params: 1,281
Non-trainable params: 0
----------------------------------------------------------------
Input 

In [195]:
## Training Related
loss_stats = {'train': [], 'val': []}


In [196]:
@torch.no_grad()
def evaluate(model, val_dl, criterion):
  """ 
  Evaluation Step for Regression Model
  Inputs Needed: Model, Validation DataLoader, Loss Criterion
  Output: Loss value and Prediction for Validation Data
  """
  model.eval()
  loss = 0.0
  for X, y in val_dl:
    out = model(X)
    loss += criterion(out, y.unsqueeze(1))
  return loss 

In [197]:
len(train_dl)

32

In [198]:
len(val_dl)

256

In [199]:
def fit(model, criterion, optimizer, train_dl, val_dl, numEpochs):
  """Fit Train Data and evaluate for Validation Data"""
  start_fit = time()
  best_vloss = np.inf
  best_WandB = copy.deepcopy(model.state_dict())

  ## For all epochs:
  for epoch in tqdm(range(numEpochs)):
    print("Epoch: {}".format(epoch+1))
    flag_new = False
    print("*"*15)
    epoch_time = time()
    ## Training
    train_epoch_loss = 0.0
    model.train()

    for X, y in train_dl:
      X, y = X.to(device), y.to(device)
      optimizer.zero_grad() ## Reset the residual gradients
      
      ## Forward
      predY = model(X)
      train_loss = criterion(predY, y.unsqueeze(1))
      ## Unsqueeze gives a column vector!

      train_loss.backward()
      optimizer.step()
      train_epoch_loss += train_loss.item() / len(y)
    
    val_loss = evaluate(model, val_dl, criterion).item() / len(val_dl)
    loss_stats['train'].append(train_epoch_loss)
    loss_stats['val'].append(val_loss)
    ## Model Checkpoint
    if best_vloss > val_loss:
      best_vloss = val_loss
      best_WandB = model.state_dict()
      flag_new = True

    print("Train Loss: {:.4f}\t Valid Loss: {:.4f}\nTime per Epoch: {:.4f}".format(train_epoch_loss, val_loss,  time() - epoch_time))
    print()
    if flag_new:
      print("\nNew Best Val Loss Found!")
  if val_loss > best_vloss:
      model.load_state_dict(best_WandB)
  return model


In [200]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), 
                       lr = LEARNING_RATE)

In [201]:
next(iter(train_dl))

[tensor([[0.3805, 0.3393, 0.2100, 0.0897, 0.1269, 0.2985, 0.1135, 0.5029, 0.5086,
          0.2733, 0.4154],
         [0.3717, 0.2857, 0.4900, 0.1310, 0.1185, 0.2537, 0.3688, 0.5969, 0.4828,
          0.1304, 0.1692],
         [0.4956, 0.1875, 0.4600, 0.0690, 0.1152, 0.0746, 0.0071, 0.4559, 0.3103,
          0.0559, 0.3077],
         [0.1327, 0.1786, 0.2500, 0.0966, 0.0985, 0.3284, 0.1809, 0.4596, 0.5862,
          0.3602, 0.3385],
         [0.2566, 0.2500, 0.1800, 0.0483, 0.1119, 0.3433, 0.1809, 0.4721, 0.5172,
          0.1180, 0.1538],
         [0.2743, 0.5312, 0.0100, 0.0828, 0.0868, 0.4478, 0.1277, 0.2673, 0.5776,
          0.1118, 0.5231],
         [0.5487, 0.1786, 0.4400, 0.0483, 0.0851, 0.2239, 0.1064, 0.6189, 0.4138,
          0.2422, 0.2462],
         [0.3540, 0.4598, 0.6800, 0.0621, 0.6528, 0.2687, 0.1738, 0.4589, 0.2414,
          0.4720, 0.1385],
         [0.3540, 0.3125, 0.2700, 0.0966, 0.0718, 0.1940, 0.0745, 0.3737, 0.3793,
          0.2547, 0.4308],
         [0.4159, 0

In [202]:
model = fit(model, criterion, optimizer, train_dl, val_dl, EPOCHS)

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Epoch: 1
***************
Train Loss: 30.1791	 Valid Loss: 27.6789
Time per Epoch: 0.1579


New Best Val Loss Found!
Epoch: 2
***************
Train Loss: 23.0522	 Valid Loss: 15.6234
Time per Epoch: 0.1482


New Best Val Loss Found!
Epoch: 3
***************
Train Loss: 6.4921	 Valid Loss: 0.8276
Time per Epoch: 0.1358


New Best Val Loss Found!
Epoch: 4
***************
Train Loss: 0.9144	 Valid Loss: 0.7823
Time per Epoch: 0.1405


New Best Val Loss Found!
Epoch: 5
***************
Train Loss: 0.7534	 Valid Loss: 0.7369
Time per Epoch: 0.1259


New Best Val Loss Found!
Epoch: 6
***************
Train Loss: 0.7092	 Valid Loss: 0.7039
Time per Epoch: 0.1528


New Best Val Loss Found!
Epoch: 7
***************
Train Loss: 0.6771	 Valid Loss: 0.6813
Time per Epoch: 0.1215


New Best Val Loss Found!
Epoch: 8
***************
Train Loss: 0.6445	 Valid Loss: 0.6555
Time per Epoch: 0.1406


New Best Val Loss Found!
Epoch: 9
***************
Train Loss: 0.6161	 Valid Loss: 0.6338
Time per Epoch: 0.12

In [209]:
def predict(model, test_dl):
  """ Predict """
  model.eval()
  predY_list = []
  with torch.no_grad():
    for X, y in test_dl:
      X = X.to(device)
      predY = model(X)
      predY_list.append(predY.numpy())

  return [arr.squeeze().tolist() for arr in predY_list]


In [211]:
y_pred = predict(model, test_dl)

In [215]:
print(mean_squared_error(testY, y_pred))
print(r2_score(testY, y_pred))

0.40196190779184987
0.37706732614133076


In [213]:
r2_score?