# Dataloader and stochastic gradient descent

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import torch
from torch import nn 
import torch.nn.functional as F
from torch import utils

torch.manual_seed(0)
np.random.seed(0)

- ## 1. Data

In [3]:
data_np = pd.read_csv('data/Advertising.csv', index_col=0)
data_np.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
data_np.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [5]:
# extract features and responses
X = data_np.iloc[:, :-1].values
y = data_np['Sales'].values

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# normalization 
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# we may further split the train set into a training set and val set 
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.33,
                                                  random_state=0) # 0.33 x 0.75 = 0.25


# convert to torch object
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float().view(-1,1) 
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).float().view(-1,1) 
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float().view(-1,1) 

In [6]:
# print out the size of train, val and test set
print('train size:', X_train.shape)
print('val size:', X_val.shape)
print('test size:', X_test.shape)

train size: torch.Size([100, 3])
val size: torch.Size([50, 3])
test size: torch.Size([50, 3])


- ## 2. Dataloader

In [7]:
# dataset in pytorch
X_train_data = utils.data.TensorDataset(X_train, y_train)

In [8]:
# pass the torch dataset to Dataloader (an iterable object)

# batch_size: how many samples to load each time
# shuffle: whether to reshuffle the dataset every epoch

# we use mini-batch GD, which only requires b sample to compute the gradient. Here we set b = 8
# if b = 1, then we have SGD. Usually, we set b = 2, 4, 8, 16, 32, 64, ... for efficiency

train_loader = utils.data.DataLoader(X_train_data, batch_size = 8, shuffle=True) 

In [9]:
# check every element of this loader contains 8 samples
list(train_loader)[0]

[tensor([[0.0530, 0.4052, 0.1610],
         [0.5915, 0.1875, 0.0550],
         [0.0801, 0.0323, 0.1980],
         [0.7585, 0.1653, 0.5560],
         [0.6486, 0.7137, 0.7470],
         [0.6398, 0.5786, 0.1730],
         [0.4345, 0.8629, 0.2800],
         [0.5446, 0.6371, 0.5200]]),
 tensor([[ 7.6000],
         [12.8000],
         [ 6.9000],
         [13.4000],
         [19.2000],
         [17.3000],
         [18.0000],
         [16.9000]])]

In [11]:
print(len(train_loader))

13


- ## 3. Build & Train Model

In [12]:
# ====== Construct a model class ========= #
class myLinearRegression(torch.nn.Module):    
    def __init__(self, inputSize):        
        super().__init__()     
        self.Linear = nn.Linear(inputSize, 1) 
    def forward(self, x):        
        out = self.Linear(x) 
        return out

model = myLinearRegression(inputSize = X.shape[1]) # initalize the model

# ====== Loss and optimizer =========
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1) 

# ====== Training =========
epochs = 100
for i in range(epochs):
    # since train_loader is an iterable object, we can use for loop. 
    # Note x_train, y_train contains only 8 sample as per batch_size = 8
    for (x_train, y_train) in train_loader:
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # calulate output and loss 
        output = model(x_train)
        loss = loss_fn(output, y_train)

        # backprop and take a step
        loss.backward()
        optimizer.step()

    if i % 10 == 0:
        # Note we need to deactivate training (not compute gradient) and move to validation phase
        model.eval()
        with torch.no_grad():
            output_val = model(X_val)
            loss_val = loss_fn(output_val, y_val)
        model.train() # after you predict on val set, you need to set back to training mode

        print('Epoch {}: {:.4f} (Train) {:.4f} (Val)'.format(i, loss, loss_val))

Epoch 0: 103.2291 (Train) 141.0782 (Val)
Epoch 10: 20.0245 (Train) 7.3877 (Val)
Epoch 20: 2.1095 (Train) 5.6105 (Val)
Epoch 30: 1.8203 (Train) 4.3243 (Val)
Epoch 40: 9.5302 (Train) 3.5664 (Val)
Epoch 50: 1.9123 (Train) 3.1039 (Val)
Epoch 60: 3.3097 (Train) 2.9038 (Val)
Epoch 70: 0.5600 (Train) 2.8171 (Val)
Epoch 80: 0.8859 (Train) 2.7851 (Val)
Epoch 90: 2.3696 (Train) 2.7917 (Val)


- ## 4. Evaluate on test data

In [13]:
# we can predict test data similarly as before
y_pred = model(X_test)
test_mse = loss_fn(y_pred, y_test)
print(test_mse)

tensor(3.9157, grad_fn=<MseLossBackward0>)


In [14]:
# a more safer way is to set model to evaluation phase similarly for validation
model.eval()
with torch.no_grad():
    y_pred = model(X_test)
    test_mse = loss_fn(y_pred, y_test)
print(test_mse)

tensor(3.9157)


***

# Overfitting and remedies
### - Dropout
### - Regularization
### - Early stopping

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import torch
from torch import nn 
import torch.nn.functional as F
from torch.utils import data

torch.manual_seed(0)
np.random.seed(0)

In [16]:
# load dataset for binary classification
data = pd.read_csv('data/pima-indians-diabetes.csv', header= None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       768 non-null    int64  
 1   1       768 non-null    int64  
 2   2       768 non-null    int64  
 3   3       768 non-null    int64  
 4   4       768 non-null    int64  
 5   5       768 non-null    float64
 6   6       768 non-null    float64
 7   7       768 non-null    int64  
 8   8       768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [18]:
# split data into X and y
X = data.iloc[:,0:8].values
y = data.iloc[:,8].values

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# normalization 
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# we may further split the train set into a training set and val set 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=0) # 0.33 x 0.75 = 0.25

# convert to torch object
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).long()
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).long()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).long()

In [19]:
# print out the size of train, val and test set
print('train size:', X_train.shape)
print('val size:', X_val.shape)
print('test size:', X_test.shape)

train size: torch.Size([385, 8])
val size: torch.Size([191, 8])
test size: torch.Size([192, 8])


In [23]:
class NNClassifier(torch.nn.Module):
    def __init__(self, inputSize, hiddenSize):
        super().__init__() 
        
        self.Linear1 = nn.Linear(inputSize, hiddenSize)
        self.Linear2 = nn.Linear(hiddenSize, hiddenSize)
        self.Linear3 = nn.Linear(hiddenSize, 1)
        
        self.act1 = nn.ReLU()
        self.act2 = nn.ReLU()
        
        # add two dropout layers with probability p = 0.2
        self.drop1 = nn.Dropout(p=0.2)
        self.drop2 = nn.Dropout(p=0.2)
        
    def forward(self, x):        
        # first layer (linear + act + dropout)
        x = self.Linear1(x)
        x = self.act1(x)
        x = self.drop1(x)
        
        # second layer (linear + act + dropout)
        x = self.Linear2(x)
        x = self.act2(x)
        x = self.drop2(x)
        
        # output layer (no activation)
        x = self.Linear3(x)
        
        return x
    
# init model
model = NNClassifier(X_train.shape[1], 512) # initalize the model

# loss function 
loss_fn = nn.BCEWithLogitsLoss()  

# optimizer (with weight regularization = 0.01)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.01)

In [24]:
def compute_acc(output, y):
    """
        output: neural network output with size [n,1]
        y: true target values with size [n,]
    """
    pred = (torch.sigmoid(output.squeeze()) > 0.5).long()
    acc = (pred == y).float().sum()/y.shape[0]
    return acc

In [25]:
# training starts
epochs = 2000
best_acc = 0
for i in range(epochs):
    
    # zero the parameter gradients
    optimizer.zero_grad()

    # calulate output and loss 
    output = model(X_train)
    loss = loss_fn(output, y_train.float().view(-1,1))

    # backprop and take a step
    loss.backward()
    optimizer.step()
    
    if i % 100 == 0:
        model.eval()
        with torch.no_grad():
            output_val = model(X_val)
            acc_val = compute_acc(output_val, y_val)
            if acc_val >= best_acc:
                best_acc = acc_val
                torch.save(model, 'materials/saved_best_model_w5.pth')
                print('Saving model...')
        model.train() 

        print('Epoch {}: {:.4f} (Train loss) {:.4f} (Val acc)'.format(i, loss, acc_val))

Saving model...
Epoch 0: 0.6877 (Train loss) 0.6021 (Val acc)
Saving model...
Epoch 100: 0.5073 (Train loss) 0.7539 (Val acc)
Saving model...
Epoch 200: 0.4898 (Train loss) 0.7592 (Val acc)
Epoch 300: 0.4850 (Train loss) 0.7539 (Val acc)
Saving model...
Epoch 400: 0.4846 (Train loss) 0.7696 (Val acc)
Epoch 500: 0.4810 (Train loss) 0.7539 (Val acc)
Epoch 600: 0.4815 (Train loss) 0.7382 (Val acc)
Epoch 700: 0.4780 (Train loss) 0.7539 (Val acc)
Epoch 800: 0.4808 (Train loss) 0.7539 (Val acc)
Epoch 900: 0.4844 (Train loss) 0.7435 (Val acc)
Epoch 1000: 0.4774 (Train loss) 0.7592 (Val acc)
Epoch 1100: 0.4816 (Train loss) 0.7487 (Val acc)
Epoch 1200: 0.4809 (Train loss) 0.7435 (Val acc)
Epoch 1300: 0.4810 (Train loss) 0.7435 (Val acc)
Epoch 1400: 0.4822 (Train loss) 0.7435 (Val acc)
Epoch 1500: 0.4851 (Train loss) 0.7539 (Val acc)
Epoch 1600: 0.4840 (Train loss) 0.7487 (Val acc)
Epoch 1700: 0.4786 (Train loss) 0.7592 (Val acc)
Epoch 1800: 0.4835 (Train loss) 0.7592 (Val acc)
Epoch 1900: 0.482

In [26]:
# load model
model = torch.load('materials/saved_best_model_w5.pth')

# evaluate test performance
model.eval()
with torch.no_grad():
    output_test = model(X_test)
    acc_test = compute_acc(output_test, y_test)
    
acc_test

tensor(0.7812)