In [1]:
%matplotlib inline
# Import required libraries

import os
import time
import sys

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch.utils.data import random_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import urllib
import numpy as np
import matplotlib.pyplot as plt
import glob

In [2]:
# This is a tool I have provided you to help you download your file.

def download_file(url, filename):
    """
    A function that downloads the data file from a URL
    Parameters
    ----------
    url : string
        url where the file to download is located
    filename : string
        location where to save the file
    reporthook : function
        callback to display the download progress
    """
    if not os.path.isfile(filename):
        urllib.request.urlretrieve(url, filename, reporthook)
        
def reporthook(count, block_size, total_size):
    """
    A function that displays the status and speed of the download
    """

    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration + 0.0001))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()

In [3]:
# You can download your file by typing your first name into the name block
# The name used is the first part of your first name as listed in BB learn
# If you have problems downloading the data please reach out to me

name = 'Dhruv'
# download_file(f'https://zenodo.org/record/7339649/files/data_{name}.npz?download=1','data.npz')

In [27]:
# Load the data

data = np.load('data.npz')
data_list = data.files
for item in data_list:
    print(f"{item}: {data[item].shape}")

# print(data["training_true"])

training_feat: (100000, 30)
training_true: (100000, 3)
validation_feat: (65536, 30)


In [67]:
## Create tensors from numpy arrays

# Training features
training_feat_np = data[data_list[0]]
scaler = StandardScaler()
training_feat_scaled = scaler.fit(training_feat_np)
training_feat_scaled = scaler.transform(training_feat_np)
X = torch.from_numpy(training_feat_scaled)
print(X.shape)
# Target Values
training_true_np = data[data_list[1]]
y = torch.from_numpy(training_true_np)
print(y.shape)
# Validation Features
validation_feat_np = data[data_list[2]]
X_test = torch.from_numpy(validation_feat_np)

torch.Size([100000, 30])
torch.Size([100000, 3])


In [61]:
print("Range for results:", np.min(training_feat_scaled), np.max(training_feat_scaled))

Range for results: -1.740495746836084 1.7395879139777914


In [6]:
class Data(Dataset):
  '''Dataset Class to store the samples and their corresponding labels, 
  and DataLoader wraps an iterable around the Dataset to enable easy access to the samples.
  '''

  def __init__(self, X: np.ndarray, y: np.ndarray, device = 'cpu') -> None:

    # need to convert float64 to float32 else 
    # will get the following error
    # RuntimeError: expected scalar type Double but found Float
    self.X = torch.from_numpy(X.astype(np.float32)).to(device)
    self.y = torch.from_numpy(y.astype(np.float32)).to(device)
    self.len = self.X.shape[0]
  
  def __getitem__(self, index: int) -> tuple:
    return self.X[index], self.y[index]

  def __len__(self) -> int:
    return self.len

In [166]:
# Split dataset into test/train
split_dataset = random_split(training_feat_np, 
                             lengths=[2/3, 1/3], 
                             generator=torch.Generator().manual_seed(42))

train = np.array(split_dataset[0][:])
test = np.array(split_dataset[1])

# Configure dataset
# training_data = Data(X=training_feat_np, y=training_true_np)
# testing_data = Data(X=validation_feat_np, y=training_true_np)

training_data = Data(X=train, y=training_true_np)
testing_data = Data(X=test, y=training_true_np)


30
30


In [175]:
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(testing_data, batch_size=batch_size, shuffle=True)

for X, y in train_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break


Shape of X [N, C, H, W]: torch.Size([64, 30])
Shape of y: torch.Size([64, 3]) torch.float32
Shape of X [N, C, H, W]: torch.Size([64, 30])
Shape of y: torch.Size([64, 3]) torch.float32


In [167]:
# number of features (len of X cols)
input_dim =train.shape[1]
# number of hidden layers set this to 50
hidden_layers = 50
# Add the number of output dimensions
output_dim = 3

In [168]:
class Neural_Network(nn.Module):
  ''' Regression Model
  ''' 

  # note, you can ignore the `:int` and `-> None` this is just more advanced doctring syntax
  def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
      '''The network has 4 layers
            - input layer
            - ReLu
            - hidden layer
            - ReLu
            - hidden layer
            - ReLu
            - output layer
      '''
      super(Neural_Network, self).__init__()
      # in this part you should intantiate each of the layer components
      # Type your code here
      self.flatten = nn.Flatten()
      self.linear_relu_stack = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, output_dim),
      )

  def forward(self, x: torch.Tensor) -> torch.Tensor:
      # In this part you should build a model that returns the 3 outputs of the regression
      # Type your code here
      x = self.linear_relu_stack(x)
      
      return x

In [169]:
# initiate the regression model
# make sure to put it on your GPU
model = Neural_Network(input_dim, hidden_layers, output_dim)
model = model.cpu()
print(model)

# criterion to computes the loss between input and target
# Choose a good criteria

# optimizer that will be used to update weights and biases
# you can choose any optimizer. I would recommend ADAM.
# This problem should not be hard to optimize. A good starting learning rate is 3e-5. 

Neural_Network(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=30, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=50, bias=True)
    (5): ReLU()
    (6): Linear(in_features=50, out_features=3, bias=True)
  )
)


In [170]:
# Hyperparameters
learning_rate = 3e-5
batch_size = 64
epochs = 20

In [171]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [172]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    
    pred_vals = []
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            pred_vals.append(pred.detach().to('cpu').numpy())
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(
        f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
    )
    return np.concatenate(pred_vals, axis=0)

In [176]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 0.897649  [    0/66667]
loss: 0.147759  [ 6400/66667]
loss: 0.093400  [12800/66667]
loss: 0.077079  [19200/66667]
loss: 0.061563  [25600/66667]
loss: 0.047413  [32000/66667]
loss: 0.052885  [38400/66667]
loss: 0.048684  [44800/66667]
loss: 0.040755  [51200/66667]
loss: 0.038482  [57600/66667]
loss: 0.042539  [64000/66667]
Epoch 2
-------------------------------
loss: 0.035761  [    0/66667]
loss: 0.037223  [ 6400/66667]
loss: 0.040420  [12800/66667]
loss: 0.036941  [19200/66667]
loss: 0.031781  [25600/66667]
loss: 0.037191  [32000/66667]
loss: 0.042082  [38400/66667]
loss: 0.032855  [44800/66667]
loss: 0.029056  [51200/66667]
loss: 0.034608  [57600/66667]
loss: 0.028082  [64000/66667]
Epoch 3
-------------------------------
loss: 0.030303  [    0/66667]
loss: 0.030936  [ 6400/66667]
loss: 0.029152  [12800/66667]
loss: 0.035777  [19200/66667]
loss: 0.028824  [25600/66667]
loss: 0.028411  [32000/66667]
loss: 0.032123  [38400/66667]
loss: 0.03

In [177]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------


RuntimeError: The size of tensor a (64) must match the size of tensor b (3) at non-singleton dimension 1