# Building a Regression Model in PyTorch
- Updated 2023.04.21
- Written by shyeon

## Initialize setting

- Project path 

In [1]:
import os
from pathlib import Path

curr_path = Path().absolute()
os.chdir(curr_path.parent)  # change working directory to parent path

- Packages

In [2]:
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm import tqdm

from src.models.rank import radiorank
from src.utils.graph import build_nx_graph


- Parameters

In [3]:
random_seed = 42


## Data Loader

- Prepare train dataset

In [4]:
value_df = pd.read_pickle("data/processed/values.pickle")

test_items = value_df.columns.tolist()
item_2_idx = {v: k for k, v in enumerate(test_items)}
idx_2_item = {k: v for k, v in enumerate(test_items)}

train_df, test_df = train_test_split(value_df, test_size=0.5, random_state=random_seed, shuffle=False)
scaler = StandardScaler().fit(train_df)
# scaled_train = pd.DataFrame(scaler.transform(train), columns=train.columns)
# scaled_test = pd.DataFrame(scaler.transform(test), columns=test.columns)
scaled_train_df = pd.DataFrame(scaler.transform(train_df))
scaled_test_df = pd.DataFrame(scaler.transform(test_df))
titles = scaled_train_df.columns.tolist()

In [5]:
class RfFinalTestDataset(Dataset):
    def __init__(self, df:pd.DataFrame, input_nm:list, label_nm:list) -> None:
        self.df = df
        self.input_nm = input_nm
        self.label_nm = label_nm

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self, idx:int) -> tuple[np.array, np.array] :
        inputs = self.df.loc[idx, self.input_nm].values
        labels = self.df.loc[idx, self.label_nm].values
        return inputs, labels

## Build a prediction model

- Model

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self, in_num, out_num):
        super(NeuralNetwork, self).__init__()
        gap_by_step = int((in_num - out_num) / 2) # the num of hidden layer + 1
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_num, in_num-gap_by_step),
            nn.ReLU(),
            nn.Linear(in_num-gap_by_step, in_num-gap_by_step*2),
            nn.ReLU(),
            nn.Linear(in_num-gap_by_step*2, out_num),
            nn.ReLU()
        )
        
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [8]:
train_corr = scaled_train_df.corr()
G = build_nx_graph(train_corr, titles)

selected_nodes = radiorank(G, 0.1, "value")

In [9]:
train_dataset = RfFinalTestDataset(scaled_train_df, selected_nodes[:40], selected_nodes[40:])
test_dataset = RfFinalTestDataset(scaled_test_df, selected_nodes[:40], selected_nodes[40:])

model = NeuralNetwork(len(selected_nodes[:40]), len(selected_nodes[40:]))
model.to(device)
print(model)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=40, out_features=33, bias=True)
    (1): ReLU()
    (2): Linear(in_features=33, out_features=26, bias=True)
    (3): ReLU()
    (4): Linear(in_features=26, out_features=26, bias=True)
    (5): ReLU()
  )
)


In [10]:
epochs = 100
batch_size = 1000
learning_rate = 1e-3

In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
next(train_dataloader)

TypeError: 'DataLoader' object is not an iterator

In [13]:
loss_stats = {
    "train": [],
    "val": []
}

In [16]:
print("Begin training.")
with tqdm(range(epochs), unit="batch", mininterval=0, disable=True) as bar:    
    for epoch in bar:
        bar.set_description(f"Epoch {epoch}")        
        # TRAINING
        train_epoch_loss = 0
        model.train()
        for X_train_batch, y_train_batch in train_dataloader:
            print(X_train_batch.shape, y_train_batch.shape)
            X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
            optimizer.zero_grad()
            y_train_pred = model(X_train_batch)
            train_loss = loss_fn(y_train_pred, y_train_batch.unsqueeze(1))
            train_loss.backward()
            optimizer.step()
            train_epoch_loss += train_loss.item()
            
        # VALIDATION    
        with torch.no_grad():
            val_epoch_loss = 0
            model.eval()
            for X_val_batch, y_val_batch in test_dataloader:
                X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
                y_val_pred = model(X_val_batch)
                val_loss = loss_fn(y_val_pred, y_val_batch.unsqueeze(1))
                
                val_epoch_loss += val_loss.item()
        loss_stats['train'].append(train_epoch_loss/len(train_dataloader))
        loss_stats['val'].append(val_epoch_loss/len(test_dataloader))                              
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_dataloader):.5f} | Val Loss: {val_epoch_loss/len(test_dataloader):.5f}')

Begin training.
torch.Size([1000, 40]) torch.Size([1000, 26])


RuntimeError: mat1 and mat2 must have the same dtype