# Building a Regression Model in PyTorch
- Updated 2023.04.21
- Written by shyeon

## Initialize setting

- Project path 

In [1]:
import os
from pathlib import Path

curr_path = Path().absolute()
os.chdir(curr_path.parent)  # change working directory to parent path

- Packages

In [13]:
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms

from src.models.rank import radiorank
from src.utils.graph import build_nx_graph

- Parameters

In [3]:
random_seed = 42


## Data Loader

- Prepare train dataset

In [4]:
with open("data/processed/values.pickle","rb") as f:
    value_df = pickle.load(f)

test_items = value_df.columns.tolist()
item_2_idx = {v: k for k, v in enumerate(test_items)}
idx_2_item = {k: v for k, v in enumerate(test_items)}

train_df, test_df = train_test_split(value_df, test_size=0.5, random_state=random_seed, shuffle=False)
scaler = StandardScaler().fit(train_df)
# scaled_train = pd.DataFrame(scaler.transform(train), columns=train.columns)
# scaled_test = pd.DataFrame(scaler.transform(test), columns=test.columns)
scaled_train_df = pd.DataFrame(scaler.transform(train_df))
scaled_test_df = pd.DataFrame(scaler.transform(test_df))
titles = scaled_train_df.columns.tolist()

scaled_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,-0.811939,0.110734,0.449351,0.476192,0.459512,-0.376119,0.353364,0.045816,-1.366235,1.311278,...,-0.496838,-0.053573,-0.929236,-0.198702,-1.021761,-1.965713,0.201812,-1.113875,-0.478792,-0.832434
1,-0.806638,-0.015902,-0.037221,0.300948,-0.430587,-0.641422,-0.146184,-0.001298,-0.234221,-0.935857,...,-1.44485,0.168023,0.521519,0.409202,-0.801954,0.090127,-0.395164,0.540528,-1.678826,-0.832434
2,-0.589281,-0.323446,-0.454284,-0.075504,-0.863821,-0.614212,-0.266764,0.030111,0.143118,-1.688823,...,-1.874139,0.957459,-0.209662,1.321059,-0.163466,-0.266498,-1.495838,0.143858,-0.429995,-0.832434
3,-0.652898,-0.902351,-0.732325,-0.763502,-1.651519,-1.233252,-1.222795,-0.935732,0.897794,-0.606434,...,0.558496,0.818961,-0.453389,-0.008732,-0.487943,-0.434322,0.882737,-0.514033,-0.502287,-0.832434
4,-0.758925,-0.522444,-0.852389,-0.542823,-1.202531,-0.954344,-1.076376,-1.265533,0.520456,0.052412,...,0.469061,1.192904,-0.163238,0.143244,-1.419508,-0.308454,1.834168,-0.059314,-0.04143,-0.832434


In [5]:
class RfFinalTestDataset(Dataset):
    def __init__(self, df:pd.DataFrame, input_nm:list, label_nm:list) -> None:
        self.df = df
        self.input_nm = input_nm
        self.label_nm = label_nm

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self, idx:int) -> tuple[np.array, np.array] :
        inputs = self.df.loc[idx, self.input_nm].values
        labels = self.df.loc[idx, self.label_nm].values
        return inputs, labels

In [6]:
train_dataset = RfFinalTestDataset(scaled_train_df, titles[:10], titles[10:])
test_dataset = RfFinalTestDataset(scaled_test_df, titles[:10], titles[10:])

## Build a prediction model

- Model

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [20]:
class NeuralNetwork(nn.Module):
    def __init__(self, in_num, out_num):
        super(NeuralNetwork, self).__init__()
        gap_by_step = int((in_num - out_num) / 2) # the num of hidden layer + 1
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_num, in_num-gap_by_step),
            nn.ReLU(),
            nn.Linear(in_num-gap_by_step, in_num-gap_by_step*2),
            nn.ReLU(),
            nn.Linear(in_num-gap_by_step*2, out_num),
            nn.ReLU()
        )
        
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [21]:
train_corr = scaled_train_df.corr()
G = build_nx_graph(train_corr, titles)

selected_nodes = radiorank(G, 0.1, "value")

In [22]:
model = NeuralNetwork(len(selected_nodes[:40]), len(selected_nodes[40:])).to(device)
print(model)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=40, out_features=33, bias=True)
    (1): ReLU()
    (2): Linear(in_features=33, out_features=26, bias=True)
    (3): ReLU()
    (4): Linear(in_features=26, out_features=26, bias=True)
    (5): ReLU()
  )
)


In [24]:
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=True)

In [25]:
loss_fn = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)