# Building a Regression Model in PyTorch
- Updated 2023.04.21
- Written by shyeon

## Initial setup

- Project path 

In [1]:
import os
from pathlib import Path

# Step the working directory up one level from wherever the notebook was
# launched, so that relative paths used later are resolved from the parent
# folder.
curr_path = Path.cwd()
os.chdir(curr_path.parent)

- Packages

In [2]:
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms

from src.models.rank import radiorank
from src.utils.graph import build_nx_graph

- Parameters

In [3]:
# Seed forwarded as `random_state` to train_test_split when the data is split.
random_seed = 42


## Data Loader

- Prepare train dataset

In [4]:
# Load the value table from the processed-data folder.
value_df = pd.read_pickle("data/processed/values.pickle")

# Two-way lookup between a column label of the table and its position.
test_items = value_df.columns.tolist()
idx_2_item = dict(enumerate(test_items))
item_2_idx = {item: pos for pos, item in enumerate(test_items)}

# Unshuffled split: the leading rows form the train set, the trailing rows
# the test set.
train_df, test_df = train_test_split(
    value_df,
    test_size=0.5,
    random_state=random_seed,
    shuffle=False,
)

# The scaler is fitted on the training rows only and then applied to both
# splits.
scaler = StandardScaler()
scaler.fit(train_df)

# Wrapping the transformed arrays in new DataFrames gives them default
# integer row and column labels, so `titles` is a list of column positions.
scaled_train_df = pd.DataFrame(scaler.transform(train_df))
scaled_test_df = pd.DataFrame(scaler.transform(test_df))
titles = scaled_train_df.columns.tolist()

scaled_train_df.head()

ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'

In [7]:
class RfFinalTestDataset(Dataset):
    """Map-style dataset that serves one DataFrame row per sample.

    Each sample is split into two arrays: the values of the ``input_nm``
    columns (inputs) and the values of the ``label_nm`` columns (labels).

    Args:
        df: Source table; one row per sample.
        input_nm: Column labels returned as the inputs of a sample.
        label_nm: Column labels returned as the labels of a sample.
    """

    def __init__(self, df: pd.DataFrame, input_nm: list, label_nm: list) -> None:
        self.df = df
        self.input_nm = input_nm
        self.label_nm = label_nm

    def __len__(self) -> int:
        """Return the number of rows (samples) in the table."""
        return self.df.shape[0]

    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]:
        """Return ``(inputs, labels)`` for the row at position ``idx``.

        Args:
            idx: Zero-based row position, i.e. a value in ``range(len(self))``.

        Returns:
            Two arrays holding the row's ``input_nm`` and ``label_nm`` values.

        Raises:
            IndexError: If ``idx`` is outside the table.
            KeyError: If a requested column label does not exist.
        """
        # Select the row by position, not by index label: samplers draw
        # positions from range(len(self)), which only coincide with the index
        # labels when the frame has a default RangeIndex.
        row = self.df.iloc[idx]
        inputs = row.loc[self.input_nm].to_numpy()
        labels = row.loc[self.label_nm].to_numpy()
        return inputs, labels

In [8]:
# The first ten columns are the model inputs; every remaining column is a label.
input_cols, label_cols = titles[:10], titles[10:]
train_dataset = RfFinalTestDataset(df=scaled_train_df, input_nm=input_cols, label_nm=label_cols)
test_dataset = RfFinalTestDataset(df=scaled_test_df, input_nm=input_cols, label_nm=label_cols)

NameError: name 'scaled_train_df' is not defined

## Build a prediction model

- Model

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected regressor that narrows from ``in_num`` to ``out_num``.

    The network is three ``nn.Linear`` layers with ReLU between them. The
    output layer is left linear so predictions are not clamped to be
    non-negative.

    Args:
        in_num: Number of input features.
        out_num: Number of regression targets.
    """

    def __init__(self, in_num, out_num):
        super().__init__()
        # Width removed by each of the first two layers.
        # NOTE(review): the original comment described the divisor as
        # "number of hidden layers + 1", which would be 3 here; with 2 the
        # second hidden width already equals (or is within one of) out_num.
        # Left unchanged so the layer shapes stay the same.
        gap_by_step = int((in_num - out_num) / 2)
        first_hidden = in_num - gap_by_step
        second_hidden = in_num - gap_by_step * 2
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_num, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            # No activation after the output layer: regression targets can be
            # negative (e.g. after standardization), and a trailing ReLU would
            # make those values unreachable.
            nn.Linear(second_hidden, out_num),
        )

    def forward(self, x):
        """Return the predictions of the layer stack for input batch ``x``."""
        return self.linear_relu_stack(x)

In [None]:
# Pairwise correlations between the columns of the standardized training data.
train_corr = scaled_train_df.corr()
# Project helper; presumably builds a networkx graph from the correlation
# matrix with `titles` as node labels — confirm in src.utils.graph.
G = build_nx_graph(train_corr, titles)

# Project helper; the meaning of the 0.1 and "value" arguments is not visible
# here — TODO confirm in src.models.rank. The result is sliced like a
# sequence when the model is sized.
selected_nodes = radiorank(G, 0.1, "value")

In [None]:
# Size the network from the ranked nodes: up to the first 40 are counted as
# inputs and whatever follows as outputs.
# NOTE(review): the datasets split `titles` at column 10, not at 40 — confirm
# that the model's input/output widths match what the DataLoaders yield.
n_inputs = len(selected_nodes[:40])
n_outputs = len(selected_nodes[40:])
model = NeuralNetwork(n_inputs, n_outputs)
model = model.to(device)
print(model)

In [None]:
# Both splits use the same batching: 100 samples per batch, shuffled.
loader_options = {"batch_size": 100, "shuffle": True}
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
# Mean-squared-error objective minimized with plain SGD.
learning_rate = 1e-3
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)