In [None]:
import pandas as pd
import numpy as np

In [None]:
base_path = "data"

# Every input is read from a `.csv.zip` file under base_path and indexed by
# its "ID" column; the generator yields the frames in the order unpacked here.
train, test, submission = (
    pd.read_csv(f'{base_path}/{stem}.csv.zip', index_col="ID")
    for stem in ("train", "test", "sample_submission")
)

In [None]:
# Columns that pandas parsed as generic objects in the training frame.
categorical_feature = train.select_dtypes(include="object").columns

# Cast those same columns to the pandas "category" dtype in both frames.
for frame in (train, test):
    frame[categorical_feature] = frame[categorical_feature].astype("category")

In [None]:
# Target is the "y" column; the features are every remaining column.
y = train["y"]
X = train.drop(columns="y")
X.shape, y.shape

In [None]:
# Use train_test_split to divide X, y into X_train, X_valid, y_train, y_valid.
# This is a hold-out validation split of the training data.
# test_size=0.05 => the argument is called "test", but because the training data
# is what gets split here, it sets the size of the validation set (5% of the rows).
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.05, random_state=42)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Take a copy rather than an alias: the categorical columns of X_test are
# overwritten in place by the encoder further down, and with a plain alias
# that would silently rewrite the original `test` frame as well.
X_test = test.copy()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Categories not seen while fitting are encoded as -1 instead of raising.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Learn the category -> integer mapping from the training split only, then
# apply that single mapping to every split.
oe.fit(X_train[categorical_feature])
for split in (X_train, X_valid, X_test):
    split[categorical_feature] = oe.transform(split[categorical_feature])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


# Convert the pandas splits into float32 tensors for the PyTorch model.
# Targets are reshaped to a single column so they line up with the model's
# (n_samples, 1) output.
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
y_valid = torch.tensor(y_valid.values.reshape(-1, 1), dtype=torch.float32)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

* [예제로 배우는 파이토치(PyTorch) — 파이토치 한국어 튜토리얼 (PyTorch tutorials in Korean)](https://tutorials.pytorch.kr/beginner/pytorch_with_examples.html)
* [Linear — PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)

In [None]:
# Regression network definition
class MultivariateLinearRegression(nn.Module):
    """Fully connected regressor: input -> 256 -> 256 -> 64 -> output.

    A ReLU follows each of the first three linear layers; the last layer is
    left without an activation. Note that, despite the class name, the ReLU
    activations make the overall mapping non-linear.

    Args:
        input_size: number of input features per sample.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the three hidden layers and the output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # No activation on the final layer.
        return self.fc4(hidden)
    

class R2ScoreLoss(nn.Module):
    """Loss derived from the coefficient of determination (R^2).

    R^2 = 1 - SS_res / SS_tot is a score where *larger* is better, so handing
    it to a minimizing optimizer unchanged would push the fit away from the
    targets. This module therefore returns ``1 - R^2 = SS_res / SS_tot``:
    0 for a perfect fit, 1 when every prediction equals the mean of
    ``y_true``, and minimizing it is the same as maximizing R^2.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return ``1 - R^2`` of ``y_pred`` against ``y_true`` as a scalar tensor.

        Args:
            y_pred: predicted values.
            y_true: target values. It should have the same shape as ``y_pred``;
                differing shapes are combined by broadcasting in the
                subtraction below.

        Returns:
            Scalar tensor ``sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2)``.
            The denominator is zero when ``y_true`` is constant.
        """
        residual = y_true - y_pred
        total = y_true - torch.mean(y_true)
        # SS_res / SS_tot == 1 - R^2, the quantity that should be minimized.
        return torch.sum(residual ** 2) / torch.sum(total ** 2)
    
    
    
# Build the network; its input width is the number of feature columns.
model = MultivariateLinearRegression(input_size=X_train.shape[1], output_size=1)

# Loss function and optimizer (plain SGD over all model parameters).
criterion = R2ScoreLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Full-batch training: each epoch is one forward/backward pass over X_train.
num_epochs = 1000
for epoch in range(num_epochs):
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Forward pass and loss.
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Log every 10th epoch (1-based).
    # NOTE(review): the message ends with an unmatched ']' -- kept verbatim.
    if epoch % 10 == 9:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}]')

# Score the held-out split (X_valid / y_valid) without tracking gradients.
with torch.no_grad():
    y_pred_valid = model(X_valid)
    print(f'Test Loss: {criterion(y_pred_valid, y_valid).item():.4f}')

In [None]:
# Display the shapes of the validation targets and the validation predictions side by side.
y_valid.shape, y_pred_valid.shape

In [None]:
from sklearn.metrics import r2_score

# R^2 of the validation predictions as computed by scikit-learn.
r2_score(y_true=y_valid, y_pred=y_pred_valid)