In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the zipped competition CSV files
base_path = "data"

# Read the three CSVs in one pass; every file is indexed by its "ID" column
train, test, submission = (
    pd.read_csv(f"{base_path}/{stem}.csv.zip", index_col="ID")
    for stem in ("train", "test", "sample_submission")
)

In [3]:
# Names of every object-dtype column in the training frame
categorical_feature = train.select_dtypes(include="object").columns
# Cast those columns to the pandas "category" dtype in both frames.
# The column list is derived from train only and reused as-is for test.
train[categorical_feature] = train[categorical_feature].astype("category")
test[categorical_feature] = test[categorical_feature].astype("category")

In [4]:
# Target is the "y" column; features are every remaining column of train
y = train["y"]
X = train.drop("y", axis="columns")
X.shape, y.shape

((4209, 376), (4209,))

In [5]:
# Hold-out validation: split X, y into X_train, X_valid, y_train, y_valid.
# The keyword is called test_size, but because the training data is being split
# it sets the size of the validation set (5% of the rows here).
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=42, test_size=0.05
)
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

((3998, 376), (211, 376), (3998,), (211,))

In [6]:
# Work on a copy: the encoding cell assigns into X_test column-wise, and a plain
# alias would silently overwrite the originally loaded `test` frame as well.
X_test = test.copy()

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode the categorical columns; categories not seen during fit become -1
oe = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")

# Learn the category -> code mapping from the training split only,
# then apply that same mapping to the train, validation and test features.
oe.fit(X_train[categorical_feature])
X_train[categorical_feature] = oe.transform(X_train[categorical_feature])
X_valid[categorical_feature] = oe.transform(X_valid[categorical_feature])
X_test[categorical_feature] = oe.transform(X_test[categorical_feature])

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim


# Convert the pandas objects into float32 tensors for the PyTorch model.
# Targets are reshaped to a column vector so they line up with the model output.
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_valid = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test.values, dtype=torch.float32)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

(torch.Size([3998, 376]),
 torch.Size([3998, 1]),
 torch.Size([211, 376]),
 torch.Size([211, 1]))

* [예제로 배우는 파이토치(PyTorch) — 파이토치 한국어 튜토리얼 (PyTorch tutorials in Korean)](https://tutorials.pytorch.kr/beginner/pytorch_with_examples.html)
* [Linear — PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)

In [9]:
# Regression network: two ReLU-activated hidden layers followed by a linear output layer
class MultivariateLinearRegression(nn.Module):
    """Fully connected regressor: input_size -> 128 -> 64 -> output_size.

    Args:
        input_size: number of input features per sample.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of feature rows to predictions (no activation on the output)."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
    

class R2ScoreLoss(nn.Module):
    """Loss based on the coefficient of determination (R^2).

    Returns ``1 - R^2 = SS_res / SS_tot`` so that *minimizing* the loss
    *maximizes* R^2. Returning R^2 itself would make an optimizer push the
    predictions away from the targets.

    Args:
        eps: lower bound applied to SS_tot to avoid dividing by zero when all
            targets are identical.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        """Return SS_res / SS_tot: 0 for a perfect fit, 1 for predicting the mean."""
        ss_res = torch.sum((y_true - y_pred) ** 2)
        ss_tot = torch.sum((y_true - torch.mean(y_true)) ** 2)
        # Clamp instead of adding eps so ordinary targets are unaffected
        return ss_res / torch.clamp(ss_tot, min=self.eps)
    
    
    
# Seed PyTorch's RNG so weight initialisation, and therefore the run, is repeatable
torch.manual_seed(42)

# Instantiate the model
model = MultivariateLinearRegression(input_size=X_train.shape[1], output_size=1)

# Loss function and optimizer
# criterion = R2ScoreLoss()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model: one full-batch gradient step per epoch
num_epochs = 2000
model.train()
for epoch in range(num_epochs):
    # Forward pass
    y_pred = model(X_train)

    # Compute the loss
    loss = criterion(y_pred, y_train)

    # Backward pass and weight update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Log every 200 epochs
    if (epoch+1) % 200 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}]')

# Evaluate on the hold-out validation split.
# NOTE: the message says "Test Loss", but the value is computed on X_valid / y_valid.
model.eval()
with torch.no_grad():
    y_pred_valid = model(X_valid)
    print(f'Test Loss: {criterion(y_pred_valid, y_valid).item():.4f}')

Epoch [200/2000], Loss: 112.0380]
Epoch [400/2000], Loss: 71.8323]
Epoch [600/2000], Loss: 66.4955]
Epoch [800/2000], Loss: 63.3859]
Epoch [1000/2000], Loss: 61.3936]
Epoch [1200/2000], Loss: 59.9712]
Epoch [1400/2000], Loss: 58.7246]
Epoch [1600/2000], Loss: 57.4232]
Epoch [1800/2000], Loss: 55.8380]
Epoch [2000/2000], Loss: 54.4302]
Test Loss: 65.2284


In [10]:
# Targets and predictions for the validation split should have matching shapes
tuple(t.shape for t in (y_valid, y_pred_valid))

(torch.Size([211, 1]), torch.Size([211, 1]))

In [11]:
from sklearn.metrics import r2_score

# R^2 on the hold-out validation split; tensors are handed to scikit-learn as NumPy arrays
valid_score = r2_score(y_true=y_valid.numpy(), y_pred=y_pred_valid.numpy())
valid_score

0.5395774971989298

In [12]:
# Predict on the test features without tracking gradients
with torch.no_grad():
    outputs = model(X_test)
    # Drop only the trailing output dimension: a bare squeeze() would also
    # collapse the batch dimension when there is a single row, yielding a 0-d array.
    y_predict = outputs.squeeze(dim=1).numpy()

y_predict[:5]

array([ 95.46068 , 113.63221 ,  94.816986,  73.68028 , 112.739365],
      dtype=float32)

https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/submissions

In [13]:
# Put the test-set predictions into the submission frame's "y" column
submission = submission.assign(y=y_predict)

In [14]:
# Embed the validation R^2 (5 decimal places) in the submission file name
file_name = "submit_pytorch_{:.5f}.csv".format(valid_score)
file_name

'submit_pytorch_0.53958.csv'

In [15]:
# Write the submission, then read it back and show the first two rows as a sanity check
submission.to_csv(file_name)
pd.read_csv(file_name, index_col="ID").iloc[:2]

Unnamed: 0_level_0,y
ID,Unnamed: 1_level_1
1,95.46068
2,113.63221
