In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim

# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set(style="whitegrid")
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Load the train and test CSVs, using PassengerId as the row index.
train = pd.read_csv("data/train.csv", index_col="PassengerId")
test = pd.read_csv("data/test.csv", index_col="PassengerId")
# Display both shapes as a quick sanity check of the load.
train.shape, test.shape

((891, 11), (418, 10))

In [3]:
# Preview the first two training rows.
train.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## data split

In [4]:
# Name of the target column; it is split off from the feature columns below.
label_name = "Survived"

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for validation. The fixed random_state
# keeps the split identical from run to run.
X_train_raw, X_valid_raw, y_train_raw, y_valid_raw = train_test_split(
    train.drop(columns=label_name),
    train[label_name],
    test_size=0.33,
    random_state=42,
)

In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every raw column. The encoder is fitted on the training split
# only; with handle_unknown='ignore', categories that first appear in the
# validation/test rows are encoded as all zeros instead of raising.
# NOTE(review): every column is treated as categorical here, including the
# numeric and free-text ones (e.g. Age, Fare, Name, Ticket), which is why the
# encoded matrix is so wide -- confirm this is intended.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_raw)

# Densify each sparse result and convert it to a float32 tensor.
X_train, X_valid, X_test = (
    torch.tensor(ohe.transform(frame).toarray(), dtype=torch.float32)
    for frame in (X_train_raw, X_valid_raw, test)
)

X_train.shape

torch.Size([596, 1484])

In [7]:
# Turn the 1-D label vectors into (N, 1) float32 tensors so they line up with
# the single-column output produced by the model.
y_train, y_valid = (
    torch.tensor(labels.to_numpy(), dtype=torch.float32).unsqueeze(-1)
    for labels in (y_train_raw, y_valid_raw)
)
print(y_train.shape, y_valid.shape)
y_train[:5]

torch.Size([596, 1]) torch.Size([295, 1])


tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [8]:
# Re-display the training label shape (the same value the previous cell printed).
y_train.shape

torch.Size([596, 1])

## model

In [9]:
# Model definition
class LogisticRegression(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Maps input_size features to a single logit per sample.
        self.linear = nn.Linear(input_size, 1)
        # Squashes the logit into a value between 0 and 1.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        return self.sigmoid(self.linear(x))

# Seed PyTorch's RNG before the model is built: nn.Linear draws its initial
# weights at random, so without a seed the losses, accuracy and predictions
# below change on every run. 42 matches the random_state used for the split.
torch.manual_seed(42)

# Instantiate the model with one input per encoded feature column.
input_size = X_train.shape[1]
model = LogisticRegression(input_size)

# Loss function and optimizer: binary cross-entropy on the sigmoid outputs,
# minimised with plain SGD.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

`optimizer.zero_grad()`는 optimizer에 등록된 모든 매개변수의 gradient를 초기화하는 메서드입니다. 이를 호출하지 않으면, backward() 함수를 호출할 때마다 이전에 계산된 gradient 값에 현재 gradient 값이 누적되어 학습이 제대로 이루어지지 않을 수 있습니다.

따라서 매 학습 루프(iteration)에서 loss.backward()를 호출하기 전에 optimizer.zero_grad()를 호출하여 gradient 값을 초기화해야 합니다. 예를 들어, 아래의 학습 코드에서는 각 iteration마다 optimizer.zero_grad()를 호출하여 gradient를 초기화합니다.

## train

In [11]:
# Training: full-batch gradient descent over the whole training set.
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass and loss on the full training batch.
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Clear gradients left over from the previous step, backpropagate, update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report progress only on every 50th epoch.
    if (epoch + 1) % 50 != 0:
        continue
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Epoch [50/1000], Loss: 0.6376
Epoch [100/1000], Loss: 0.6096
Epoch [150/1000], Loss: 0.5911
Epoch [200/1000], Loss: 0.5768
Epoch [250/1000], Loss: 0.5646
Epoch [300/1000], Loss: 0.5540
Epoch [350/1000], Loss: 0.5444
Epoch [400/1000], Loss: 0.5359
Epoch [450/1000], Loss: 0.5281
Epoch [500/1000], Loss: 0.5211
Epoch [550/1000], Loss: 0.5147
Epoch [600/1000], Loss: 0.5089
Epoch [650/1000], Loss: 0.5036
Epoch [700/1000], Loss: 0.4987
Epoch [750/1000], Loss: 0.4942
Epoch [800/1000], Loss: 0.4901
Epoch [850/1000], Loss: 0.4862
Epoch [900/1000], Loss: 0.4827
Epoch [950/1000], Loss: 0.4794
Epoch [1000/1000], Loss: 0.4764


## Validation

* torch.no_grad()는 PyTorch에서 gradient 계산을 수행하지 않도록 하는 context manager입니다. 이를 사용하면 모델의 inference 과정에서 gradient 계산을 하지 않아 메모리 사용량을 줄일 수 있습니다. 또한 gradient 계산이 필요하지 않은 validation, test 데이터셋 등에서 사용하여 불필요한 계산을 방지할 수 있습니다.


In [13]:
# Validation: threshold the predicted probabilities at 0.5 and compare them
# with the held-out labels.
with torch.no_grad():
    outputs = model(X_valid)
    y_valid_predict = (outputs >= 0.5).float().squeeze()

# Fraction of validation rows whose predicted class matches the label.
(y_valid_predict == y_valid.squeeze()).numpy().mean()

0.7796610169491526

## Predict

In [14]:
# Test set: same 0.5 threshold, returned as a flat NumPy array of 0./1. values.
with torch.no_grad():
    outputs = model(X_test)
    y_predict = (outputs >= 0.5).float().squeeze().numpy()

y_predict[:10]

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0.], dtype=float32)

In [15]:
# Count how many test rows fall into each predicted class.
pd.Series(y_predict).value_counts()

0.0    308
1.0    110
dtype: int64

## submit

In [16]:
# Load the sample submission file so its layout can be reused for our output.
submit = pd.read_csv("data/gender_submission.csv")
submit.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [17]:
# The predictions are assigned by position, so the sample submission must list
# passengers in exactly the same order as the test set. Fail loudly rather
# than silently writing predictions against the wrong PassengerId.
assert submit["PassengerId"].tolist() == test.index.tolist(), (
    "sample submission rows are not in the same order as the test set"
)
# Overwrite the placeholder labels with the model's predictions as integers.
submit["Survived"] = y_predict.astype(int)
submit.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0


In [18]:
# Create the output directory first: to_csv does not create missing folders,
# so on a fresh checkout the write would otherwise fail.
submission_path = Path("submissions/submit.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submit.to_csv(submission_path, index=False)
# Read the file back as a quick check of what was actually written.
pd.read_csv(submission_path).head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
