In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Load the feature table from disk; the trailing expression displays
# its (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer="data/feature.csv")
df.shape

In [None]:
# Preview the first five rows (the default for head) to eyeball the columns.
df.head(n=5)

In [None]:
# Count distinct non-null values per column.
df.nunique(axis=0)

In [None]:
# Per-column fill values for missing data. Keys must match column names
# exactly: fillna skips dict keys that are not columns without raising, so the
# previous key "children:" (stray trailing colon) left that column's NaNs
# untouched.
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
# fillna returns a new frame; `df` itself is left unmodified.
full_data_cln = df.fillna(nan_replacements)

In [None]:
# Build features and target from the NaN-filled frame rather than the raw
# `df`; otherwise the fill step has no effect and any numeric NaNs end up in
# the feature matrix (and from there in the training tensors).
# get_dummies one-hot encodes the non-numeric columns; everything is cast to
# float so the matrix can be converted to a tensor.
X = pd.get_dummies(full_data_cln.drop(columns=["is_canceled"])).astype(float)
# Double brackets keep y as a one-column DataFrame (2-D), not a Series.
y = full_data_cln[["is_canceled"]]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the target: one output column per distinct label in `y`.
# fit_transform returns a sparse matrix here, so densify it in the same step.
ohe = OneHotEncoder()
y_ohe = ohe.fit_transform(y).toarray()
# Show which label each one-hot column corresponds to.
print(ohe.get_feature_names_out())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation (fixed seed for a repeatable split)
# and convert each of the four resulting arrays to a torch tensor in one pass.
X_train, X_valid, y_train, y_valid = (
    torch.Tensor(part)
    for part in train_test_split(X.values, y_ohe, test_size=0.33, random_state=42)
)

# Display the tensor shapes as the cell output.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network: three 512-unit ReLU hidden layers and a linear head.

    Layer widths are read from notebook-level tensors at construction time:
    the input width is ``X_train.shape[1]`` and the output width is
    ``y_train.shape[1]``. The forward pass returns raw logits (no final
    activation is applied).
    """

    def __init__(self):
        super().__init__()
        n_inputs = X_train.shape[1]
        n_outputs = y_train.shape[1]
        hidden = 512
        # Attribute name and layer order are kept so state_dict keys
        # ("linear_relu_stack.<index>.weight" / ".bias") stay the same.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_inputs, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_outputs),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the resulting logits."""
        return self.linear_relu_stack(x)

# Instantiate the network and a plain SGD optimizer over all of its
# parameters, then print the layer structure.
model = NeuralNetwork()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
print(model)

In [None]:
# Training: full-batch gradient descent with early stopping on validation loss.
num_epochs = 1000
# Start from +inf so the first validation loss always counts as an
# improvement. BCE loss on 0/1 targets is non-negative, so an initial value of
# 0.0 could never be beaten: no checkpoint would ever be saved and training
# would always stop after exactly `early_stop_patience` epochs.
best_val_loss = float("inf")
num_bad_epochs = 0
early_stop_patience = 50

# Applies the sigmoid internally, so the model must emit raw logits.
criterion = nn.BCEWithLogitsLoss()

for epoch in range(num_epochs):
    # forward + backward + optimize (one step over the whole training set)
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    # The model's output width is y_train.shape[1], so outputs and targets
    # already have matching shapes. No squeeze(): it would drop any size-1
    # dimension and make the shapes disagree.
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Print intermediate training results
    if (epoch+1) % 100 == 0:
        print('Epoch [{}/{}], Train Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

    # Measure performance on the validation data (no gradients needed)
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_valid)
        val_loss = criterion(val_outputs, y_valid).item()

        # Validation accuracy: the model outputs logits, so convert them to
        # probabilities before thresholding at 0.5 (thresholding the raw
        # logits at 0.5 would correspond to a probability cut-off of ~0.62).
        val_preds = (torch.sigmoid(val_outputs) > 0.5).float()
        val_acc = (val_preds == y_valid).float().mean()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Val Loss: {val_loss:.4f}, Val Acc: {val_acc.item():.4f}')

    # Stop early once the validation loss has failed to improve for
    # `early_stop_patience` consecutive epochs; checkpoint every improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss  # plain float, not a tensor
        num_bad_epochs = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        num_bad_epochs += 1
        if num_bad_epochs >= early_stop_patience:
            print("Early stopping")
            break