In [1]:
# Kaggle Titanic Dataset - 타이타닉 생존자 예측 프로젝트

# 딥러닝 개발 프로세스
# Loading the Data
# - 데이터셋 다운로드 및 로드
# Data Preprocessing(Feature Engineering)
# - 주어진 데이터셋의 12개의 feature를 분석한 후에 12개의 feature 와 함께 타이타닉 승객의 생존율에 영향을 미칠 수 있는 새로운 5개 feature 정의
# - feature / label 분리 후 딥러닝 학습에 필수적인 표준화(Standadization) 수행
# Building Model
# - input layer / hidden layer / output layer 로 구성된 인공신경망(ANN) 구축
# Training Model
# - 평균정확도 80% 나올수 있도록 하이퍼 파라미터를 튜닝하면서 모델 학습 진행
# Survival Prediction
# - test 데이터로 생존율을 예측(Prediction)하고 결과를 kaggle에 제출(Submit)하여 전 세계 상위 5% 안에 들 수 있는 정확도(Survival Prediction Score) 달성

In [2]:
# 라이브러리, GPU 설정
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, zipfile, shutil

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Pytorch Version : ', torch.__version__, ', Device : ', DEVICE)

# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Pytorch Version :  2.7.1+cu118 , Device :  cuda


In [3]:
# 데이터 폴더 존재시 삭제
if os.path.exists('.\\data\\titanic'):
    shutil.rmtree('.\\data\\titanic')
    print('.\\data\\titanic is removed')

.\data\titanic is removed


In [4]:
# kaggle titanic dataset 압축풀기
with zipfile.ZipFile(file='.\\data\\titanic.zip') as target_file:
    target_file.extractall(path='.\\data\\titanic')

In [5]:
# 데이터 불러오기
train = pd.read_csv('.\\data\\titanic\\train.csv')
test = pd.read_csv('.\\data\\titanic\\test.csv')

In [6]:
# 데이터 전처리
def preprocess_data(df):
    # 결측치 처리
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    
    # 범주형 변수 처리
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    # 필요한 특성 선택
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    return df[features]

In [7]:
# 커스텀 데이터셋 클래스
class TitanicDataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets) if targets is not None else None
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        if self.targets is not None:
            return self.features[idx], self.targets[idx]
        return self.features[idx]

In [8]:
# 신경망 모델 정의
class TitanicNet(nn.Module):
    def __init__(self, input_size):
        super(TitanicNet, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),
            
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.layers(x)

In [9]:
# 데이터 전처리 및 스케일링
X = preprocess_data(train)
y = train['Survived'].to_numpy()

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 학습/검증 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(712, 7) (712,)
(179, 7) (179,)


In [12]:
# 데이터셋 및 데이터로더 생성
train_dataset = TitanicDataset(X_train, y_train)
val_dataset = TitanicDataset(X_val, y_val)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [14]:
# 모델 초기화
model = TitanicNet(input_size=X_train.shape[1]).to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
# 모델 학습
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    for features, targets in train_loader:
        features, targets = features.to(DEVICE), targets.to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), targets)
        
        loss.backward()
        optimizer.step()
    
    # 검증
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for features, targets in val_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)
                outputs = model(features)
                predicted = (outputs.squeeze() >= 0.5).float()
                total += targets.size(0)
                correct += (predicted == targets).sum().item()
            
            accuracy = correct / total
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.4f}')

Epoch [10/200], Loss: 0.5822, Validation Accuracy: 0.8101
Epoch [20/200], Loss: 0.5944, Validation Accuracy: 0.8268
Epoch [30/200], Loss: 0.6718, Validation Accuracy: 0.8045
Epoch [40/200], Loss: 0.3529, Validation Accuracy: 0.8101
Epoch [50/200], Loss: 0.1885, Validation Accuracy: 0.8156
Epoch [60/200], Loss: 0.9046, Validation Accuracy: 0.8156
Epoch [70/200], Loss: 0.5334, Validation Accuracy: 0.8156
Epoch [80/200], Loss: 0.2497, Validation Accuracy: 0.8212
Epoch [90/200], Loss: 0.1719, Validation Accuracy: 0.8101
Epoch [100/200], Loss: 0.7826, Validation Accuracy: 0.8101
Epoch [110/200], Loss: 0.3600, Validation Accuracy: 0.8101
Epoch [120/200], Loss: 0.3784, Validation Accuracy: 0.8156
Epoch [130/200], Loss: 0.2306, Validation Accuracy: 0.8101
Epoch [140/200], Loss: 0.3575, Validation Accuracy: 0.8156
Epoch [150/200], Loss: 0.4106, Validation Accuracy: 0.8101
Epoch [160/200], Loss: 0.3173, Validation Accuracy: 0.7989
Epoch [170/200], Loss: 0.1499, Validation Accuracy: 0.8101
Epoch 

In [17]:
# 테스트 데이터 예측
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
test_dataset = TitanicDataset(test_scaled)
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(DEVICE)
        outputs = model(features)
        predicted = (outputs.squeeze() >= 0.5).float()
        predictions.extend(predicted.cpu().numpy())

# 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
submission.to_csv('.\\data\\titanic\\submission.csv', index=False)
print('\n제출 파일이 생성되었습니다.')


제출 파일이 생성되었습니다.
