## 요구사항 1
#### titanic_dataset.py 분석

In [13]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split


class TitanicDataset(Dataset):
  def __init__(self, X, y):
    # X 텐서는 float 실수형, y 텐서는 정수형 0, 1
    self.X = torch.FloatTensor(X)
    self.y = torch.LongTensor(y)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    feature = self.X[idx]
    target = self.y[idx]
    return {'input': feature, 'target': target}

  def __str__(self):
    str = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
      len(self.X), self.X.shape, self.y.shape
    )
    return str


class TitanicTestDataset(Dataset):
  def __init__(self, X):
    self.X = torch.FloatTensor(X)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    feature = self.X[idx]
    return {'input': feature}

  def __str__(self):
    str = "Data Size: {0}, Input Shape: {1}".format(
      len(self.X), self.X.shape
    )
    return str

In [14]:
def get_preprocessed_dataset():
    CURRENT_FILE_PATH = '/home/cuj/Python/kaggle_titanic'

    train_data_path = os.path.join(CURRENT_FILE_PATH, "train.csv")
    test_data_path = os.path.join(CURRENT_FILE_PATH, "test.csv")

    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)

    all_df = pd.concat([train_df, test_df], sort=False)
    # 데이터를 csv로부터 불러온 후 6번의 전처리 과정을 거쳐 데이터셋 구축

    all_df = get_preprocessed_dataset_1(all_df)

    all_df = get_preprocessed_dataset_2(all_df)

    all_df = get_preprocessed_dataset_3(all_df)

    all_df = get_preprocessed_dataset_4(all_df)

    all_df = get_preprocessed_dataset_5(all_df)

    all_df = get_preprocessed_dataset_6(all_df)

    train_X = all_df[~all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]

    test_X = all_df[all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)

    dataset = TitanicDataset(train_X.values, train_y.values)
    #print(dataset)
    train_dataset, validation_dataset = random_split(dataset, [0.8, 0.2])
    test_dataset = TitanicTestDataset(test_X.values)
    #print(test_dataset)

    return train_dataset, validation_dataset, test_dataset


def get_preprocessed_dataset_1(all_df):
    # Pclass별 Fare 평균값을 사용하여 Fare 결측치 메우기
    Fare_mean = all_df[["Pclass", "Fare"]].groupby("Pclass").mean().reset_index()
    Fare_mean.columns = ["Pclass", "Fare_mean"]
    all_df = pd.merge(all_df, Fare_mean, on="Pclass", how="left")
    all_df.loc[(all_df["Fare"].isnull()), "Fare"] = all_df["Fare_mean"]

    return all_df


def get_preprocessed_dataset_2(all_df):
    # name을 세 개의 컬럼으로 분리하여 다시 all_df에 합침
    name_df = all_df["Name"].str.split("[,.]", n=2, expand=True)
    name_df.columns = ["family_name", "honorific", "name"]
    name_df["family_name"] = name_df["family_name"].str.strip()
    name_df["honorific"] = name_df["honorific"].str.strip()
    name_df["name"] = name_df["name"].str.strip()
    all_df = pd.concat([all_df, name_df], axis=1)

    return all_df


def get_preprocessed_dataset_3(all_df):
    # honorific별 Age 평균값을 사용하여 Age 결측치 메우기
    honorific_age_mean = all_df[["honorific", "Age"]].groupby("honorific").median().round().reset_index()
    honorific_age_mean.columns = ["honorific", "honorific_age_mean", ]
    all_df = pd.merge(all_df, honorific_age_mean, on="honorific", how="left")
    all_df.loc[(all_df["Age"].isnull()), "Age"] = all_df["honorific_age_mean"]
    all_df = all_df.drop(["honorific_age_mean"], axis=1)

    return all_df


def get_preprocessed_dataset_4(all_df):
    # 가족수(family_num) 컬럼 새롭게 추가
    all_df["family_num"] = all_df["Parch"] + all_df["SibSp"]

    # 혼자탑승(alone) 컬럼 새롭게 추가
    all_df.loc[all_df["family_num"] == 0, "alone"] = 1
    all_df["alone"].fillna(0, inplace=True)

    # 학습에 불필요한 컬럼 제거
    all_df = all_df.drop(["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"], axis=1)

    return all_df


def get_preprocessed_dataset_5(all_df):
    # honorific 값 개수 줄이기
    all_df.loc[
    ~(
            (all_df["honorific"] == "Mr") |
            (all_df["honorific"] == "Miss") |
            (all_df["honorific"] == "Mrs") |
            (all_df["honorific"] == "Master")
    ),
    "honorific"
    ] = "other"
    all_df["Embarked"].fillna("missing", inplace=True)

    return all_df


def get_preprocessed_dataset_6(all_df):
    # 카테고리 변수를 LabelEncoder를 사용하여 수치값으로 변경하기
    category_features = all_df.columns[all_df.dtypes == "object"]
    from sklearn.preprocessing import LabelEncoder
    for category_feature in category_features:
        le = LabelEncoder()
        if all_df[category_feature].dtypes == "object":
          le = le.fit(all_df[category_feature])
          all_df[category_feature] = le.transform(all_df[category_feature])

    return all_df

In [15]:
from torch import nn
class MyModel(nn.Module):
  def __init__(self, n_input, n_output):
    super().__init__()

    # 모델 구성 -> (bn, in) -> (bn, 30) -> (bn, 30) -> (bn, out)
    self.model = nn.Sequential(
      nn.Linear(n_input, 30),
      nn.ReLU(),
      nn.Linear(30, 30),
      nn.ReLU(),
      nn.Linear(30, n_output),
    )

  def forward(self, x):
    x = self.model(x)
    return x

In [16]:
def test(test_data_loader):
  print("[TEST]")
  batch = next(iter(test_data_loader))
  print("{0}".format(batch['input'].shape))
  my_model = MyModel(n_input=11, n_output=2)
  output_batch = my_model(batch['input'])
  prediction_batch = torch.argmax(output_batch, dim=1)
  for idx, prediction in enumerate(prediction_batch, start=892):
      print(idx, prediction.item())


if __name__ == "__main__":
  # csv 형태의 데이터 셋을 분할
  train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()

  print("train_dataset: {0}, validation_dataset.shape: {1}, test_dataset: {2}".format(
    len(train_dataset), len(validation_dataset), len(test_dataset)
  ))
  print("#" * 50, 1)

  for idx, sample in enumerate(train_dataset):
    print("{0} - {1}: {2}".format(idx, sample['input'], sample['target']))

  print("#" * 50, 2)
  # 3개의 데이터로더 및 각각의 데이터로더 출력
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
  validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=True)
  test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

  print("[TRAIN]")
  for idx, batch in enumerate(train_data_loader):
    print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))

  print("[VALIDATION]")
  for idx, batch in enumerate(validation_data_loader):
    print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))

  print("#" * 50, 3)

  test(test_data_loader)

train_dataset: 713, validation_dataset.shape: 178, test_dataset: 418
################################################## 1
0 - tensor([ 3.0000,  1.0000, 30.0000,  0.0000,  0.0000,  7.8958,  2.0000, 13.3029,
         2.0000,  0.0000,  1.0000]): 0
1 - tensor([ 3.0000,  1.0000, 29.0000,  0.0000,  0.0000,  7.8958,  0.0000, 13.3029,
         2.0000,  0.0000,  1.0000]): 0
2 - tensor([ 3.0000,  0.0000, 21.0000,  1.0000,  0.0000,  9.8250,  2.0000, 13.3029,
         1.0000,  1.0000,  0.0000]): 0
3 - tensor([ 3.0000,  1.0000, 26.0000,  0.0000,  0.0000,  7.8875,  2.0000, 13.3029,
         2.0000,  0.0000,  1.0000]): 0
4 - tensor([ 1.0000,  1.0000, 62.0000,  0.0000,  0.0000, 26.5500,  2.0000, 87.5090,
         2.0000,  0.0000,  1.0000]): 0
5 - tensor([ 3.0000,  0.0000,  8.0000,  3.0000,  1.0000, 21.0750,  2.0000, 13.3029,
         1.0000,  4.0000,  0.0000]): 0
6 - tensor([ 2.0000,  0.0000, 45.0000,  0.0000,  0.0000, 13.5000,  2.0000, 21.1792,
         3.0000,  0.0000,  1.0000]): 1
7 - tensor([ 1.00

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["alone"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["Embarked"].fillna("missing", inplace=True)


## 요구사항 2
#### titanic 딥러닝 모델 훈련 코드 및 Activate Function 변경해보기
#### Sigmoid, ReLU, LeakyReLU로 실험을 진행해보았지만 LeakyReLU의 성능이 가장 높았음

In [2]:
import os, torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder
import torchmetrics
import random
import numpy as np
from torch import nn, optim
from datetime import datetime
import wandb
import argparse

In [3]:
class TitanicDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # 데이터 정규화
        X_min = self.X.min(dim=0, keepdim=True).values
        X_max = self.X.max(dim=0, keepdim=True).values
        self.X = (self.X - X_min) / (X_max - X_min)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class MyModel(nn.Module):
    def __init__(self, n_input, n_output, activate):
        super().__init__()
        
        # 모델 전개 흐름
        """
        (batch, 7) -> fc(batch, n_hidden[0]) -> bn_1d(batch, n_hidden[0]) -> sig(batch, n_hidden[0])
        -> fc(batch, n_hidden[n]) -> bn_1d(batch, n_hidden[n]) -> sig(batch, n_hidden[n])
        -> fc(batch, 1)
        """
        layers = []
        layers.append(nn.Linear(n_input, wandb.config.n_hidden_unit_list[0]))
        layers.append(nn.BatchNorm1d(num_features=wandb.config.n_hidden_unit_list[0]))
        layers.append(activate)
        for i in range(1, len(wandb.config.n_hidden_unit_list)):
            layers.append(nn.Linear(wandb.config.n_hidden_unit_list[i-1], wandb.config.n_hidden_unit_list[i]))
            layers.append(nn.BatchNorm1d(num_features=wandb.config.n_hidden_unit_list[i]))
            layers.append(activate)
        layers.append(nn.Linear(wandb.config.n_hidden_unit_list[-1], n_output))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        x = self.model(x)
        return x

In [4]:
BASE_PATH = '/home/cuj/Python/kaggle_titanic'

import sys
sys.path.append(BASE_PATH)

def prepresentation():
    train_data_path = os.path.join(BASE_PATH, "train.csv")
    test_data_path = os.path.join(BASE_PATH, "test.csv")
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    train_df_size = len(train_df)

    data = pd.concat([train_df, test_df], sort=False)

    # 1. 데이터 전처리
    # 'Age'의 결측값을 중앙값으로 채움
    # 'Embarked'의 결측값을 가장 많이 등장한 값으로 채움
    data.fillna({'Age': data['Age'].median(), 'Embarked': data['Embarked'].mode()[0], 'Fare': data['Fare'].median()}, inplace=True)
    # 'Sex'와 'Embarked'를 숫자로 인코딩(0, 1, 2, 3)
    label_encoder = LabelEncoder()
    data['Sex'] = label_encoder.fit_transform(data['Sex'])
    data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

    train_df, test_df = data[:train_df_size], data[train_df_size:]
    # train_df와 test_df로 나누고 X, y로 분할
    # train_df에서 학습 및 평가를 진행하고 test_df 데이터를 예측
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    train_X = train_df[features].values
    train_y = train_df['Survived'].values
    test_X = test_df[features].values
    test_y = test_df['Survived'].values

    return train_X, train_y, test_X, test_y


def get_data(X, y):
    # train_X, train_y를 입력받아 데이터셋으로 저장
    titanic_dataset = TitanicDataset(X, y)
    
    # 트레이닝 데이터셋과 평가 데이터셋을 8:2로 분할
    train_dataset, validation_dataset = random_split(titanic_dataset, [0.8, 0.2])
    print(len(train_dataset), len(validation_dataset))

    train_data_loader = DataLoader(dataset=train_dataset, batch_size=wandb.config.batch_size, shuffle=True, drop_last=wandb.config.dropout)
    validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=len(validation_dataset))

    return train_data_loader, validation_data_loader

def get_model_and_optimizer(activate='ReLU', optimizer='SGD', lr = 1e-3):
    # 활성화 함수 매핑 딕셔너리
    activation_functions = {
        'ReLU': nn.ReLU(),
        'Sigmoid': nn.Sigmoid(),
        'LeakyReLU': nn.LeakyReLU()
    }
    my_model = MyModel(n_input=7, n_output=1, activate=activation_functions[activate])

    optimizer_functions = {
        'SGD': optim.SGD(my_model.parameters(), lr=lr),
        'Adam': optim.Adam(my_model.parameters(), lr=lr)
    }
    optimizer = optimizer_functions[optimizer]

    return my_model, optimizer

In [8]:
def training_loop(model, optimizer, train_data_loader, validation_data_loader, next_print_epoch = 20):
    # 정확도 평가는 binary 정확도 계산으로 진행
    # Loss는 MSE loss 이용
    n_epochs = wandb.config.epochs
    loss_fn = nn.MSELoss()  # Use a built-in loss function
    print_epoch = next_print_epoch
    accuracy_metric = torchmetrics.Accuracy(task="binary")

    for epoch in range(1, n_epochs + 1):
        model.train()
        loss_train = 0.0
        num_trains = 0
        for train_batch in train_data_loader:
            # input 데이터로 예측한 것과 target 데이터 간의 loss 계산
            input, target = train_batch
            output_train = model(input)
            loss = loss_fn(output_train, target.view(-1, 1))
            loss_train += loss.item()
            num_trains += 1

            # loss 역전파 및 파라미터 업데이트
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss_validation = 0.0
        num_validations = 0
        accuracy_metric.reset()
        
        with torch.no_grad():
            model.eval()
            for validation_batch in validation_data_loader:
                # 평가 또한 동일한 절차로 진행, 추가로 accuracy_metric을 이용해 정확도 계산
                input, target = validation_batch
                output_validation = model(input)
                loss = loss_fn(output_validation, target.view(-1, 1))
                loss_validation += loss.item()
                _, t = validation_batch
                accuracy_metric.update((output_validation >= 0.5).float(), t.view(-1, 1))
                num_validations += 1

        # 로그 출력
        wandb.log({
            "Training loss": loss_train / num_trains,
            "Validation loss": loss_validation / num_validations
        })

        if epoch >= print_epoch:
            print(
                f"Epoch {epoch}, "
                f"Training loss {loss_train / num_trains:.4f}, "
                f"Validation loss {loss_validation / num_validations:.4f}, "
                f"Accuracy {accuracy_metric.compute() * 100}"
            )
            print_epoch += next_print_epoch

## 요구사항 3
## 테스트 및 submission.csv 생성

In [12]:
parser = argparse.ArgumentParser()

# argparse를 이용해 batchsize, epochs, activate, optimizer, dropout 여부를 메인함수에서 결정
parser.add_argument(
"--wandb", action=argparse.BooleanOptionalAction, default=True, help="True or False"
)
parser.add_argument(
"-b", "--batch_size", type=int, default=8, help="Batch size (int, default: 512)"
)
parser.add_argument(
"-e", "--epochs", type=int, default=300, help="Number of training epochs (int, default:1_000)"
)
"""
activate Function ReLU, Sigmoid, LeakyReLU를 이용해보았음.
"""
parser.add_argument(
"-a", "--activate", type=str, default="LeakyReLU", help="ReLU, Sigmoid, LeakyReLu"
)
parser.add_argument(
"-o", "--optimizer", type=str, default="SGD", help="SGD, Adam"
)
parser.add_argument(
"-d", "--dropout", type=bool, default=True, help="True, False"
)

args = parser.parse_args(args=[])

current_time_str = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

config = {
    'epochs': args.epochs,
    'batch_size': args.batch_size,
    'learning_rate': 3e-3,
    'n_hidden_unit_list': [16, 32, 16, 8],
    'activate': args.activate,
    'optimizer': args.optimizer,
    'dropout': args.dropout,
}

# 시드 고정
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
seed_everything()

wandb.init(
    mode="online" if args.wandb else "disabled",
    project="my_model_training",
    notes="My first wandb experiment",
    tags=["my_model", "california_housing"],
    name=current_time_str,
    config=config
)
print(args)
print(wandb.config)

# train.csv 데이터 전처리 진행(결측값 채우기)
train_X, train_y, test_X, _ = prepresentation()
# train_X와 train_y를 가지고 전처리 및 데이터셋 구성
train_data_loader, validation_data_loader = get_data(train_X, train_y)

# 트레이닝 model 선언
linear_model, optimizer = get_model_and_optimizer(
    activate=wandb.config.activate,
    optimizer=wandb.config.optimizer,
    lr=wandb.config.learning_rate
)

print("#" * 50, 1)

# 트레이닝 진행
training_loop(
    model=linear_model,
    optimizer=optimizer,
    train_data_loader=train_data_loader,
    validation_data_loader=validation_data_loader,
    next_print_epoch=50
)
wandb.finish()

Namespace(wandb=True, batch_size=8, epochs=300, activate='LeakyReLU', optimizer='SGD', dropout=True)
{'epochs': 300, 'batch_size': 8, 'learning_rate': 0.003, 'n_hidden_unit_list': [16, 32, 16, 8], 'activate': 'LeakyReLU', 'optimizer': 'SGD', 'dropout': True}
713 178
################################################## 1
Epoch 50, Training loss 0.1526, Validation loss 0.1355, Accuracy 83.14606475830078
Epoch 100, Training loss 0.1520, Validation loss 0.1269, Accuracy 84.26966094970703
Epoch 150, Training loss 0.1460, Validation loss 0.1208, Accuracy 84.26966094970703
Epoch 200, Training loss 0.1479, Validation loss 0.1170, Accuracy 84.26966094970703
Epoch 250, Training loss 0.1440, Validation loss 0.1213, Accuracy 84.83145904541016
Epoch 300, Training loss 0.1418, Validation loss 0.1179, Accuracy 84.26966094970703


0,1
Training loss,█▇▇▆▃▄▄▃▅▄▅▄▃▄▄▃▃▄▄▂▂▃▄▄▃▁▃▅▃▃▃▃▄▂▄▄▃▂▃▃
Validation loss,█▅▅▄▄▄▃▃▄▃▃▃▂▂▃▂▃▃▃▂▂▂▃▂▂▁▁▂▂▁▁▂▁▁▂▂▁▂▂▂

0,1
Training loss,0.1418
Validation loss,0.11791


In [88]:
# 모델 학습이 이루어지지는 않으므로 test_y는 임의의 len(test_X), 1 크기의 텐서로 정의
test_dataset = TitanicDataset(test_X, np.zeros((len(test_X), 1)))

# 출력 데이터 프레임 정의
predictions = dict()
predictions['PassengerId'] = []
predictions['Survived'] = []

# test 데이터 예측
linear_model.eval()
with torch.no_grad():
    idx = len(train_X)
    for input_data, _ in test_dataset:
        idx += 1
        input_data = input_data.unsqueeze(dim=0)
        predictions['PassengerId'].append(idx)
        predictions['Survived'].append(int((linear_model(input_data) >= 0.5)))
df = pd.DataFrame(predictions)

print(df)
# submission.csv 파일을 제출
df.to_csv('linear_model_pred.csv', index=False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


## 요구사항 4 submission.csv 제출 및 등수 확인
![img](https://raw.githubusercontent.com/cudje/DeapLearning-Class/refs/heads/main/submitResult.png)

## 숙제 후기

직접 fully-conected layer 모델을 변경해보고, 여러 파라미터를 바꾸면서 실험을 진행했었는데,\
acuraccy가 계속 바뀌어 좋은 모델의 configuration을 찾는 것이 어렵다는 것을 느꼈습니다.\
이외에 모델을 학습하는 데 있어, non-linear한 성질을 갖도록 적절한 activate function을 설정해주는 것이 중요하다는 것과\
layer-depth가 깊을 수록 모델의 성능 또한 개선된다는 것을 알게 되었습니다.