```python

modualer_model/ # 프로젝트 폴더
├── service/
│   ├── utils.py
│   ├── data.py # 데이터
│   ├── model.py # 모델
│   ├── process.py # 학습 프로세스
└── data/ # 학습 데이터
    └── train.csv
    ├── test.csv
    └── submission.csv
```

In [9]:
import os

for dir in ['service', 'data']:
    # 만약 해당 폴더가 없다면,
    if not os.path.exists(dir):
        # 해당 폴더를 만들어줘
        os.makedirs(dir)

In [10]:
%%writefile service/utils.py

import os
import random
import numpy as np
import torch

def reset_seeds(func, seed=42):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)    # 파이썬 환경변수 시드 고정
  np.random.seed(seed)
  torch.manual_seed(seed) # cpu 연산 무작위 고정
  torch.cuda.manual_seed(seed) # gpu 연산 무작위 고정
  torch.backends.cudnn.deterministic = True  # cuda 라이브러리에서 Deterministic(결정론적)으로 예측하기 (예측에 대한 불확실성 제거 )

  def wrapper_func(*args, **kwargs):
    return func(*args, **kwargs)

  return wrapper_func


Overwriting service/utils.py


In [11]:
%%writefile service/data.py

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from utils import reset_seeds

# 타이타닉 데이터 로드
def __load_data() -> pd.DataFrame:
    return pd.read_csv("./data/train.csv")

def __process_drop(train, test):
    drop_cols = ['name', 'ticket', 'cabin']

    train.drop(drop_cols, axis=1, inplace=True) # 모델이 학습하는데 사용하는 데이터
    test.drop(drop_cols, axis=1, inplace=True) # 모델의 학습을 평가(잘했는지?? 못했는지??)하기 위한 데이터

def __process_null(train, test):
    age_median = train['age'].median()
    fare_median = train['fare'].median()
    embarked_mode = train['embarked'].mode().values[0]

    train['age'].fillna(age_median, inplace=True)
    test['age'].fillna(age_median, inplace=True)

    train['fare'].fillna(fare_median, inplace=True)
    test['fare'].fillna(fare_median, inplace=True)

    train['embarked'].fillna(embarked_mode, inplace=True)
    test['embarked'].fillna(embarked_mode, inplace=True)

def __process_astype(train, test):
    train['gender'] = train['gender'].map({'female': 0, 'male': 1})
    test['gender'] = test['gender'].map({'female': 0, 'male': 1})

    train['gender'] = train['gender'].astype(int)
    test['gender'] = test['gender'].astype(int)

    train['embarked'] = train['embarked'].map({'C': 0, 'Q': 1, 'S': 2})
    test['embarked'] = test['embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    train['embarked'] = train['embarked'].astype(int)
    test['embarked'] = test['embarked'].astype(int)


def __preprocess_data(train, test):
    print(f'before: {train.shape} / {test.shape}')

    __process_drop(train, test)
    __process_null(train, test)
    __process_astype(train, test)


def __process_encoding(train, test):
    enc_cols = ['gender', 'embarked']
    normal_cols = list(set(train.columns) - set(enc_cols))
    print(f'before: {train.shape} / {test.shape}')

    enc = OneHotEncoder()
    # train
    tmp_tr = pd.DataFrame(
        enc.fit_transform(train[enc_cols]).toarray(),
        columns = enc.get_feature_names_out()
    )
    enc_tr = pd.concat(
        [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
        , axis=1
    )
    # test
    tmp_te = pd.DataFrame(
        enc.transform(test[enc_cols]).toarray(),
        columns = enc.get_feature_names_out()
    )
    enc_te = pd.concat(
        [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
        , axis=1
    )
    # ori_test
    tmp_te = pd.DataFrame(
        enc.transform(ori_te[enc_cols]).toarray(),
        columns = enc.get_feature_names_out()
    )
    enc_ori_te = pd.concat(
        [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
        , axis=1
    )

    print(f'after: {enc_tr.shape} / {enc_te.shape}')
    enc_tr.head()

@reset_seeds
def preprocess_dataset():
    # 데이터 로드
    df_raw = __load_data()
    train, test = train_test_split(df_raw, test_size=0.2, stratify=df_raw['survived'])
    __preprocess_data(train, test)
    # __process_encoding(train, test)
    y = df_raw['survived']
    X = df_raw.drop(['survived'], axis=1)

    return train.drop(['survived'], axis=1), test.drop(['survived'], axis=1), train['survived'], test['survived']

Overwriting service/data.py


In [12]:
%%writefile service/model.py

from lightgbm import LGBMClassifier, plot_importance
from utils import reset_seeds

# 모델 생성 후 리턴
@reset_seeds
def get_model(hp:dict=None, model_nm:str=None):
    if not hp:
        hp = {}

    if not model_nm:
        return LGBMClassifier(**hp)
    elif model_nm == "LGBMClassifier":
        return LGBMClassifier(**hp)

Overwriting service/model.py


In [13]:
%%writefile service/process.py

import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from data import preprocess_dataset
from model import get_model
from utils import reset_seeds


@reset_seeds
def run_cross_validation(my_model, x_train, x_test, y_train, y_test, n_splits:int=3):
  # df_train -> feature
  df_train = pd.DataFrame(data=x_train)
  # df_train -> reatures & target
  df_train['label'] = y_train
  # df_train.shape

  # # 모델 정의
  # clf = svm.SVC(kernel='linear', C=1)

  # 교차 검증
  skf = StratifiedKFold(n_splits=5, shuffle=True)

  n_iter = 0
  accuracy_lst = []

  for train_index, valid_index in skf.split(df_train, df_train['label']):
    n_iter += 1
    # 학습용, 검증용 데이터 구성
    label_train = df_train['label'].iloc[train_index]
    label_valid = df_train['label'].iloc[valid_index]

    train_x, valid_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]
    # 학습
    my_model.fit(train_x, train_y)
    # 예측
    pred = my_model.predict(valid_x)
    # 평가
    accuracy = np.round(accuracy_score(valid_y, pred), 4)
    accuracy_lst.append(accuracy)
    print(f'{n_iter} 번째 Stratified Stratified K-Fold 정확도: {accuracy}, 학습데이터 label 분포: \n{label_train.value_counts()}, 검증데이터 label 분포: \n{label_valid.value_counts()}')

  # 최종 평가
  print('-'*50)
  print(f'교차 검증 정확도: {np.mean(accuracy_lst)} / 모델 평가: {my_model.score(x_test, y_test)}')
  return np.mean(accuracy_lst)



# def run_cross_validation(my_model, x_train, y_train, n_splits:int=5):
#     # 교차 검증
#     kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
#     n_iter = 0
#     accuracy_lst = []

#     for train_index, valid_index in kf.split(x_train):
#       n_iter += 1
#       # 학습용, 검증용 데이터 구성
#       train_x, valid_x = x_train.iloc[train_index], x_train.iloc[valid_index]
#       train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]
#       # 학습
#       my_model.fit(train_x, train_y)
#       # 예측
#       pred = my_model.predict(valid_x)
#       # 평가
#       accuracy = np.round(accuracy_score(valid_y, pred), 4)
#       accuracy_lst.append(accuracy)
#       print(f'{n_iter} 번째 K-fold 정확도: {accuracy}, 학습데이터 크기: {train_x.shape}, 검증데이터 크기: {valid_x.shape}')

#     return np.mean(accuracy_lst)

@reset_seeds
def main():
    # 데이터 로드 및 분류
    X_train, X_test, y_train, y_test = preprocess_dataset()
    # 모델 생성
    my_model = get_model()
    # 모델 학습
    accuracy = run_cross_validation(my_model, X_train, X_test, y_train, y_test)

    # 테스트 데이터 예측
    return my_model.score(X_test, y_test)

if __name__=="__main__":
  result = main()
  print(f"테스트 스코어는 {result}")

Overwriting service/process.py


In [14]:
!python3 service/process.py

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

before: (732, 12) / (184, 12)
[LightGBM] [Info] Number of positive: 220, number of negative: 365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 585, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376068 -> initscore=-0.506270
[LightGBM] [Info] Start training from score -0.506270
1 번째 Stratified Stratified K-Fold 정확도: 0.898, 학습데이터 label 분포: 
label
0    365
1    220
Name: count, dtype: int64, 검증데이터 label 분포: 
label
0    91
1    56
Name: count, dtype: int64
[LightGBM] [Info] Number of positive: 221, nu