In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
import warnings
warnings.filterwarnings(action = 'ignore')

# Load the training set, the test set and the sample submission from CSV.
# The three files are read in this order and bound to the names below.
train_org, test_org, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

# Categorical (object-dtype) feature columns. The target column 'Status' is
# removed first because it is encoded separately further down.
object_columns = (
    train_org
    .drop(columns='Status')
    .select_dtypes(include=['object'])
    .columns
)

# A single encoder instance is re-fitted for every column, so once the loop
# has finished it only remembers the classes of the last column processed.
label_encoder = LabelEncoder()

# For each categorical column, learn the label -> integer mapping from the
# training data and apply that same mapping to the test data.
# NOTE(review): transform() raises on labels that were not present in the
# training column, so this assumes test has no unseen categories.
for col in object_columns:
    train_org[col] = label_encoder.fit_transform(train_org[col])
    test_org[col] = label_encoder.transform(test_org[col])

# Names of the numeric input columns, declared once so that train and test
# are guaranteed to be sliced with the same list.
numeric_columns = [
    'N_Days', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
]

# Split the training frame into numeric inputs, categorical inputs and target.
train_X_num = train_org[numeric_columns]
train_X_cat = train_org[object_columns]
train_y = train_org['Status']

# Same input split for the test frame (it has no target column to extract).
test_X_num = test_org[numeric_columns]
test_X_cat = test_org[object_columns]

# Standardise the numeric inputs. The scaler is fitted on the training data
# only and then reused, unchanged, for the test data.
scaler = StandardScaler()

# The scaler returns plain NumPy arrays, so wrap each result back into a
# DataFrame that carries the original row index and column names.
train_X_scaled = pd.DataFrame(
    data=scaler.fit_transform(train_X_num),
    index=train_X_num.index,
    columns=train_X_num.columns,
)
test_X_scaled = pd.DataFrame(
    data=scaler.transform(test_X_num),
    index=test_X_num.index,
    columns=test_X_num.columns,
)

# Final input matrices: scaled numeric columns followed by the label-encoded
# categorical columns, joined side by side on the row index.
train_X = pd.concat([train_X_scaled, train_X_cat], axis=1)
test_X = pd.concat([test_X_scaled, test_X_cat], axis=1)

# Encode the target 'Status' as integers before splitting, so the split hands
# back the final y_train / y_val directly instead of string labels that then
# have to be rebuilt by index lookup.
status_mapping = {'C': 0, 'CL': 1, 'D': 2}
status_encoded = train_org['Status'].map(status_mapping)

# Series.map() silently turns every value missing from the mapping into NaN.
# That also happens if this statement is re-run on an already-encoded column.
# Fail loudly here rather than letting NaN labels reach the model.
if status_encoded.isna().any():
    unexpected = train_org.loc[status_encoded.isna(), 'Status'].unique().tolist()
    raise ValueError(
        f"'Status' contains values that are not in status_mapping: {unexpected}"
    )
train_org['Status'] = status_encoded

# Hold out 30% of the rows for validation. The split is unstratified and uses
# the same random_state as before, so the selected rows do not depend on the
# label values and are unchanged by encoding first.
# NOTE(review): class 1 ('CL') is rare (82 of 2372 validation rows in the
# recorded report); consider stratify=train_org['Status'] if that matters.
X_train, X_val, y_train, y_val = train_test_split(
    train_X, train_org['Status'], test_size=0.3, random_state=1
)

# Baseline multiclass model: XGBoost with its default hyper-parameters,
# trained on the training split.
model_xgb = XGBClassifier()
model_xgb.fit(X=X_train, y=y_train)

# Predict class labels for the validation split and print per-class
# precision / recall / F1 for them.
pred = model_xgb.predict(X_val)
validation_report = classification_report(y_true=y_val, y_pred=pred)
print(validation_report)

# One-hot encode the integer 'Status' target. The encoder expects 2-D input,
# so each label vector is first reshaped into a single column.
encoder = OneHotEncoder()
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_val_column = y_val.to_numpy().reshape(-1, 1)
y_train_encoded = encoder.fit_transform(y_train_column).toarray()
y_val_encoded = encoder.transform(y_val_column).toarray()

# One-vs-Rest wrapper around XGBoost, fitted against the one-hot target.
# NOTE(review): the visible cells never use ovr_model_xgb or y_val_encoded
# afterwards -- the submission is built from model_xgb instead.
ovr_model_xgb = OneVsRestClassifier(estimator=XGBClassifier())
ovr_model_xgb.fit(X_train, y_train_encoded)

# Class-probability predictions for the test set, taken from the plain
# multiclass model (model_xgb), not from the One-vs-Rest wrapper.
test_predictions_probs = model_xgb.predict_proba(test_X)

# Build the submission frame with columns 'id', 'Status_C', 'Status_CL',
# 'Status_D'. The probability columns are assumed to follow the integer class
# order 0, 1, 2, which status_mapping defines as C, CL, D.
column_names = ['Status_C', 'Status_CL', 'Status_D']
submission = pd.concat(
    [
        sample_submission['id'],
        pd.DataFrame(test_predictions_probs, columns=column_names),
    ],
    axis=1,
)

# Write the result to submission_ver3.csv without the row index.
submission.to_csv('../data/submission_ver3.csv', index=False)


              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1479
           1       0.56      0.17      0.26        82
           2       0.76      0.73      0.74       811

    accuracy                           0.81      2372
   macro avg       0.72      0.60      0.62      2372
weighted avg       0.80      0.81      0.80      2372



In [2]:
from supervised import AutoML

In [3]:
# Initialise the AutoML object in "Compete" mode; all other settings are left
# at their defaults.
automl = AutoML(mode="Compete")

In [4]:
# Train AutoML on the training split only (X_train / y_train, i.e. the 70%
# produced by train_test_split); the validation split is not passed in.
automl.fit(X_train, y_train)

AutoML directory: AutoML_2
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree logloss 0.480908 trained in 0.25 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree logloss 0.532945 trained in 1.53 seconds
2_DecisionTree logloss 0.538758 trained in 1.41 seconds
3_DecisionTree logloss 0.5

In [5]:
# Run the fitted AutoML model on the test data.
# NOTE(review): predict() presumably returns class labels rather than
# per-class probabilities -- confirm; the XGBoost submission built earlier
# stores probabilities (Status_C / Status_CL / Status_D).
predictions = automl.predict(test_X)

In [11]:
# Save the AutoML predictions as a submission file.
#
# The previous version appended a 'Status' column to `submission`, which
# still held the XGBoost probabilities. The written file therefore contained
# XGBoost's Status_C / Status_CL / Status_D plus a stray column of AutoML
# class labels, not an AutoML submission. Build a separate frame from
# AutoML's class probabilities in the same layout as the XGBoost submission
# ('id', 'Status_C', 'Status_CL', 'Status_D') and leave `submission` as is.
#
# AutoML was fitted on the integer labels 0, 1, 2 (= C, CL, D), so the
# probability columns are assumed to come back in that class order.
automl_probs = automl.predict_proba(test_X)
automl_submission = pd.concat(
    [
        sample_submission['id'],
        pd.DataFrame(automl_probs, columns=['Status_C', 'Status_CL', 'Status_D']),
    ],
    axis=1,
)
automl_submission.to_csv('../data/automl_submission1.csv', index=False)