In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the training and test tables from disk.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Replace every missing cell with the placeholder string 'WT' (in place).
for _frame in (train, test):
    _frame.fillna('WT', inplace=True)

# Every column other than the identifier and the target is a mutation column.
_non_feature_columns = ('ID', 'SUBCLASS')
mutation_columns = [
    name for name in train.columns if name not in _non_feature_columns
]


def _join_row_values(row):
    """Return the values of one row as a single space-separated string."""
    return ' '.join(row.astype(str))


# Collapse the mutation columns of each sample into one text "document".
train['mutations'] = train[mutation_columns].apply(_join_row_values, axis=1)
test['mutations'] = test[mutation_columns].apply(_join_row_values, axis=1)

# Encode the target labels as integer codes; the fitted encoder is kept so
# predictions can be mapped back to the original label values later.
label_encoder = LabelEncoder()
label_encoder.fit(train['SUBCLASS'])
train['SUBCLASS'] = label_encoder.transform(train['SUBCLASS'])
num_classes = len(label_encoder.classes_)

# Features (joined mutation text) and target (encoded label).
X, y = train['mutations'], train['SUBCLASS']

# TF-IDF turns the joined mutation text into a sparse numeric matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# The vocabulary is learned from the training text only and then reused
# unchanged for the test text.
X_tfidf = tfidf.fit_transform(X)
X_test_tfidf = tfidf.transform(test['mutations'])

# Hold out 20% of the training rows for validation, stratified on the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# H2O imports (kept next to the AutoML section of this notebook cell).
import h2o
from h2o.automl import H2OAutoML

# Start a local H2O cluster, or connect to one that is already running.
h2o.init()

# Name of the response column inside the H2O frames.
y_col = 'SUBCLASS'

# Upload the densified TF-IDF matrices to H2O and attach the encoded labels.
htrain = h2o.H2OFrame(pd.DataFrame(X_train.toarray()))
htrain[y_col] = h2o.H2OFrame(y_train.values.reshape(-1, 1))

hvalid = h2o.H2OFrame(pd.DataFrame(X_valid.toarray()))
hvalid[y_col] = h2o.H2OFrame(y_valid.values.reshape(-1, 1))

# The labels are integer codes produced by LabelEncoder. H2O decides between
# regression and classification from the response column type, so a numeric
# response would be trained as a regression target. Convert it to a factor
# (categorical) so AutoML builds multiclass classifiers and `balance_classes`
# actually applies.
htrain[y_col] = htrain[y_col].asfactor()
hvalid[y_col] = hvalid[y_col].asfactor()

# Predictors: every column except the response, which was appended last.
x_cols = htrain.columns[:-1]

# Configure AutoML.
aml = H2OAutoML(
    max_runtime_secs=3600,  # overall time budget in seconds
    seed=42,
    balance_classes=True,
    nfolds=5
)

# NOTE(review): the H2O AutoML documentation states that `validation_frame`
# is ignored unless nfolds == 0; with nfolds=5 the models are scored by
# cross-validation and `hvalid` serves only as the hold-out set evaluated
# below. The argument is kept so the call stays as it was.
aml.train(x=x_cols, y=y_col, training_frame=htrain, validation_frame=hvalid)

# Show the AutoML leaderboard.
lb = aml.leaderboard
print(lb)

# Predict on the hold-out validation frame with the best model.
preds = aml.leader.predict(hvalid)

# Evaluate the hold-out predictions.
from sklearn.metrics import classification_report

# With a factor response, the 'predict' column holds the predicted class code.
y_pred = preds.as_data_frame()['predict'].astype(int)
print("H2O AutoML Classification Report:")
# Pass the full list of class codes explicitly: without `labels`, the report
# raises ValueError when a class is missing from both y_valid and y_pred,
# because `target_names` would no longer match the inferred label set.
print(classification_report(
    y_valid,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))

# Predict on the test set and map class codes back to the original labels.
htest = h2o.H2OFrame(pd.DataFrame(X_test_tfidf.toarray()))
test_preds = aml.leader.predict(htest)
test_pred_labels = test_preds.as_data_frame()['predict'].astype(int)
test['SUBCLASS'] = label_encoder.inverse_transform(test_pred_labels)

# Write the submission file.
submission = test[['ID', 'SUBCLASS']]
submission.to_csv('submission_h2o.csv', index=False)
print("H2O AutoML 제출 파일이 생성되었습니다: submission_h2o.csv")

# Shut down the H2O cluster. `h2o.cluster().shutdown()` replaces the
# deprecated module-level `h2o.shutdown()`.
h2o.cluster().shutdown(prompt=False)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu320.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu320.04, mixed mode, sharing)
  Starting server from /opt/conda/envs/automl2/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpm3hqwro4
  JVM stdout: /tmp/tmpm3hqwro4/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpm3hqwro4/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 16 days
H2O_cluster_name:,H2O_from_python_unknownUser_y0tapy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| 100%| 100%