# 1. Data Preprocessing

In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# 1. Load the data and preprocess it
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Replace missing values with the placeholder token 'WT'
train.fillna('WT', inplace=True)
test.fillna('WT', inplace=True)

# Every column except the identifier and the target is used as a mutation column
mutation_columns = [col for col in train.columns if col not in ['ID', 'SUBCLASS']]


def _join_mutations(frame, columns):
    """Return one space-separated string per row, built from ``columns``.

    The values are stringified column-wise in a single pass and joined over
    the resulting NumPy rows, which avoids the per-row Series overhead of
    ``DataFrame.apply(..., axis=1)``. The result keeps ``frame``'s index so
    it can be assigned straight back as a new column.
    """
    values = frame[columns].astype(str).to_numpy()
    return pd.Series([' '.join(row) for row in values], index=frame.index)


# Collapse each sample's mutation columns into a single text "document"
train['mutations'] = _join_mutations(train, mutation_columns)
test['mutations'] = _join_mutations(test, mutation_columns)

# Encode the target labels as consecutive integers
label_encoder = LabelEncoder()
train['SUBCLASS'] = label_encoder.fit_transform(train['SUBCLASS'])
num_classes = len(label_encoder.classes_)

# Features and target
X = train['mutations']
y = train['SUBCLASS'].astype(int)  # make sure the target is integer typed

# Split the raw text BEFORE vectorizing so that the TF-IDF vocabulary and IDF
# weights are learned from the training portion only; fitting on all rows
# would leak validation statistics into the features.
X_train_text, X_valid_text, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization (fit on the training split, reuse everywhere else)
tfidf = TfidfVectorizer(max_features=10000)
X_train = tfidf.fit_transform(X_train_text)
X_valid = tfidf.transform(X_valid_text)
X_test_tfidf = tfidf.transform(test['mutations'])
# Full training matrix in the same feature space; the name is kept in case
# other cells still refer to it.
X_tfidf = tfidf.transform(X)

# Convert the sparse matrices to dense arrays before handing them to AutoML
X_train_dense = X_train.toarray()
X_valid_dense = X_valid.toarray()
X_test_dense = X_test_tfidf.toarray()

# 2. Import MLJAR AutoML
from supervised.automl import AutoML

# 3. Create the AutoML object (Random Forest included among the algorithms)
automl = AutoML(
    mode='Compete',
    ml_task='multiclass_classification',  # state the task type explicitly
    total_time_limit=3600,
    eval_metric='logloss',
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
    validation_strategy={
        "validation_type": "split",
        "train_ratio": 0.8,
        "shuffle": True,
        "stratify": True
    }
)

# 4. Train the AutoML models
automl.fit(X_train_dense, y_train)

# 5. Predict on the held-out validation data
y_pred = automl.predict(X_valid_dense)

# 6. Evaluate
print("MLJAR AutoML Classification Report with Random Forest:")
# Pass the full label set explicitly: without it, a class that is missing from
# both y_valid and y_pred makes the number of labels differ from target_names
# and classification_report raises ValueError. zero_division=0 reports 0 for
# classes that have no predictions.
print(classification_report(
    y_valid,
    y_pred,
    labels=list(range(num_classes)),
    target_names=label_encoder.classes_,
    zero_division=0,
))

# 7. Predict on the test data and map the integer codes back to class names
y_test_pred = automl.predict(X_test_dense)
test['SUBCLASS'] = label_encoder.inverse_transform(y_test_pred.astype(int))

# 8. Write the submission file
submission = test[['ID', 'SUBCLASS']]
submission.to_csv('submission_mljar.csv', index=False)
print("MLJAR AutoML 제출 파일이 생성되었습니다: submission_mljar.csv")


  from .autonotebook import tqdm as notebook_tqdm
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Disable stacking for split validation
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 4 models
1_Default_LightGBM logloss 2.498436 trained in 193.38 seconds
2_Default_Xgboost logloss 2.319693 trained in 177.9 seconds
3_Default_CatBoost logloss 2.276217 trained in 327.24 seconds
4_Default_RandomForest logloss 2.535947 trained in 52.97 seconds
* Step not_so_random will try to check up to 36 models
14_LightGBM logloss 2.457309 trained in 54.65 seconds
5_Xgboost logloss 2.459094 trained in 288.46 seconds
23_CatBoost logloss 2.269