### Import libraries

In [1]:
import os

import lightgbm as lgb
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

### Load Data

In [2]:
# Read the training and test tables from the local data directory.
train_path, test_path = './data/train.csv', './data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Show the loaded training frame as this cell's output.
train

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,TRAIN_6196,LUAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6197,TRAIN_6197,LGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6198,TRAIN_6198,COAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,T181S,WT
6199,TRAIN_6199,TGCT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


### Data Preprocessing

In [4]:
# SUBCLASS is categorical, so LabelEncoder maps every class name to an integer code.
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading `train` would fit the encoder on the already-encoded integers.
le_subclass = LabelEncoder().fit(train['SUBCLASS'])
train['SUBCLASS'] = le_subclass.transform(train['SUBCLASS'])

# Print the mapping from each original label to its encoded number.
for code, original_name in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {original_name}, 변환된 숫자: {code}")

원래 레이블: ACC, 변환된 숫자: 0
원래 레이블: BLCA, 변환된 숫자: 1
원래 레이블: BRCA, 변환된 숫자: 2
원래 레이블: CESC, 변환된 숫자: 3
원래 레이블: COAD, 변환된 숫자: 4
원래 레이블: DLBC, 변환된 숫자: 5
원래 레이블: GBMLGG, 변환된 숫자: 6
원래 레이블: HNSC, 변환된 숫자: 7
원래 레이블: KIPAN, 변환된 숫자: 8
원래 레이블: KIRC, 변환된 숫자: 9
원래 레이블: LAML, 변환된 숫자: 10
원래 레이블: LGG, 변환된 숫자: 11
원래 레이블: LIHC, 변환된 숫자: 12
원래 레이블: LUAD, 변환된 숫자: 13
원래 레이블: LUSC, 변환된 숫자: 14
원래 레이블: OV, 변환된 숫자: 15
원래 레이블: PAAD, 변환된 숫자: 16
원래 레이블: PCPG, 변환된 숫자: 17
원래 레이블: PRAD, 변환된 숫자: 18
원래 레이블: SARC, 변환된 숫자: 19
원래 레이블: SKCM, 변환된 숫자: 20
원래 레이블: STES, 변환된 숫자: 21
원래 레이블: TGCT, 변환된 숫자: 22
원래 레이블: THCA, 변환된 숫자: 23
원래 레이블: THYM, 변환된 숫자: 24
원래 레이블: UCEC, 변환된 숫자: 25


In [5]:
# The feature columns are categorical as well, so they also need a suitable encoding.
y_subclass = train['SUBCLASS']
X = train.drop(columns=['SUBCLASS', 'ID'])

# Only the object/category-typed columns are passed through the encoder.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Categories not seen during fit are mapped to -1 when later data is transformed.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(X[categorical_columns])

# Encode into a copy so the raw feature frame X stays untouched.
X_encoded = X.copy()
X_encoded[categorical_columns] = ordinal_encoder.transform(X[categorical_columns])

### Model Definition and Training

In [10]:
# Hyperparameters of the baseline LightGBM classifier, gathered in one place.
lgbm_params = {
    'n_estimators': 100,         # number of boosting rounds
    'learning_rate': 0.005,
    'max_depth': -1,             # non-positive value: tree depth is not limited
    'random_state': 42,          # fixed seed so repeated runs match
    'class_weight': 'balanced',  # weight classes inversely to their frequency
}
model = lgb.LGBMClassifier(**lgbm_params)

In [11]:
# Discard every row that still has a missing value after encoding, then keep
# only the labels of the surviving rows so features and target stay aligned.
X_encoded = X_encoded.dropna(axis=0, how='any')
y_subclass = y_subclass.loc[X_encoded.index]

In [12]:
# Train the classifier on the encoded features and the integer class labels.
model.fit(X=X_encoded, y=y_subclass)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75427
[LightGBM] [Info] Number of data points in the train set: 6201, number of used features: 3205
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightGBM] [Info] Start training from score -3.258097
[LightG

In [13]:
# Inspect the encoded target labels that were used for training.
y_subclass

0        8
1       19
2       20
3        9
4        6
        ..
6196    13
6197    11
6198     4
6199    22
6200    20
Name: SUBCLASS, Length: 6201, dtype: int64

In [14]:
# Inspect the ordinal-encoded training features.
X_encoded

Unnamed: 0,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,ABCA5,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
1,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
2,104.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
3,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
4,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
6197,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0
6198,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,30.0,40.0
6199,137.0,42.0,24.0,0.0,47.0,141.0,114.0,124.0,187.0,102.0,...,161.0,67.0,32.0,26.0,121.0,60.0,51.0,51.0,34.0,40.0


### Inference

In [15]:
# Build the test features with exactly the training columns, in training order.
# LightGBM matches features by position (unless feature validation is enabled),
# so a reordered or extended test file would otherwise be scored against the
# wrong columns or rejected at predict time. A column missing from the test
# file raises a KeyError here.
test_X = test.drop(columns=['ID'])[X.columns]

# NOTE: this rebinds X_encoded from the training features to the test features;
# the prediction cell reads this name.
X_encoded = test_X.copy()

# Reuse the encoder fitted on the training data; categories it has not seen
# are encoded as -1.
X_encoded[categorical_columns] = ordinal_encoder.transform(test_X[categorical_columns])

In [16]:
# Predict an encoded class id for every row of the encoded test features.
predictions = model.predict(X=X_encoded)

In [17]:
# Map the predicted integer ids back to the original SUBCLASS names.
original_labels = le_subclass.inverse_transform(y=predictions)


In [18]:
# Inspect the decoded class-name predictions.
original_labels

array(['STES', 'UCEC', 'THCA', ..., 'UCEC', 'LAML', 'LGG'], dtype=object)

In [20]:
### Submission

In [22]:
# Load the submission template. The variable name is kept as spelled because
# the following cells refer to it.
sample_submission_path = "./data/sample_submission.csv"
submisson = pd.read_csv(sample_submission_path)


In [23]:
# Write the decoded predictions into the SUBCLASS column of the template.
# Values are assigned by position, so this assumes the template rows are in the
# same order as the test file — TODO confirm against the ID column.
submisson["SUBCLASS"] = original_labels


In [26]:
# Create the output directory first: to_csv does not create missing parent
# directories and fails on a fresh checkout where ./result does not exist yet.
os.makedirs('./result', exist_ok=True)

# Save without the index column; 'UTF-8-sig' prepends a byte-order mark.
submisson.to_csv('./result/lgb_baseline_submission.csv', encoding='UTF-8-sig', index=False)