# 암환자 유전체 데이터 기반 암종 분류 AI 모델 개발


- '2024 생명연구자원 AI활용 경진대회'는 바이오 데이터를 기반으로 한 AI 기술의 문제 해결 능력을 탐구하는 것을 목표로 합니다. <br>이 대회는 바이오 분야에서 AI 활용의 저변을 확대하고, 복잡한 바이오 데이터를 효율적으로 분석 및 해석할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br><br>
- 본 대회의 구체적인 과제는 암환자 유전체 데이터의 변이 정보를 활용하여 암종을 분류하는 AI 모델을 개발하는 것입니다. <br>참가자들은 제공된 학습 데이터셋(암환자 유전체 변이 정보)을 사용하여 특정 변이 정보를 바탕으로 암종을 정확하게 분류할 수 있는 AI 알고리즘을 개발해야 합니다. <br><br>
- 이 대회의 궁극적인 목적은 바이오 데이터의 활용도를 높이고, 바이오 분야에서 AI 기술의 적용 가능성을 극대화하며, 인공지능 기술이 실제 바이오 의료 문제 해결에 어떻게 기여할 수 있는지 탐구하는 것입니다.

# Import library

In [134]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# import xgboost as xgb

In [135]:
# 평가용 함수
def print_score(clf, x_train,y_train,x_test,y_test, train=True):
    if train:
        pred = clf.predict(x_train)
        clf_report = pd.DataFrame(classification_report(y_train,pred,output_dict=True,zero_division=0))
        print("Train Result : \n ==============================================")
        print(f"Accuracy score : {accuracy_score(y_train,pred)*100:.2f}%")
        print("---------------------------------------------------------------")
        print(f"Calssfication Report:\n{clf_report}")
        print('---------------------------------------------------------------')
        print(f'Confusion Matrix: \n{confusion_matrix(y_train,pred)}\n')
    else:
        pred = clf.predict(x_test)
        clf_report = pd.DataFrame(classification_report(y_test,pred,output_dict=True,zero_division=0))
        print("Test Result : \n ==============================================")
        print(f"Accuracy score : {accuracy_score(y_test,pred)*100:.2f}%")
        print("---------------------------------------------------------------")
        print(f"Calssfication Report:\n{clf_report}")
        print('---------------------------------------------------------------')
        print(f'Confusion Matrix: \n{confusion_matrix(y_test,pred)}\n')

# Load Data

In [136]:
# mainPath = 'C:/pandas/암환자 유전체 데이터 기반 암종 분류 AI 모델_data'
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [137]:
train.head()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


# Data Preprocessing

In [138]:
# SUBCLASS 가 범주형이기 때문에 LabelEncoder 사용
le_subclass = LabelEncoder()
train['SUBCLASS'] = le_subclass.fit_transform(train['SUBCLASS'])

# 변환된 레이블 확인
# for i, label in enumerate(le_subclass.classes_):
#     print(f"원래 레이블: {label}, 변환된 숫자: {i}")

In [139]:
## x 의 경우도 범주형으로 구성되어 있어, 알맞은 인코딩 필요
X = train.drop(columns=['SUBCLASS', 'ID'])
y_subclass = train['SUBCLASS']

categorical_columns = X.select_dtypes(include=['object', 'category']).columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_encoded = X.copy()
X_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

In [140]:
# X_encoded  범주형 데이터는 순서가 있는 라벨링으로 변환한 학습용 데이터
print(X_encoded.shape, y_subclass.shape)

(6201, 4384) (6201,)


In [141]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print(Counter(y_subclass))
x_train,x_test,y_train,y_test = train_test_split(X_encoded,y_subclass, train_size=0.99, stratify=y_subclass,random_state=42)
rds = SMOTE(random_state=42)
X_encoded_resample, y_subclass_resample = rds.fit_resample(X_encoded, y_subclass)
print(Counter(y_subclass_resample))

Counter({2: 786, 8: 515, 6: 461, 21: 379, 9: 334, 23: 324, 20: 276, 18: 266, 15: 253, 11: 229, 7: 223, 4: 223, 19: 198, 25: 198, 13: 184, 14: 178, 12: 158, 10: 158, 3: 155, 17: 147, 22: 124, 16: 120, 1: 104, 24: 98, 0: 72, 5: 38})
Counter({8: 786, 19: 786, 20: 786, 9: 786, 6: 786, 21: 786, 2: 786, 23: 786, 12: 786, 7: 786, 16: 786, 15: 786, 18: 786, 25: 786, 10: 786, 4: 786, 0: 786, 11: 786, 14: 786, 13: 786, 3: 786, 17: 786, 24: 786, 1: 786, 22: 786, 5: 786})


# Model Define and Train

In [142]:
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline

# x_train,x_test,y_train,y_test = train_test_split(X_encoded,y_subclass,stratify=y_subclass,random_state=42)

# pipeline = Pipeline([
#     ('std_scaler',StandardScaler()), 
#     ('logistic',LogisticRegression(max_iter=10000,C=0.1,penalty='l1',solver = "liblinear" ))
#     ])
# pipeline.fit(x_train, y_train)

In [143]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier() # max_depth=80, criterion='entropy'
# tree.fit(X_encoded_resample, y_subclass_resample)
tree.fit(X_encoded, y_subclass)

In [144]:
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree
# plt.figure(figsize=(10,10))
# plot_tree(tree)
# plt.show()

In [145]:
print_score(tree,x_train,y_train,x_test,y_test,train=True)

Train Result : 
Accuracy score : 91.77%
---------------------------------------------------------------
Calssfication Report:
              0      1           2           3      4     5           6  \
precision   1.0    1.0    1.000000    1.000000    1.0   1.0    0.725240   
recall      1.0    1.0    0.994859    0.980392    1.0   1.0    0.995614   
f1-score    1.0    1.0    0.997423    0.990099    1.0   1.0    0.839187   
support    71.0  103.0  778.000000  153.000000  221.0  38.0  456.000000   

               7           8           9  ...          19     20          21  \
precision    1.0    0.652455    0.983607  ...    1.000000    1.0    1.000000   
recall       1.0    0.990196    0.181269  ...    0.984694    1.0    0.994667   
f1-score     1.0    0.786604    0.306122  ...    0.992288    1.0    0.997326   
support    221.0  510.000000  331.000000  ...  196.000000  273.0  375.000000   

              22          23         24     25  accuracy    macro avg  \
precision    1.0    0.99

In [146]:
print_score(tree,x_train,y_train,x_test,y_test,train=False)

Test Result : 
Accuracy score : 90.48%
---------------------------------------------------------------
Calssfication Report:
             0    1         2    3    4         6    7         8    9   10  \
precision  1.0  1.0  1.000000  1.0  1.0  0.833333  1.0  0.625000  0.0  1.0   
recall     1.0  1.0  0.875000  1.0  1.0  1.000000  1.0  1.000000  0.0  1.0   
f1-score   1.0  1.0  0.933333  1.0  1.0  0.909091  1.0  0.769231  0.0  1.0   
support    1.0  1.0  8.000000  2.0  2.0  5.000000  2.0  5.000000  3.0  2.0   

           ...   19   20   21   22   23        24   25  accuracy  macro avg  \
precision  ...  1.0  1.0  1.0  1.0  1.0  0.333333  1.0  0.904762   0.871667   
recall     ...  1.0  1.0  1.0  1.0  1.0  1.000000  1.0  0.904762   0.915000   
f1-score   ...  1.0  1.0  1.0  1.0  1.0  0.500000  1.0  0.904762   0.884466   
support    ...  2.0  3.0  4.0  1.0  3.0  1.000000  2.0  0.904762  63.000000   

           weighted avg  
precision      0.867063  
recall         0.904762  
f1-score  

In [147]:
# print_score(pipeline,x_train,y_train,x_test,y_test,train=True)
# print_score(pipeline,x_train,y_train,x_test,y_test,train=False)

In [148]:
#---------------------------------- 모델개발이 완료되면 정답 생성 --------------------

In [149]:
model = tree

In [150]:
# model.fit(X_encoded, y_subclass)

In [151]:
# test.head()

# Inference

In [152]:
test_X = test.drop(columns=['ID'])
X_encoded = test_X.copy()
X_encoded[categorical_columns] = ordinal_encoder.transform(test_X[categorical_columns])

In [153]:
# 테스트 데이터로 예측
predictions = model.predict(X_encoded)

In [154]:
predictions

array([21,  1, 21, ..., 25, 10,  6])

In [155]:
original_labels = le_subclass.inverse_transform(predictions)
original_labels

array(['STES', 'BLCA', 'STES', ..., 'UCEC', 'LAML', 'GBMLGG'],
      dtype=object)

# Submisson

In [156]:
submisson = pd.read_csv("./sample_submission.csv")
submisson.head()

Unnamed: 0,ID,SUBCLASS
0,TEST_0000,LGG
1,TEST_0001,LGG
2,TEST_0002,LGG
3,TEST_0003,LGG
4,TEST_0004,LGG


In [157]:
submisson["SUBCLASS"] = original_labels

In [158]:
submisson.to_csv('./baseline_submission.csv', encoding='UTF-8-sig', index=False)

In [159]:
print('end')

end
