In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import re


In [2]:
# Read the training table from CSV into a DataFrame.
file_path = '../train_data/train_sequence.csv'
df = pd.read_csv(file_path)

In [8]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,SUBCLASS,BRCA1,HMGB3,PLXNB2,NOTCH2,RYR2,BTG1,SMC1A,PIM1,FBXW7,...,DCC,PEX6,B2M,MYLK,BTG2,CACNA1B,PABPC1,FGFR3,CLIP2,CDH1
0,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,SARC,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,SKCM,WT,WT,WT,MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYHN...,WT,WT,WT,WT,WT,...,WT,MALAVLRVLEPFPTETPPLAVLLPPGGPWPAAELGLVLALRPAGES...,WT,MGDVKLVASSHISKTSLSVDPSRVDSMPLTEAPAFILPPRNLCIKE...,WT,WT,WT,WT,WT,WT
3,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [12]:
# For every sample (row), collect the values of all columns except the
# SUBCLASS label that differ from the 'WT' placeholder.
# The label column is removed by name rather than by position, so the result
# does not depend on SUBCLASS being the first column of the CSV (an extra
# leading column would otherwise make SUBCLASS look like a sequence value).
df_filtered_lists = (
    df.drop(columns=['SUBCLASS'])
      .apply(lambda row: row[row != 'WT'].tolist(), axis=1)
)

# Wrap the per-row lists in a one-column DataFrame. to_frame() names the
# column explicitly and keeps df's index.
df_sequence = df_filtered_lists.to_frame(name='Filtered_List')

# Preview the result (last expression -> rich notebook display).
df_sequence.head()


                                       Filtered_List
0  [MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQ...
1  [MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTD...
2  [MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYH...
3  [MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFE...
4  [MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLML...


In [3]:
# Token vocabulary: the 22 single-letter sequence symbols, followed by the
# special tokens.
ALL_AAS = 'ACDEFGHIKLMNPQRSTUVWXY'
ADDITIONAL_TOKENS = ['<OTHER>', '<START>', '<END>', '<PAD>', '<SEP>']

# Number of tokens added to each sequence (presumably <START> and <END>).
ADDED_TOKENS_PER_SEQ = 2

# Sequence symbols take ids 0 .. n_aas - 1 in the order they appear in ALL_AAS.
n_aas = len(ALL_AAS)
aa_to_token_index = dict(zip(ALL_AAS, range(n_aas)))

# Special tokens continue the numbering right after the sequence symbols.
additional_token_to_index = dict(
    zip(ADDITIONAL_TOKENS, range(n_aas, n_aas + len(ADDITIONAL_TOKENS)))
)

# Combined forward mapping (symbols first, then special tokens) and its inverse.
token_to_index = dict(aa_to_token_index)
token_to_index.update(additional_token_to_index)
index_to_token = dict(zip(token_to_index.values(), token_to_index.keys()))
n_tokens = len(token_to_index)

def tokenize_seq(seq):
    """
    Turn a collection of sequence strings into one flat list of token ids.

    Entries equal to 'WT' are ignored. Each remaining entry is mapped
    character by character through ``aa_to_token_index``; characters missing
    from that mapping become the <OTHER> id. The encoded entries are joined
    with a single <SEP> id between neighbours, and the whole list is wrapped
    as <START> ... <END>. If nothing is left after skipping 'WT', the result
    is just [<START>, <END>].

    Raises TypeError (from parse_seq) when an entry is neither str nor bytes.
    """
    start_id = additional_token_to_index['<START>']
    end_id = additional_token_to_index['<END>']
    sep_id = additional_token_to_index['<SEP>']
    unknown_id = additional_token_to_index['<OTHER>']

    # Encode every non-'WT' entry into its own list of ids.
    encoded_parts = [
        [aa_to_token_index.get(symbol, unknown_id) for symbol in parse_seq(entry)]
        for entry in seq
        if entry != 'WT'
    ]

    # Stitch the parts together, putting <SEP> only between consecutive parts.
    tokens = [start_id]
    for position, encoded in enumerate(encoded_parts):
        if position:
            tokens.append(sep_id)
        tokens.extend(encoded)
    tokens.append(end_id)

    return tokens

def parse_seq(seq):
    """
    Return the sequence as text.

    A ``str`` is returned unchanged, ``bytes`` are decoded as UTF-8, and any
    other type raises ``TypeError``.
    """
    if isinstance(seq, bytes):
        return seq.decode('utf8')
    if not isinstance(seq, str):
        raise TypeError('Unexpected sequence type: %s' % type(seq))
    return seq


In [4]:
# Usage example: two 'WT' placeholders mixed with two short sequences.
# The 'WT' entries are skipped and the two sequences are joined by <SEP>.
example_seq = ['WT', 'ACDEFG', 'WT', 'GHIKLM']
tokenized_result = tokenize_seq(example_seq)
print(tokenized_result)


[23, 0, 1, 2, 3, 4, 5, 26, 5, 6, 7, 8, 9, 10, 24]


In [34]:
# Largest token id in the example output (26 is the <SEP> id in token_to_index).
max(tokenized_result)

26

In [37]:
# Apply tokenize_seq to every entry of the Filtered_List column and store the
# resulting token-id lists in a new Tokenized_Sequence column.
df_sequence['Tokenized_Sequence'] = df_sequence['Filtered_List'].apply(tokenize_seq)

In [38]:
# Display the frame with the newly added Tokenized_Sequence column.
df_sequence

Unnamed: 0,Filtered_List,Tokenized_Sequence
0,[MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQ...,"[23, 10, 4, 8, 8, 9, 8, 13, 8, 7, 15, 3, 3, 13..."
1,[MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTD...,"[23, 10, 16, 0, 3, 12, 10, 15, 3, 15, 8, 9, 11..."
2,[MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYH...,"[23, 10, 12, 0, 9, 14, 12, 0, 9, 9, 19, 0, 9, ..."
3,[MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFE...,"[23, 10, 14, 12, 15, 5, 16, 0, 5, 0, 0, 9, 9, ..."
4,[MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLML...,"[23, 10, 3, 3, 12, 13, 15, 2, 12, 15, 18, 3, 1..."
...,...,...
6196,[MADGGEGEDEIQFLRTDDEVVLQCTATIHKEQQKLCLAAEGFGNR...,"[23, 10, 0, 2, 5, 5, 3, 5, 3, 2, 3, 7, 13, 4, ..."
6197,[MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGV...,"[23, 10, 16, 0, 7, 7, 8, 3, 7, 18, 15, 14, 11,..."
6198,[MNQELLSVGSKRRRTGGSLRGNPSSSQVDEEQMNRVVEEEQQQQL...,"[23, 10, 11, 13, 3, 9, 9, 15, 18, 5, 15, 8, 14..."
6199,[MTLDRPGEGATMLKTFTVLLFCIRMSLGMTSIVMDPQPELWIESN...,"[23, 10, 16, 9, 2, 14, 12, 5, 3, 5, 0, 16, 10,..."


In [39]:
# Length of the longest tokenized sample, counted in tokens (the count
# includes the <START>, <END> and <SEP> markers inserted by tokenize_seq).
max_length = df_sequence['Tokenized_Sequence'].apply(len).max()
max_length

174930

In [43]:
# Copy the SUBCLASS label from the raw table onto the tokenized frame.
# pandas aligns this assignment on the index; df_sequence was derived row by
# row from df, so - assuming neither index was changed in between - each
# label lands on its own sample.
df_sequence['SUBCLASS'] = df['SUBCLASS']
df_sequence

Unnamed: 0,Filtered_List,Tokenized_Sequence,SUBCLASS
0,[MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQ...,"[23, 10, 4, 8, 8, 9, 8, 13, 8, 7, 15, 3, 3, 13...",KIPAN
1,[MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTD...,"[23, 10, 16, 0, 3, 12, 10, 15, 3, 15, 8, 9, 11...",SARC
2,[MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYH...,"[23, 10, 12, 0, 9, 14, 12, 0, 9, 9, 19, 0, 9, ...",SKCM
3,[MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFE...,"[23, 10, 14, 12, 15, 5, 16, 0, 5, 0, 0, 9, 9, ...",KIRC
4,[MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLML...,"[23, 10, 3, 3, 12, 13, 15, 2, 12, 15, 18, 3, 1...",GBMLGG
...,...,...,...
6196,[MADGGEGEDEIQFLRTDDEVVLQCTATIHKEQQKLCLAAEGFGNR...,"[23, 10, 0, 2, 5, 5, 3, 5, 3, 2, 3, 7, 13, 4, ...",LUAD
6197,[MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGV...,"[23, 10, 16, 0, 7, 7, 8, 3, 7, 18, 15, 14, 11,...",LGG
6198,[MNQELLSVGSKRRRTGGSLRGNPSSSQVDEEQMNRVVEEEQQQQL...,"[23, 10, 11, 13, 3, 9, 9, 15, 18, 5, 15, 8, 14...",COAD
6199,[MTLDRPGEGATMLKTFTVLLFCIRMSLGMTSIVMDPQPELWIESN...,"[23, 10, 16, 9, 2, 14, 12, 5, 3, 5, 0, 16, 10,...",TGCT


In [44]:
# Save the final result both as CSV and as a pickle.
# NOTE(review): to_csv writes each list cell as text, so reading the CSV back
# yields strings rather than lists; presumably the pickle is the copy meant
# for reloading - confirm.
df_sequence.to_csv('../train_data/train_sequence_tok.csv', index=False)
df_sequence.to_pickle('../train_data/train_sequence_tok.pkl')


In [58]:
# Spot check: number of <SEP> tokens (id 26) in the tokenized sequence of row 1.
df_sequence.iloc[1,1].count(26)

2

In [60]:
# Spot check: number of sequences in row 1's Filtered_List
# (expected to be one more than its <SEP> count).
len(df_sequence.iloc[1,0])

3

In [65]:
# Consistency check between the raw sequences and their tokenization:
# a sample with k sequences should contain k - 1 <SEP> tokens.
#   check_list - one False entry per sample that satisfies the check
#                (only its length is meaningful)
#   idx_list   - positional row numbers of the samples that do not
# Samples with an empty Filtered_List tokenize to [<START>, <END>] and so fail
# the equality below (0 + 1 != 0); they are reported in idx_list as well.
sep_id = token_to_index['<SEP>']  # look the id up instead of hard-coding 26

check_list = []
idx_list = []
sample_pairs = zip(df_sequence['Filtered_List'], df_sequence['Tokenized_Sequence'])
for i, (sequences, tokens) in enumerate(sample_pairs):
    if tokens.count(sep_id) + 1 == len(sequences):
        check_list.append(False)
    else:
        idx_list.append(i)

In [63]:
# Number of samples whose <SEP> count matched their number of sequences.
len(check_list)

5361

In [64]:
# Total number of samples, for comparison with the count of matching samples.
len(df_sequence)

6201

In [71]:
# Inspect one of the rows flagged in idx_list (positional row 102).
df_sequence.iloc[102,:]

Filtered_List               []
Tokenized_Sequence    [23, 24]
SUBCLASS                  THYM
Name: 102, dtype: object

In [66]:
# Positional row numbers of the samples that failed the <SEP>-count check.
idx_list

[7,
 18,
 23,
 28,
 35,
 57,
 61,
 66,
 82,
 88,
 93,
 99,
 101,
 102,
 108,
 114,
 115,
 122,
 123,
 124,
 129,
 130,
 133,
 138,
 140,
 149,
 155,
 161,
 163,
 168,
 176,
 200,
 204,
 207,
 214,
 219,
 227,
 235,
 243,
 251,
 254,
 256,
 261,
 265,
 266,
 291,
 293,
 299,
 306,
 311,
 312,
 325,
 329,
 330,
 336,
 349,
 352,
 359,
 365,
 368,
 369,
 378,
 382,
 389,
 397,
 408,
 411,
 412,
 413,
 418,
 422,
 426,
 429,
 432,
 440,
 451,
 453,
 464,
 465,
 475,
 484,
 491,
 501,
 533,
 549,
 551,
 553,
 558,
 566,
 569,
 576,
 584,
 589,
 591,
 594,
 595,
 606,
 610,
 618,
 631,
 632,
 636,
 659,
 660,
 663,
 665,
 671,
 676,
 683,
 689,
 692,
 698,
 707,
 710,
 716,
 726,
 729,
 733,
 750,
 759,
 764,
 768,
 781,
 787,
 788,
 810,
 813,
 815,
 817,
 819,
 825,
 830,
 844,
 853,
 856,
 872,
 879,
 881,
 894,
 895,
 908,
 909,
 913,
 919,
 968,
 973,
 974,
 992,
 995,
 997,
 1003,
 1006,
 1007,
 1028,
 1033,
 1040,
 1049,
 1050,
 1057,
 1071,
 1073,
 1089,
 1092,
 1100,
 1104,
 1121,
 