# 241021
- Augumentation

In [1]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

import random
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

2024-10-21 01:14:09.513006: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-21 01:14:09.530627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-21 01:14:09.549772: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-21 01:14:09.555489: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-21 01:14:09.570195: I tensorflow/core/platform/cpu_feature_guar

True

In [2]:
# 1. 데이터 로드 및 전처리
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train.fillna('WT', inplace=True)
test.fillna('WT', inplace=True)

mutation_columns = [col for col in train.columns if col not in ['ID', 'SUBCLASS']]

train['mutations'] = train[mutation_columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)
test['mutations'] = test[mutation_columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)

label_encoder = LabelEncoder()
train['SUBCLASS_encoded'] = label_encoder.fit_transform(train['SUBCLASS'])
num_classes = len(label_encoder.classes_)

X = train['mutations']
y = train['SUBCLASS_encoded']

# 클래스 분포 확인
class_counts = train['SUBCLASS_encoded'].value_counts()
print("클래스 분포:")
print(class_counts)

클래스 분포:
SUBCLASS_encoded
2     786
8     515
6     461
21    379
9     334
23    324
20    276
18    266
15    253
11    229
7     223
4     223
25    198
19    198
13    184
14    178
12    158
10    158
3     155
17    147
22    124
16    120
1     104
24     98
0      72
5      38
Name: count, dtype: int64


In [3]:
# 2. 데이터 증강 기법 정의 (EDA)
def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word != 'WT']))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return new_words

def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return new_words

def add_word(new_words):
    synonyms = []
    counter = 0
    while len(synonyms) < 1 and counter < 10:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = wordnet.synsets(random_word)
        counter +=1
    if synonyms:
        synonym = synonyms[0].lemmas()[0].name()
        random_idx = random.randint(0, len(new_words)-1)
        new_words.insert(random_idx, synonym)

def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        idx1 = random.randint(0, len(new_words)-1)
        idx2 = random.randint(0, len(new_words)-1)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    return new_words

def random_deletion(words, p):
    if len(words) == 1:
        return words
    new_words = [word for word in words if random.uniform(0,1) > p]
    if len(new_words) == 0:
        return [words[random.randint(0,len(words)-1)]]
    return new_words

def eda(sentence, num_aug=4):
    words = sentence.split()
    num_words = len(words)
    augmented_sentences = []
    n_sr = max(1, int(0.1*num_words))
    n_ri = max(1, int(0.1*num_words))
    n_rs = max(1, int(0.1*num_words))
    # 동의어 교체
    a_words = synonym_replacement(words, n_sr)
    augmented_sentences.append(' '.join(a_words))
    # 무작위 삽입
    a_words = random_insertion(words, n_ri)
    augmented_sentences.append(' '.join(a_words))
    # 무작위 교환
    a_words = random_swap(words, n_rs)
    augmented_sentences.append(' '.join(a_words))
    # 무작위 삭제
    a_words = random_deletion(words, p=0.1)
    augmented_sentences.append(' '.join(a_words))
    return augmented_sentences

# 3. 소수 클래스에 대한 데이터 증강
max_class_count = class_counts.max()

augmented_texts = []
augmented_labels = []

for class_label in class_counts.index:
    count = class_counts[class_label]
    if count < max_class_count:
        df_class = train[train['SUBCLASS_encoded'] == class_label]
        texts = df_class['mutations'].tolist()
        augment_count = max_class_count - count
        i = 0
        while augment_count > 0:
            text = texts[i % len(texts)]
            aug_texts = eda(text)
            for aug_text in aug_texts:
                augmented_texts.append(aug_text)
                augmented_labels.append(class_label)
                augment_count -= 1
                if augment_count == 0:
                    break
            i += 1

# 증강된 데이터프레임 생성
augmented_df = pd.DataFrame({'mutations': augmented_texts, 'SUBCLASS_encoded': augmented_labels})

# 원본 데이터와 증강된 데이터 결합
train_augmented = pd.concat([train[['mutations', 'SUBCLASS_encoded']], augmented_df], ignore_index=True)

# 클래스 분포 재확인
print("증강 후 클래스 분포:")
print(train_augmented['SUBCLASS_encoded'].value_counts())

# 4. 토크나이저 및 시퀀스 변환
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_augmented['mutations'])

X_sequences = tokenizer.texts_to_sequences(train_augmented['mutations'])
X_test_sequences = tokenizer.texts_to_sequences(test['mutations'])

# 시퀀스 패딩
max_seq_length = max(len(seq) for seq in X_sequences)
print(f"최대 시퀀스 길이: {max_seq_length}")

X_padded = pad_sequences(X_sequences, maxlen=max_seq_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_seq_length, padding='post')

# 타겟 레이블
y = train_augmented['SUBCLASS_encoded']

# 5. 훈련 세트와 검증 세트로 분할
X_train, X_val, y_train, y_val = train_test_split(
    X_padded, y, test_size=0.2, random_state=42, stratify=y
)

증강 후 클래스 분포:
SUBCLASS_encoded
8     786
19    786
20    786
9     786
6     786
21    786
2     786
23    786
12    786
7     786
16    786
15    786
18    786
25    786
10    786
4     786
0     786
11    786
14    786
13    786
3     786
17    786
24    786
1     786
22    786
5     786
Name: count, dtype: int64
최대 시퀀스 길이: 12355


In [4]:
# 6. 딥러닝 모델 정의
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# 7. 모델 컴파일
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# 8. 모델 학습
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping]
)

# 9. 모델 평가
y_val_pred_probs = model.predict(X_val)
y_val_pred = np.argmax(y_val_pred_probs, axis=1)

print("검증 데이터 성능 평가:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

2024-10-21 01:19:06.184542: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22990 MB memory:  -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:03:00.0, compute capability: 7.5
2024-10-21 01:19:06.185223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22982 MB memory:  -> device: 1, name: NVIDIA TITAN RTX, pci bus id: 0000:73:00.0, compute capability: 7.5


Epoch 1/50


2024-10-21 01:19:15.127083: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8902


[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 1s/step - accuracy: 0.0375 - loss: 3.2618 - val_accuracy: 0.0416 - val_loss: 3.2573
Epoch 2/50
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 2s/step - accuracy: 0.0401 - loss: 3.2581 - val_accuracy: 0.0409 - val_loss: 3.2631
Epoch 3/50
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 2s/step - accuracy: 0.0423 - loss: 3.2621 - val_accuracy: 0.0634 - val_loss: 3.1774
Epoch 4/50
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 2s/step - accuracy: 0.0886 - loss: 3.0273 - val_accuracy: 0.1959 - val_loss: 2.4560
Epoch 5/50
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 2s/step - accuracy: 0.1993 - loss: 2.3703 - val_accuracy: 0.3694 - val_loss: 1.9542
Epoch 6/50
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 2s/step - accuracy: 0.3485 - loss: 1.8551 - val_accuracy: 0.2546 - val_loss: 2.1472
Epoch 7/50
[1m256/256[0m [32m━

In [5]:
# 10. 테스트 데이터 예측 및 제출 파일 생성
y_test_pred_probs = model.predict(X_test_padded)
y_test_pred = np.argmax(y_test_pred_probs, axis=1)

test['SUBCLASS'] = label_encoder.inverse_transform(y_test_pred)

submission = test[['ID', 'SUBCLASS']]
submission.to_csv('submission_LSTM_augmented.csv', index=False)
print("제출 파일이 생성되었습니다: submission_LSTM_augmented.csv")

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 504ms/step
제출 파일이 생성되었습니다: submission_LSTM_augmented.csv
