In [2]:
# Mount Google Drive at /content/drive so the dataset path used below resolves
# inside this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Names of the label columns handled by this notebook.
labels = [
    '여성/가족',
    '남성',
    '성소수자',
    '인종/국적',
    '연령',
    '지역',
    '종교',
    '기타 혐오',
    '악플/욕설',
    'clean',
    '개인지칭',
]

# Read the tab-separated dataset from the mounted Drive folder.
file_path = '/content/drive/MyDrive/DLproject/dataset/unsmile_dataset.txt'
data = pd.read_csv(file_path, sep='\t')

# Preview the first rows to sanity-check the columns.
print(data.head())

# Text cleaning function
def clean_text(text):
    """Normalize a raw sentence before tokenization.

    Keeps ASCII letters, digits, precomposed Hangul syllables (가-힣) and
    whitespace; every other character is removed (this includes standalone
    jamo such as 'ㅋ'). Whitespace runs are then collapsed to a single space
    and the result is stripped.

    Args:
        text: Raw sentence. ``None`` and NaN are treated as an empty string;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned sentence as a ``str``.
    """
    # Missing cells (None / float NaN) would otherwise make re.sub raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Strip unwanted characters first ...
    text = re.sub(r"[^a-zA-Z가-힣0-9\s]", "", text)
    # ... then collapse whitespace, so that gaps opened by the removal above
    # (e.g. "a ! b" -> "a  b") do not survive as double spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Replace every sentence in the '문장' column with its cleaned form.
data['문장'] = [clean_text(sentence) for sentence in data['문장']]

# Convert labels to a list of labels
def extract_labels(row, label_names=None):
    """Return the names of the label columns that are set to 1 in a row.

    Columns are looked up by name, so the result does not depend on the
    position of the label columns within the row.

    Args:
        row: Mapping-like dataset row (e.g. a pandas Series) keyed by column
            name.
        label_names: Label columns to inspect, in output order. Defaults to
            the module-level ``labels`` list.

    Returns:
        A list with the name of every inspected column whose value equals 1.
    """
    if label_names is None:
        label_names = labels
    return [name for name in label_names if row[name] == 1]

# --- Targets -----------------------------------------------------------------
# Collect the active label names per row, then turn them into a binary
# indicator matrix with one column per entry of `labels`.
data['labels'] = data.apply(extract_labels, axis=1)
mlb = MultiLabelBinarizer(classes=labels)
y = mlb.fit_transform(data['labels'])

# --- Inputs ------------------------------------------------------------------
# Length every sequence is padded/truncated to; tune as needed.
maxlen = 100

# Fit the word index on the cleaned sentences and encode them as integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['문장'])
sequences = tokenizer.texts_to_sequences(data['문장'])
X = pad_sequences(sequences, maxlen=maxlen)

# --- Sanity check ------------------------------------------------------------
print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

                                                  문장  여성/가족  남성  성소수자  인종/국적  \
0  ㅇㄱㄹㅇ 진짜 죽어도 상관없다는 마인드로 싸웠더니 지금 서열 상타취노 식칼들고 니가...      0   1     0      0   
1                         여자들은 취미가 애낳는건가.. 취미를 좀 가져라      1   0     0      0   
2                           개슬람녀 다 필요없고 니 엄마만 있으면 된다      0   0     0      1   
3  조팔ㅋㅋ 남한 길거리 돌아다니면 한국남자때문에 눈재기하는데 그걸 내 폰에 굳이 담아...      0   1     0      0   
4                              바지 내리다 한남들 와꾸 보고 올려뿟노      0   1     0      0   

   연령  지역  종교  기타 혐오  악플/욕설  clean  개인지칭  
0   0   0   0      0      0      0     0  
1   0   0   0      0      0      0     0  
2   0   0   1      0      0      0     0  
3   0   0   0      0      0      0     0  
4   0   0   0      0      0      0     0  
Shape of X: (18742, 100)
Shape of y: (18742, 11)


In [None]:
threshold = 3

# Occurrence count of every word seen by the tokenizer, and the subset of
# counts that fall below the rarity threshold.
word_freqs = list(tokenizer.word_counts.values())
rare_word_freqs = [freq for freq in word_freqs if freq < threshold]

total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = len(rare_word_freqs)        # number of words occurring fewer than `threshold` times
total_freq = sum(word_freqs)           # summed occurrences of all words
rare_freq = sum(rare_word_freqs)       # summed occurrences of the rare words only

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 79471
등장 빈도가 2번 이하인 희귀 단어의 수: 71542
단어 집합에서 희귀 단어의 비율: 90.02277560367932
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 49.58030465836546


In [None]:
# Vocabulary size once the rare words (those counted in rare_cnt) are dropped.
# The extra 2 accounts for the padding token (index 0) and an OOV token (index 1).
# NOTE(review): the tokenizer above is created without `num_words`/`oov_token`,
# and `vocab_size` is reassigned to len(tokenizer.word_index) + 1 before the
# models are built, so this pruned size does not appear to be used — confirm
# which vocabulary size is intended.
kept_word_cnt = total_cnt - rare_cnt
vocab_size = kept_word_cnt + 2
print('단어 집합의 크기 :',vocab_size)

단어 집합의 크기 : 7931


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split



# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Embedding dimensions. The +1 covers the reserved index 0.
embedding_dim = 128
vocab_size = len(tokenizer.word_index) + 1

# Convolutional feature extractor: two Conv1D stages with a pooling step in
# between, reduced to one vector per sample by global max pooling.
conv_layers = [
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
]

# Classifier head: one sigmoid unit per label (multi-label classification).
classifier_layers = [
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(y.shape[1], activation='sigmoid'),
]

model_01 = Sequential(conv_layers + classifier_layers)

# NOTE(review): with a multi-unit sigmoid output, Keras presumably resolves the
# 'accuracy' string to categorical (argmax) accuracy rather than a per-label
# accuracy — confirm this is the metric you want to monitor.
model_01.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model_01.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          10172416  
                                                                 
 conv1d_2 (Conv1D)           (None, 96, 128)           82048     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 48, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_3 (Conv1D)           (None, 44, 128)           82048     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 128)              

In [None]:
# Mini-batch size and number of training epochs used for model_01.
batch_size, epochs = 32, 10

from tensorflow.keras.callbacks import Callback
import numpy as np

# Overfitting shows up early in training, so try an early-stopping rule.

class EarlyStoppingByAccuracyDiff(Callback):
    """Stop training once the train/validation gap of a metric gets too large.

    Args:
        monitor: Name of the training metric in the epoch logs. The validation
            counterpart is read from ``'val_' + monitor``.
        value: Absolute difference at or above which training is stopped.
        verbose: If greater than 0, print a message when training is stopped.
    """

    def __init__(self, monitor='accuracy', value=0.4, verbose=0):
        # Zero-argument super() runs Callback.__init__; the previous
        # super(Callback, self) call skipped it.
        super().__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        """Compare the monitored metric with its validation value after an epoch.

        Args:
            epoch: Zero-based epoch index.
            logs: Metric results for the epoch; may be ``None``.
        """
        logs = logs or {}
        train_acc = logs.get(self.monitor)
        val_acc = logs.get(f'val_{self.monitor}')

        # Compare against None explicitly: 0.0 is a legitimate metric value and
        # must not be treated as "missing".
        if train_acc is None or val_acc is None:
            return

        acc_diff = np.abs(train_acc - val_acc)
        if acc_diff >= self.value:
            if self.verbose > 0:
                print(f"\nEpoch {epoch + 1}: early stopping triggered as accuracy difference {acc_diff:.4f} is greater than {self.value}")
            self.model.stop_training = True

# Stop as soon as training and validation accuracy are at least 0.4 apart.
early_stopping_by_acc_diff = EarlyStoppingByAccuracyDiff(
    monitor='accuracy',
    value=0.4,
    verbose=1,
)

# Train model_01 with 20% of the training split used for validation.
history = model_01.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping_by_acc_diff],
    verbose=1,
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping triggered as accuracy difference 0.4020 is greater than 0.4


1. 필터 크기 다양화
2. CNN레이어 여러개 쌓음
3. 배치 정규화 추가
4. Global Average Pooling 사용

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split



# Hold out 20% of the samples as a test set (same seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Model definition: embedding, then three Conv1D + BatchNormalization stages
# with kernel sizes 3, 5 and 7, then global average pooling and a dense head.
model_02_layers = [
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
]
for kernel_size in (3, 5, 7):
    model_02_layers.append(Conv1D(filters=128, kernel_size=kernel_size, activation='relu'))
    model_02_layers.append(BatchNormalization())
model_02_layers.extend([
    GlobalAveragePooling1D(),  # average over the sequence axis
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(y.shape[1], activation='sigmoid'),
])

model_02 = Sequential(model_02_layers)

model_02.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model_02.summary()

# Train the model with 20% of the training split used for validation.
history = model_02.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 128)          10172416  
                                                                 
 conv1d_14 (Conv1D)          (None, 98, 128)           49280     
                                                                 
 batch_normalization_11 (Ba  (None, 98, 128)           512       
 tchNormalization)                                               
                                                                 
 conv1d_15 (Conv1D)          (None, 94, 128)           82048     
                                                                 
 batch_normalization_12 (Ba  (None, 94, 128)           512       
 tchNormalization)                                               
                                                                 
 conv1d_16 (Conv1D)          (None, 88, 128)          

1. 필터 크기 다양화
2. CNN레이어 여러개 쌓음
3. 배치 정규화 추가
4. Attention Mechanism 사용

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Attention mechanism implementation
class Attention(tf.keras.layers.Layer):
    """Attention pooling over axis 1 of the input.

    Each position along axis 1 gets a score tanh(x . W + b); the scores are
    softmax-normalized and used to form a weighted sum of the input over
    axis 1.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the score projection W and the per-position bias b."""
        n_positions, n_features = input_shape[1], input_shape[-1]
        self.W = self.add_weight(
            name='attention_weight',
            shape=(n_features, 1),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(n_positions, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        kb = tf.keras.backend
        # One scalar score per position; the trailing singleton axis is dropped
        # so that softmax runs across positions.
        scores = kb.squeeze(kb.tanh(kb.dot(x, self.W) + self.b), axis=-1)
        # Restore the trailing axis so the weights broadcast over the features.
        weights = kb.expand_dims(kb.softmax(scores), axis=-1)
        return kb.sum(x * weights, axis=1)

# Model definition: same convolutional stack as before (kernel sizes 3, 5, 7,
# each followed by batch normalization), with the Attention layer as pooling.
conv_stages = [
    layer
    for kernel_size in (3, 5, 7)
    for layer in (
        Conv1D(filters=128, kernel_size=kernel_size, activation='relu'),
        BatchNormalization(),
    )
]

model_03 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    *conv_stages,
    Attention(),  # attention pooling over the sequence axis
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(y.shape[1], activation='sigmoid'),
])

model_03.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model_03.summary()

# Train the model with 20% of the training split used for validation.
history = model_03.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 128)          10172416  
                                                                 
 conv1d_22 (Conv1D)          (None, 98, 128)           49280     
                                                                 
 batch_normalization_19 (Ba  (None, 98, 128)           512       
 tchNormalization)                                               
                                                                 
 conv1d_23 (Conv1D)          (None, 94, 128)           82048     
                                                                 
 batch_normalization_20 (Ba  (None, 94, 128)           512       
 tchNormalization)                                               
                                                                 
 conv1d_24 (Conv1D)          (None, 88, 128)          

오버피팅 문제가 해결되지 않으므로
모델을 단순화하고 k-fold 방식을 실행해 본다


In [16]:
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, BatchNormalization, GlobalMaxPooling1D

# K-fold setup: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results.
val_losses, val_accuracies = [], []

for train_index, val_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_val, y_val = X[val_index], y[val_index]

    # Fresh, simplified model for this fold: two Conv1D stages (kernel sizes
    # 3 and 5), each followed by batch normalization and dropout.
    fold_layers = [
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    ]
    for kernel_size in (3, 5):
        fold_layers += [
            Conv1D(filters=64, kernel_size=kernel_size, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
        ]
    fold_layers += [
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(y.shape[1], activation='sigmoid'),
    ]
    model = Sequential(fold_layers)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Train on the fold's training part and validate on its held-out part.
    # NOTE(review): the recorded output of this cell ends before epoch 20 in
    # every fold, which suggests an early-stopping callback was passed when it
    # was last run — confirm and restore it if that is intended.
    history = model.fit(
        X_train,
        y_train,
        batch_size=32,
        epochs=20,
        validation_data=(X_val, y_val),
    )

    # Evaluate on the held-out part and record the results.
    val_loss, val_accuracy = model.evaluate(X_val, y_val)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

# Report the mean performance across folds.
print(f'Mean Validation Loss: {np.mean(val_losses)}, Mean Validation Accuracy: {np.mean(val_accuracies)}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Validation Loss: 0.2997813820838928, Validation Accuracy: 0.26407042145729065
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Validation Loss: 0.3067648410797119, Validation Accuracy: 0.2760736048221588
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Validation Loss: 0.31523165106773376, Validation Accuracy: 0.25747063755989075
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Validation Loss: 0.3757050931453705, Validation Accuracy: 0.26467448472976685
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Validation Loss: 0.3030795753002167, Validation Accuracy: 0.2750800549983978
Mean Validation Loss: 0.32011250853538514, Mean Validation Accuracy: 0.267473840713501
