In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
# NOTE(review): the path lives under /content/drive, so Google Drive has to be
# mounted before this cell can run on a fresh runtime.
file_path = '/content/drive/MyDrive/DLproject/dataset/KHS_dataset.txt'
data = pd.read_csv(file_path, delimiter=',')

# Display the first few rows of the dataset
print(data.head())

# Define the labels
# These names are matched to the label columns by position (see extract_labels),
# not by column name.
labels = ['contain_gender' , 'bias' , 'hate']
# Text cleaning function
def clean_text(text):
    """Normalize one raw comment for tokenization.

    Keeps ASCII letters, digits, complete Hangul syllables and whitespace,
    drops every other character, then collapses whitespace runs to a single
    space and trims both ends.

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty comment; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned comment as a ``str`` (possibly empty).
    """
    # Missing cells arrive as None / NaN; NaN is the only float with x != x.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Strip unwanted characters FIRST: removing a character that sits between
    # two spaces ("a . b") leaves a double space, so whitespace must be
    # collapsed afterwards, not before.
    text = re.sub(r"[^a-zA-Z가-힣0-9\s]", "", text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run every comment through clean_text and store the result back in place
data['comments'] = data['comments'].map(clean_text)

# Convert labels to a list of labels
def extract_labels(row, label_names=None):
    """Return the names of the labels that are set to 1 in a data row.

    The label columns are read by position: the k-th name in ``label_names``
    corresponds to the value at position ``k + 1`` of ``row`` (position 0 is
    the comment text).

    Args:
        row: A pandas Series holding one row of the dataset.
        label_names: Label names in column order. Defaults to the module-level
            ``labels`` list.

    Returns:
        A list with the names whose column value equals 1.
    """
    if label_names is None:
        label_names = labels
    # Use .iloc for positional access: plain row[int] on a label-indexed
    # Series is deprecated positional fallback in recent pandas.
    return [
        name
        for position, name in enumerate(label_names, start=1)
        if row.iloc[position] == 1
    ]

# Row-wise apply: each row gets the list of label names returned by
# extract_labels, which reads the label columns by position.
data['labels'] = data.apply(extract_labels, axis=1)

# Tokenize the sentences
# NOTE(review): the tokenizer is fitted on every comment before any train/test
# split happens, so the vocabulary also reflects rows that later become test data.
max_words = 10000 # vocabulary cap passed to Tokenizer(num_words=...); value was revised
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['comments'])
sequences = tokenizer.texts_to_sequences(data['comments'])

# Pad the sequences
maxlen = 100  # You can adjust the maxlen according to your needs
X = pad_sequences(sequences, maxlen=maxlen)

# Binarize the labels (column order follows `labels`)
mlb = MultiLabelBinarizer(classes=labels)
y = mlb.fit_transform(data['labels'])

# Print the shape of the dataset
print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')




                                            comments  contain_gender_bias  \
0  (현재 호텔주인 심정) 아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속...                    0   
1  ....한국적인 미인의 대표적인 분...너무나 곱고아름다운모습...그모습뒤의 슬픔을...                    0   
2  ...못된 넘들...남의 고통을 즐겼던 넘들..이젠 마땅한 처벌을 받아야지..,그래...                    0   
3                 1,2화 어설펐는데 3,4화 지나서부터는 갈수록 너무 재밌던데                    0   
4  1. 사람 얼굴 손톱으로 긁은것은 인격살해이고2. 동영상이 몰카냐? 메걸리안들 생각...                    1   

   bias  hate  
0     1     1  
1     0     0  
2     0     1  
3     0     0  
4     1     1  
Shape of X: (7896, 100)
Shape of y: (7896, 3)


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the dataset path used by the loading cell is under
# /content/drive/MyDrive, so on a fresh runtime this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Words seen fewer than `threshold` times are treated as rare.
threshold = 3
total_cnt = len(tokenizer.word_index)  # number of distinct words

# tokenizer.word_counts maps each word to its frequency.
# Sum of the frequencies of all words.
total_freq = sum(tokenizer.word_counts.values())
# Number of rare words.
rare_cnt = sum(1 for freq in tokenizer.word_counts.values() if freq < threshold)
# Sum of the frequencies of the rare words.
rare_freq = sum(freq for freq in tokenizer.word_counts.values() if freq < threshold)

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 37086
등장 빈도가 2번 이하인 희귀 단어의 수: 33817
단어 집합에서 희귀 단어의 비율: 91.1853529633824
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 57.118334776691825


In [None]:
# Exclude the words that occur at most twice from the vocabulary count.
# +2 accounts for the padding token (index 0) and the OOV token (index 1).
# NOTE(review): the tokenizer is created as Tokenizer(num_words=max_words) with
# no oov_token, and the models size their Embedding with max_words rather than
# vocab_size — confirm whether the +2 and this value are still intended.
vocab_size = total_cnt - rare_cnt + 2
print('단어 집합의 크기 :',vocab_size)

단어 집합의 크기 : 31709


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Hold out 20% of the samples; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')

# 1D-CNN text classifier: embedding -> conv -> global max pool -> dense head.
# The output layer has one sigmoid unit per label (multi-label setup).
model_01 = Sequential([
    Embedding(max_words, 128, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(labels), activation='sigmoid'),
])

# For a multi-unit sigmoid output the string 'accuracy' is resolved to
# categorical (argmax) accuracy, which is not meaningful for independent
# labels. Use per-label binary accuracy; it is registered under the name
# 'accuracy' so the history/log keys stay the same.
# NOTE: the training cell compiles the model again; the settings given there
# are the ones in effect during fit.
model_01.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

model_01.summary()

Shape of X_train: (6316, 100)
Shape of y_train: (6316, 3)
Shape of X_test: (1580, 100)
Shape of y_test: (1580, 3)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           82048     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense

In [None]:
# (Re)compile right before training. For a multi-unit sigmoid output the
# string 'accuracy' is resolved to categorical (argmax) accuracy, which is
# misleading for multi-label targets, so per-label binary accuracy is used.
# It keeps the name 'accuracy' so the keys in `history.history` do not change.
model_01.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

# Train the model; the last 20% of the training data is used for validation.
history = model_01.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Show the first 50 binarized label rows with their label names as columns
y_sample = pd.DataFrame(y, columns=labels).head(50)
print(y_sample)

    contain_gender  bias  hate
0                0     1     1
1                0     0     0
2                0     0     1
3                0     0     0
4                1     1     1
5                0     0     0
6                1     1     1
7                0     0     0
8                0     0     1
9                0     0     1
10               0     1     0
11               0     0     0
12               0     0     0
13               0     0     1
14               0     0     0
15               0     1     1
16               0     0     0
17               0     1     0
18               0     0     0
19               1     1     1
20               0     0     1
21               0     0     0
22               0     0     0
23               0     0     1
24               0     0     0
25               0     0     0
26               0     0     1
27               0     0     0
28               0     0     0
29               0     0     0
30               0     0     0
31      

레이블 조합별 샘플 수를 집계하여 데이터 불균형 여부를 확인한다.

In [None]:
# Inspect class imbalance: wrap the binarized labels in a frame ...
y_df = pd.DataFrame(y, columns=labels)

# ... and count how many samples fall into each observed label combination
combination_distribution = (
    y_df
    .groupby(by=labels)
    .size()
    .rename('count')
    .reset_index()
)

# Show the per-combination counts
print(combination_distribution)

   contain_gender  bias  hate  count
0               0     0     0   3273
1               0     0     1   1875
2               0     1     0    137
3               0     1     1   1379
4               1     1     0     76
5               1     1     1   1156


In [None]:
from nltk.corpus import wordnet
import random
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK corpora used by the augmentation helpers
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Expand the per-row label lists into one 0/1 column per label
for label in labels:
    data[label] = data['labels'].map(lambda assigned, target=label: int(target in assigned))

# Count the samples in each label combination to find the under-represented ones
combination_distribution = data.groupby(by=labels).size().rename('count').reset_index()
print("Class combination distribution before augmentation:")
print(combination_distribution)

# Combinations with fewer samples than this threshold will be augmented
augmentation_threshold = 1000  # tune as appropriate
combinations_to_augment = combination_distribution.loc[
    combination_distribution['count'] < augmentation_threshold
]

# Augmentation techniques
def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct non-stopword words with a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym.
    Synonyms come from ``get_synonyms``; words without synonyms are skipped.

    NOTE(review): stopwords are the NLTK English list and synonyms come from
    WordNet; for Korean comments this presumably finds no synonyms and returns
    the sentence unchanged — confirm.

    Args:
        sentence: Whitespace-separated text.
        n: Maximum number of distinct words to replace. ``n <= 0`` replaces
            nothing.

    Returns:
        The words re-joined with single spaces.
    """
    words = sentence.split()
    # Nothing to do for empty input; also honour n <= 0 (previously one word
    # was still replaced because the limit was only checked after replacing).
    if not words or n <= 0:
        return ' '.join(words)

    # Load the stopword list once per call instead of once per word.
    english_stopwords = set(stopwords.words('english'))
    # Sort before shuffling so the result depends only on the RNG state, not
    # on set iteration order.
    candidates = sorted({word for word in words if word not in english_stopwords})
    random.shuffle(candidates)

    new_words = words.copy()
    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(sorted(synonyms))
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:  # stop once n words have been replaced
            break
    return ' '.join(new_words)

def get_synonyms(word):
    """Collect the WordNet lemma names for ``word``, excluding ``word`` itself.

    NOTE(review): ``wordnet.synsets`` is called without a language argument;
    presumably this only covers English, so Korean tokens would yield an empty
    set — confirm.

    Args:
        word: The word to look up.

    Returns:
        A set of lemma-name strings (may be empty).
    """
    lemma_names = {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    lemma_names.discard(word)
    return lemma_names

def random_insertion(sentence, n):
    """Insert synonyms of randomly chosen words at random positions.

    Makes ``n`` attempts; an attempt whose chosen word has no synonym inserts
    nothing. The source word is drawn from the current word list, so earlier
    insertions can be picked again.

    Args:
        sentence: Whitespace-separated text.
        n: Number of insertion attempts.

    Returns:
        The words re-joined with single spaces. An empty or whitespace-only
        sentence yields ``''``.
    """
    words = sentence.split()
    # random.choice raises IndexError on an empty list, so bail out early.
    if not words:
        return ' '.join(words)
    for _ in range(n):
        source_word = random.choice(words)
        synonyms = get_synonyms(source_word)
        if synonyms:
            # Sorted so the pick depends only on the RNG state.
            synonym = random.choice(sorted(synonyms))
            insert_pos = random.randint(0, len(words) - 1)
            words.insert(insert_pos, synonym)
    return ' '.join(words)

def random_deletion(sentence, p):
    """Drop each word independently with probability ``p``.

    Sentences with zero or one word are returned unchanged. If every word
    happens to be dropped, a single randomly chosen original word is returned
    so the result is never empty for a non-empty sentence.

    Args:
        sentence: Whitespace-separated text.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined with single spaces.
    """
    words = sentence.split()
    # "<= 1" also covers the empty sentence, which previously reached
    # random.choice([]) and raised IndexError.
    if len(words) <= 1:
        return sentence
    kept_words = [word for word in words if random.uniform(0, 1) > p]
    if not kept_words:
        return random.choice(words)
    return ' '.join(kept_words)

def random_swap(sentence, n):
    """Swap two randomly chosen word positions, ``n`` times.

    Args:
        sentence: Whitespace-separated text.
        n: Number of swaps to perform.

    Returns:
        The words joined with single spaces. Sentences with fewer than two
        words have nothing to swap and are returned with their words
        re-joined.
    """
    words = sentence.split()
    length = len(words)
    # random.sample(range(length), 2) raises ValueError when length < 2.
    if length < 2:
        return ' '.join(words)
    for _ in range(n):
        idx1, idx2 = random.sample(range(length), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# Data augmentation function (applies several techniques)
def augment_data(row, n_synonym_replacements=2, n_random_insertions=2, n_random_deletions=2, n_random_swaps=2, p_random_deletion=0.2):
    """Create four augmented copies of one sample, one per technique.

    The copies are produced, in order, by synonym replacement, random
    insertion, random deletion and random swap of ``row['comments']``. Each
    copy carries the row's ``contain_gender``, ``bias`` and ``hate`` values.

    Args:
        row: Mapping/Series with keys ``comments``, ``contain_gender``,
            ``bias`` and ``hate``.
        n_synonym_replacements: Word limit passed to ``synonym_replacement``.
        n_random_insertions: Attempt count passed to ``random_insertion``.
        n_random_deletions: Unused; kept so existing calls stay valid. The
            deletion step is driven by ``p_random_deletion``.
        n_random_swaps: Swap count passed to ``random_swap``.
        p_random_deletion: Per-word deletion probability passed to
            ``random_deletion`` (previously hard-coded to 0.2).

    Returns:
        A list of four dicts, each with the augmented text and the labels.
    """
    comment = row['comments']
    augmented_sentences = [
        # Synonym replacement
        synonym_replacement(comment, n_synonym_replacements),
        # Random insertion
        random_insertion(comment, n_random_insertions),
        # Random deletion
        random_deletion(comment, p_random_deletion),
        # Random swap
        random_swap(comment, n_random_swaps),
    ]

    augmented_data = [
        {
            'comments': sentence,
            'contain_gender': row['contain_gender'],
            'bias': row['bias'],
            'hate': row['hate'],
        }
        for sentence in augmented_sentences
    ]
    return augmented_data

# Augment only the under-represented class combinations.
# Seed Python's RNG so the random choices made by the augmentation helpers are
# repeatable from run to run.
random.seed(42)
augmented_data = []

for index, row in combinations_to_augment.iterrows():
    condition = (data['contain_gender'] == row['contain_gender']) & (data['bias'] == row['bias']) & (data['hate'] == row['hate'])
    class_data = data[condition]
    for idx, class_row in class_data.iterrows():
        augmented_data.extend(augment_data(class_row))  # one copy per technique

# Build a frame from the augmented samples
augmented_df = pd.DataFrame(augmented_data)

# Combine original and augmented samples; ignore_index avoids duplicate row
# labels in the combined frame.
# NOTE(review): the augmented copies are created before the train/test split
# of X_augmented, so variants of one comment can end up on both sides of the
# split — consider splitting first and augmenting the training part only.
final_data = pd.concat([data, augmented_df], ignore_index=True)

# Tokenize and pad again with the augmented data. A fresh tokenizer is used:
# calling fit_on_texts on the already fitted one would add to its existing
# word counts (counting the original comments twice) and would give a
# different result every time this cell is re-run.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(final_data['comments'])
sequences = tokenizer.texts_to_sequences(final_data['comments'])
X_augmented = pad_sequences(sequences, maxlen=maxlen)

# Update labels
y_augmented = final_data[['contain_gender', 'bias', 'hate']].values

print(f'Shape of augmented X: {X_augmented.shape}')
print(f'Shape of augmented y: {y_augmented.shape}')

# Class distribution after augmentation
y_df_augmented = pd.DataFrame(y_augmented, columns=labels)
combination_distribution_augmented = y_df_augmented.groupby(labels).size().reset_index(name='count')
print("Class combination distribution after augmentation:")
print(combination_distribution_augmented)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Class combination distribution before augmentation:
   contain_gender  bias  hate  count
0               0     0     0   3273
1               0     0     1   1875
2               0     1     0    137
3               0     1     1   1379
4               1     1     0     76
5               1     1     1   1156
Shape of augmented X: (8748, 100)
Shape of augmented y: (8748, 3)
Class combination distribution after augmentation:
   contain_gender  bias  hate  count
0               0     0     0   3273
1               0     0     1   1875
2               0     1     0    685
3               0     1     1   1379
4               1     1     0    380
5               1     1     1   1156


In [None]:
# Hold out 20% of the augmented samples; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')

# Same 1D-CNN architecture as model_01, trained on the augmented data:
# embedding -> conv -> global max pool -> dense head, one sigmoid unit per label.
model_02 = Sequential([
    Embedding(max_words, 128, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(labels), activation='sigmoid'),
])

# For a multi-unit sigmoid output the string 'accuracy' is resolved to
# categorical (argmax) accuracy, which is not meaningful for independent
# labels. Use per-label binary accuracy; it is registered under the name
# 'accuracy' so the history/log keys stay the same.
# NOTE: the training cell compiles the model again; the settings given there
# are the ones in effect during fit.
model_02.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

model_02.summary()

Shape of X_train: (6998, 100)
Shape of y_train: (6998, 3)
Shape of X_test: (1750, 100)
Shape of y_test: (1750, 3)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          1280000   
                                                                 
 conv1d_2 (Conv1D)           (None, 96, 128)           82048     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense

In [None]:
# (Re)compile right before training. For a multi-unit sigmoid output the
# string 'accuracy' is resolved to categorical (argmax) accuracy, which is
# misleading for multi-label targets, so per-label binary accuracy is used.
# It keeps the name 'accuracy' so the keys in `history.history` do not change.
model_02.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

# Train the model; the last 20% of the training data is used for validation.
history = model_02.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


데이터 증강 후 성능이 어느 정도 향상되었으나, K-MHaS 데이터셋에서와 마찬가지로 다중 레이블 분류(multi-label classification) 문제에서는 1D-CNN만으로 충분한 성능을 내기 어려울 것으로 보인다.