## KDT_AI-classifying prejudice and discrimination texts
https://www.kaggle.com/competitions/kdtai-2/overview

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from torchinfo import summary

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt

import re
from konlpy.tag import Mecab

from gensim.models import word2vec

import sys
sys.path.append('..')

from python_scripts import data_setup, engine


In [2]:
# Human-readable description of every competition class.
# The position of a description in the list is its numeric label id.
idx_to_class = dict(enumerate([
    'Origin(출신차별)',
    'Physical(외모차별) 외모(신체, 얼굴) 및 장애인 차별 발언을 포함합니다.',
    'Politics(정치성향차별)',
    'Profanity(혐오욕설) 욕설,저주,혐오 단어, 비속어 및 기타 혐오 발언을 포함합니다.',
    'Age(연령차별)',
    'Gender(성차별) 성별 또는 성적 취향에 대한 차별 발언을 포함합니다.',
    'Not Hate Speech(해당사항없음)',
]))

# Number of tokens every sentence is truncated or padded to.
max_length = 200

In [3]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS,
# then CPU. The conditional expression short-circuits, so the MPS check only
# runs when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [4]:
# Load the labelled training set (the displayed frame shows the columns
# ID, text and label) and show it in the notebook.
train_data = pd.read_csv('../data/Discrimination/train.csv')
train_data

Unnamed: 0,ID,text,label
0,0,유소영비호감 성형아줌마,1
1,1,나오지마라 썅,3
2,2,식상하고 지긋지긋했는데 잘 끝나네 오예 소리벗고 빤스질러~~!!!,6
3,3,성희롱 당할 얼굴이 아닌데?ㅋㅋㅋ,5
4,4,"""끝까지 해보자~쪽파리 원숭이 자한 쓰레기당""",0
...,...,...,...
65858,65858,ㅋ ㅋ 쇼~~~ 도 적당히,6
65859,65859,"""이젠 전라도 종것들 음식 불매다. 태양광 중금속 환경오염물로 만든 식품 사절이다""",0
65860,65860,조센징들은 참 피곤하게 산다,0
65861,65861,"""문빠 다모아서 빨갱이한테보내고 행복하게살라고""",2


In [6]:
# Longest training sentence, measured in characters (not tokens).
max(len(sentence) for sentence in train_data['text'])

306

In [7]:
# Mecab dictionary location. Written as a raw string: the value is unchanged,
# but the non-raw form relied on "\m" being an unrecognised escape sequence,
# which Python reports as a deprecated/invalid escape.
_MECAB_DIC_PATH = r'C:\mecab\mecab-ko-dic'

# Particles removed after tokenization (frozenset for O(1) membership tests).
_STOP_WORDS = frozenset(
    ["은", "는", "이", "가", "을", "를", "에", "의", "로", "으로", "에서"]
)

# Patterns are compiled once instead of being looked up on every call.
_URL_PATTERN = re.compile(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*")
_MENTION_PATTERN = re.compile(r"@(\w+)")
_NON_KOREAN_PATTERN = re.compile(r"[^\u3131-\u3163\uac00-\ud7a3]+")

# Lazily created tagger shared by all calls (see _get_mecab).
_mecab_tagger = None


def _get_mecab():
    """Return the shared Mecab tagger, creating it on first use."""
    global _mecab_tagger
    if _mecab_tagger is None:
        _mecab_tagger = Mecab(_MECAB_DIC_PATH)
    return _mecab_tagger


def preprocess_korean_text(text):
    """Tokenize a Korean sentence into cleaned morphemes.

    Steps, in order: strip URLs and @mentions, split into morphemes with
    Mecab, drop stop-word particles, remove every character outside the
    Hangul ranges from each token, and discard tokens that end up empty.

    The Mecab tagger is created once and reused; the dataset calls this
    function for every sample, so building a tagger per call is wasteful.

    Args:
        text: the raw sentence as a ``str``.

    Returns:
        A new list of token strings (possibly empty).
    """
    # Remove URLs and mentions
    text = _URL_PATTERN.sub("", text)
    text = _MENTION_PATTERN.sub("", text)

    # Tokenize, then drop stop words before any character stripping so the
    # comparison is made on the morphemes exactly as Mecab produced them.
    tokens = [t for t in _get_mecab().morphs(text) if t not in _STOP_WORDS]

    # Remove punctuation and non-Korean characters, then drop empty leftovers
    tokens = [_NON_KOREAN_PATTERN.sub("", t) for t in tokens]
    return [t for t in tokens if t]

In [8]:
# Smoke check: tokenize one sample sentence and inspect the resulting morphemes.
preprocess_korean_text('나는 지금 뭐하고 있느냐?')

['나', '지금', '뭐', '하', '고', '있', '느냐']

In [11]:
# Load the pre-trained Word2Vec embedding model.

w2v_pretrained_model = word2vec.Word2Vec.load('../data/Discrimination/word2vec')

# Register an all-zero '<unk>' vector used for out-of-vocabulary tokens (and
# for the empty padding token). Its length is taken from the loaded model
# instead of being hard-coded to 100, so a model of any dimensionality works.
w2v_pretrained_model.wv.add_vector(
    '<unk>',
    np.zeros(w2v_pretrained_model.wv.vector_size, dtype=np.float32),
)



358043

In [12]:
# Getting GloVe embedding pre-trained model

class Word_vector():
    """Dictionary-backed word vectors exposing the attributes this notebook uses.

    Provides ``index_to_key``, ``key_to_index``, ``vectors`` and
    ``vector_size`` plus ``in`` / ``[]`` / ``len`` support, so GloVe vectors
    can be used wherever the Word2Vec model's ``wv`` attribute is used.

    Defined at module level (rather than inside ``load_glove_model``) so the
    class is created once and its instances are not tied to a local scope.
    """

    def __init__(self, key_to_vector) -> None:
        # Mapping word -> vector, kept as given (insertion order defines indices).
        self.key_to_vector = key_to_vector

        self.index_to_key = list(key_to_vector)
        self.key_to_index = {
            key: index for index, key in enumerate(self.index_to_key)
        }

        # Row i holds the vector of index_to_key[i].
        self.vectors = np.array(
            [key_to_vector[key] for key in self.index_to_key], dtype='float32'
        )

        # 0 for an empty vocabulary instead of failing on vectors[0].
        self.vector_size = len(self.vectors[0]) if len(self.vectors) else 0

    def __contains__(self, key):
        return key in self.key_to_vector

    def __getitem__(self, key):
        return self.key_to_vector[key]

    def __len__(self):
        return len(self.index_to_key)


class Glove_model():
    """Thin wrapper exposing the vectors as ``.wv``, like the Word2Vec model."""

    def __init__(self, vector) -> None:
        self.wv = Word_vector(vector)

    def __len__(self):
        return len(self.wv)


def load_glove_model(file, not_exist_path='../data/Discrimination/not_exist_list.csv', encoding=None):
    """Load GloVe vectors from a text file into a ``Glove_model``.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace. After loading, every row of the
    substitution CSV adds ``word`` to the vocabulary, sharing the vector of
    its ``substitute``.

    Args:
        file: path of the GloVe text file.
        not_exist_path: CSV with ``word`` and ``substitute`` columns listing
            extra words to register. Pass ``None`` to skip this step.
        encoding: text encoding used to read ``file``. ``None`` keeps the
            platform default, which is what the original code used.

    Returns:
        A ``Glove_model`` whose ``wv`` attribute holds the vectors.

    Raises:
        KeyError: if a ``substitute`` word is not present in the GloVe file.
    """
    print("Loading Glove Weight")
    glove_vector = {}
    with open(file, 'r', encoding=encoding) as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_vector[word] = embedding

    # Add the words that are missing from the GloVe vocabulary.
    if not_exist_path is not None:
        not_exists = pd.read_csv(not_exist_path)
        for word, substitute in zip(not_exists['word'], not_exists['substitute']):
            glove_vector[word] = glove_vector[substitute]

    glove_model = Glove_model(glove_vector)

    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Load the GloVe vectors and display the resulting vocabulary size.
glove_pretrained_model = load_glove_model('../data/Discrimination/glove.txt')
len(glove_pretrained_model.wv.index_to_key)

Loading Glove Weight
358085 words loaded!


358085

In [None]:
# Among the words missing from the pre-trained embedding vocabulary, pick the most important ones and handle them manually
# all = 0
# not_exists = {}
# not_exists_labels = {}

# for i in tqdm_notebook(range(len(train_data))):
#     sentence, label = train_data.loc[i, 'text'], train_data.loc[i, 'label']
#     for word in preprocess_korean_text(sentence):
#         if word not in glove_pretrained_model.wv:
#             if word in not_exists:
#                 not_exists[word] += 1
#             else:
#                 not_exists[word] = 1

#             if word not in not_exists_labels:
#                 not_exists_labels[word] = {n: 0 for n in range(7)}
#             not_exists_labels[word][label] += 1
#         all += 1

# not_exists = sorted(not_exists.items(), key=lambda x: x[1], reverse=True)
# not_labels = []
# for word, n in not_exists:
#     not_labels.append(sorted(not_exists_labels[word].items(), key=lambda x: x[1], reverse=True))
# print(all, len(not_exists))
# print(not_labels)

In [None]:
# not_exist_list = pd.DataFrame({'word': not_exists, 'label': not_labels})
# not_exist_list.to_csv('../data/Discrimination/not_exist_list.csv', index=False)

In [13]:
class KoreanTextDataset(Dataset):
    """Dataset turning rows of a text/label DataFrame into index tensors.

    Each item is a pair ``(indices, label)`` where ``indices`` is a 1-D long
    tensor of exactly ``max_length`` vocabulary indices and ``label`` is the
    row's label as a tensor.

    Args:
        data: DataFrame with a ``text`` and a ``label`` column.
        embed_model: object whose ``wv.key_to_index`` maps tokens to indices
            and contains a ``'<unk>'`` entry.
        preprocess_korean_text: callable mapping a sentence to a token list.
        max_length: number of tokens every sample is truncated or padded to.
    """

    def __init__(self, data, embed_model, preprocess_korean_text, max_length=100):
        self.data = data
        self.max_length = max_length
        self.preprocess_korean_text = preprocess_korean_text
        self.model = embed_model
        # Sorted distinct labels; class_to_idx is the inverse lookup.
        self.idx_to_class = sorted(data['label'].unique())
        self.class_to_idx = {
            label: position for position, label in enumerate(self.idx_to_class)
        }
        self.class_names = self.idx_to_class

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels (.loc) when the frame has a default index.
        text = self.data["text"].iloc[index]
        label = self.data["label"].iloc[index]

        # Preprocess text using the preprocess_korean_text() function
        tokens = self.preprocess_korean_text(text)

        # Truncate or pad to a fixed length. A new list is built so the list
        # returned by the tokenizer is never modified in place.
        tokens = list(tokens[:self.max_length])
        tokens.extend([""] * (self.max_length - len(tokens)))

        # Convert tokens to indices using the pre-trained GloVe or Word2Vec
        # embeddings; unknown tokens (including the "" padding) map to <unk>.
        key_to_index = self.model.wv.key_to_index
        indices = []
        for token in tokens:
            token_index = key_to_index.get(token)
            if token_index is None:
                # Looked up lazily, so a missing '<unk>' only raises when an
                # out-of-vocabulary token is actually encountered.
                token_index = key_to_index['<unk>']
            indices.append(token_index)

        return torch.tensor(indices, dtype=torch.long), torch.tensor(label)

In [14]:
# Embedding used to turn tokens into indices; the inference cell further down
# reuses this same variable, so it must match the model that gets trained.
embed_model = w2v_pretrained_model

# Wrap the full training frame; every sample is truncated or padded to
# max_length tokens.
train_dataset = KoreanTextDataset(
    data=train_data,
    embed_model=embed_model,
    preprocess_korean_text=preprocess_korean_text,
    max_length=max_length
)

# Split into training and validation subsets with a fixed seed.
# NOTE(review): split_dataset lives in python_scripts.data_setup (not shown
# here); the logged output reports splits of 59276 and 6587 samples.
train_dataset_sub, val_dataset_sub = data_setup.split_dataset(
    dataset=train_dataset,
    split_size=0.9,
    seed=42
)

[INFO] Splitting dataset of length 65863 into splits of size: 59276 and 6587


In [15]:
class Attention(nn.Module):
    """Additive (concat-and-score) attention over a sequence of encoder states.

    For every time step the query vector is concatenated with the encoder
    output, projected through ``attn`` + tanh, and reduced to a scalar score
    by ``v``. Scores are normalised with a softmax over the time axis.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Input is [query ; encoder output], hence twice the hidden size.
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (seq_len, batch, 1).

        Args:
            hidden: query of shape (batch, hidden_size).
            encoder_outputs: states of shape (seq_len, batch, hidden_size).
        """
        seq_len = encoder_outputs.size(0)

        # Copy the query along a new leading time axis so it lines up with
        # the encoder outputs for concatenation.
        tiled_query = hidden.unsqueeze(0).repeat(seq_len, 1, 1)
        joined = torch.cat((tiled_query, encoder_outputs), dim=2)

        scores = self.v(torch.tanh(self.attn(joined))).squeeze(2)

        # Normalise over the time axis (dim 0), then restore the last axis.
        weights = F.softmax(scores, dim=0)
        return weights.unsqueeze(2)

In [16]:
class RNN_LSTM_attention(nn.Module):
    """Text classifier: frozen embedding -> BiLSTM -> attention -> BiLSTM -> linear.

    Args:
        embedding_model: object whose ``wv.vectors`` holds the pre-trained
            embedding matrix and ``wv.vector_size`` its dimensionality.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of output classes.
        pre_LSTM_layers: layers of the LSTM that runs over the token sequence.
        post_LSTM_layers: layers of the LSTM that runs over the context
            vector. ``forward`` feeds the first LSTM's final state into the
            second one, so this must equal ``pre_LSTM_layers``.
        dropout: dropout probability used between LSTM layers, on the
            embeddings and before the final linear layer.
    """

    def __init__(self, embedding_model, hidden_dim, output_dim, pre_LSTM_layers, post_LSTM_layers, dropout):
        super().__init__()

        embedding_dim = embedding_model.wv.vector_size
        # Frozen lookup table initialised from the pre-trained vectors, built
        # through the documented constructor instead of the private `_weight`
        # keyword. The parameter name (embedding.weight) is unchanged.
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_model.wv.vectors), freeze=True
        )

        self.pre_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=pre_LSTM_layers, bidirectional=True, dropout=dropout)
        self.post_lstm = nn.LSTM(hidden_dim * 2, hidden_dim, num_layers=post_LSTM_layers, bidirectional=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Compute class logits.

        Args:
            text: token indices of shape (batch, seq_len).

        Returns:
            Logits of shape (batch, output_dim), including when batch == 1.
        """
        # embedded = [batch size, seq len, emb dim]
        embedded = self.dropout(self.embedding(text))

        # The LSTMs are not batch_first, so the sequence axis goes first.
        pre_lstm_outputs, (hidden, cell) = self.pre_lstm(embedded.permute(1, 0, 2))
        # pre_lstm_outputs = [seq len, batch size, hid dim * num directions]
        # hidden/cell = [num layers * num directions, batch size, hid dim]

        # Final forward and backward states of the top layer, used as query.
        h = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # h = [batch size, hid dim * num directions]

        attention_weights = self.attention(h, pre_lstm_outputs)
        # attention_weights = [seq len, batch size, 1]

        # Weighted sum of the encoder outputs over the sequence axis.
        context_vector = torch.bmm(
            pre_lstm_outputs.permute(1, 2, 0),
            attention_weights.permute(1, 0, 2),
        ).squeeze(2)
        # context_vector = [batch size, hid dim * num directions]

        # Run the context vector as a length-1 sequence through the second
        # LSTM, starting from the first LSTM's final state.
        _, (hidden, _) = self.post_lstm(context_vector.unsqueeze(0), (hidden, cell))
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # hidden = [batch size, hid dim * num directions]

        # No squeeze(0) here: `hidden` is already 2-D, and squeezing dim 0
        # would silently drop the batch axis whenever the batch size is 1.
        out = self.fc(self.dropout(hidden))
        # out = [batch size, output dim]

        return out


In [17]:
# Hyper-parameter candidates handed to the training helper.
# The learning rates are trained one after another. The original note said
# 10 epochs per learning rate, but `epochs_list` below (and the logged runs)
# use 5 epochs each.
learning_rate_list = [1e-3, 1e-4, 1e-5]
weight_decay_list = [1e-4]
epochs_list = [5]
batch_size_list = [64]

In [18]:
# Sorted distinct labels found in the training data and how many there are;
# num_classes sizes the model's output layer.
class_names, num_classes = train_dataset.class_names, len(train_dataset.class_names)
class_names, num_classes

([0, 1, 2, 3, 4, 5, 6], 7)

In [19]:
# Classifier built on the Word2Vec embeddings.
# The two layer counts are kept equal because the model passes the first
# LSTM's final state to the second LSTM.
model_w2v = RNN_LSTM_attention(
    embedding_model=w2v_pretrained_model,
    hidden_dim=512,
    output_dim=num_classes,
    pre_LSTM_layers=2,
    post_LSTM_layers=2,
    dropout=0.2
)

In [20]:
# Same architecture built on the GloVe embeddings, for comparison with the
# Word2Vec variant.
model_glove = RNN_LSTM_attention(
    embedding_model=glove_pretrained_model,
    hidden_dim=512,
    output_dim=num_classes,
    pre_LSTM_layers=2,
    post_LSTM_layers=2,
    dropout=0.2
)

In [21]:
# Show layer-by-layer parameter counts for both variants (torchinfo).
summary(model_w2v), summary(model_glove)

 Layer (type:depth-idx)                   Param #
 RNN_LSTM_attention                       --
 ├─Embedding: 1-1                         (35,804,400)
 ├─LSTM: 1-2                              15,114,240
 ├─LSTM: 1-3                              18,898,944
 ├─Dropout: 1-4                           --
 ├─Attention: 1-5                         --
 │    └─Linear: 2-1                       2,098,176
 │    └─Linear: 2-2                       1,024
 ├─Linear: 1-6                            7,175
 Total params: 71,923,959
 Trainable params: 36,119,559
 Non-trainable params: 35,804,400
 Layer (type:depth-idx)                   Param #
 RNN_LSTM_attention                       --
 ├─Embedding: 1-1                         (35,808,500)
 ├─LSTM: 1-2                              15,114,240
 ├─LSTM: 1-3                              18,898,944
 ├─Dropout: 1-4                           --
 ├─Attention: 1-5                         --
 │    └─Linear: 2-1                       2,098,176
 │    └─Linear: 2-

In [22]:
# Train the Word2Vec-based model. Keep this choice in sync with `embed_model`:
# the dataset's token indices are only valid for the embedding table they
# were built from.
model = model_w2v

# NOTE(review): HP_tune_train comes from python_scripts.engine, which is not
# shown here, so the meaning of each argument is inferred from its name only.
# NOTE(review): the saved output below shows run names containing "glove",
# so it appears to come from a different configuration than this cell.
tuning_results = engine.HP_tune_train(
    model=model,
    model_generator=None,
    model_weights=None,
    model_name='Two_LSTM_attention_w2v_discrimination',
    train_dataset=train_dataset_sub,
    test_dataset=val_dataset_sub,
    class_names=class_names,
    learning_rate_list=learning_rate_list,
    weight_decay_list=weight_decay_list,
    epochs_list=epochs_list,
    batch_size_list=batch_size_list,
    is_tensorboard_writer=False,
    device=device,
    gradient_accumulation_num=1,
    saving_max=True,
    metric_learning=False
)

Two_LSTM_attention_glove_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 0 | Train_loss: 1.2011, Train_acc: 0.5730 | Test_loss: 0.9428, Test_acc: 0.6725
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1_EPOCH_0_TEST-ACC_0.6725.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 1 | Train_loss: 0.9514, Train_acc: 0.6657 | Test_loss: 1.7333, Test_acc: 0.3874


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 2 | Train_loss: 1.5247, Train_acc: 0.4625 | Test_loss: 0.8467, Test_acc: 0.7065
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1_EPOCH_2_TEST-ACC_0.7065.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 3 | Train_loss: 0.8285, Train_acc: 0.7096 | Test_loss: 0.7483, Test_acc: 0.7382
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1_EPOCH_3_TEST-ACC_0.7382.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 4 | Train_loss: 0.7591, Train_acc: 0.7308 | Test_loss: 0.7274, Test_acc: 0.7405
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1_EPOCH_4_TEST-ACC_0.7405.pth


Two_LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 0 | Train_loss: 0.6798, Train_acc: 0.7591 | Test_loss: 0.6753, Test_acc: 0.7586
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1_EPOCH_0_TEST-ACC_0.7586.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 1 | Train_loss: 0.6574, Train_acc: 0.7643 | Test_loss: 0.6675, Test_acc: 0.7628
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1_EPOCH_1_TEST-ACC_0.7628.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 2 | Train_loss: 0.6480, Train_acc: 0.7695 | Test_loss: 0.6640, Test_acc: 0.7621


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 3 | Train_loss: 0.6416, Train_acc: 0.7704 | Test_loss: 0.6564, Test_acc: 0.7616


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 4 | Train_loss: 0.6334, Train_acc: 0.7738 | Test_loss: 0.6638, Test_acc: 0.7629
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1_EPOCH_4_TEST-ACC_0.7629.pth


Two_LSTM_attention_glove_discrimination_LR_1e-05_WD_0.0001_BS_64_GA_1:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 0 | Train_loss: 0.6154, Train_acc: 0.7789 | Test_loss: 0.6538, Test_acc: 0.7667
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_1e-05_WD_0.0001_BS_64_GA_1_EPOCH_0_TEST-ACC_0.7667.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 1 | Train_loss: 0.6150, Train_acc: 0.7784 | Test_loss: 0.6534, Test_acc: 0.7680
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_1e-05_WD_0.0001_BS_64_GA_1_EPOCH_1_TEST-ACC_0.7680.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 2 | Train_loss: 0.6107, Train_acc: 0.7801 | Test_loss: 0.6544, Test_acc: 0.7660


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 3 | Train_loss: 0.6107, Train_acc: 0.7815 | Test_loss: 0.6534, Test_acc: 0.7686
[INFO] Saving model to: ..\models\Two_LSTM_attention_glove_discrimination_LR_1e-05_WD_0.0001_BS_64_GA_1_EPOCH_3_TEST-ACC_0.7686.pth


train:   0%|          | 0/927 [00:00<?, ?it/s]

test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 4 | Train_loss: 0.6091, Train_acc: 0.7811 | Test_loss: 0.6518, Test_acc: 0.7682


In [None]:
# test_data = pd.read_csv('../data/Discrimination/test.csv')

# tt = []
# for i in tqdm_notebook(range(len(train_data))):
#     tt.append(len(preprocess_korean_text(train_data.iloc[i]['text'])))

# plt.boxplot(tt)
# print(max(tt), min(tt), np.mean(tt), np.var(tt))
# print(np.sum(np.array(tt) > 100), '/', len(tt))

In [None]:
# Restore a saved checkpoint into `model`.
# The path uses forward slashes like the other paths in this notebook: they
# work on Windows as well, and avoid the invalid "\m" / "\d" / "\L" escape
# sequences of the former non-raw backslash string.
# map_location lets a checkpoint saved on a GPU load on a CPU/MPS machine.
# NOTE(review): the file name says "glove" while `model` above is the
# Word2Vec variant; their embedding tables differ in size, so confirm the
# checkpoint matches the selected model.
loaded_weight = torch.load(
    '../models/discrimination/LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1_EPOCH_0_TEST-ACC_0.7751.pth',
    map_location=device,
)
model.load_state_dict(loaded_weight)

In [None]:
# Predict a label for every test sentence, one sentence at a time.
test_data = pd.read_csv('../data/Discrimination/test.csv')
labels = []

# Inputs are sent to `device` below, so the weights must live there too
# (they do not after a fresh load_state_dict without training).
model.to(device)
model.eval()

key_to_index = embed_model.wv.key_to_index

with torch.inference_mode():
    for test_text in tqdm_notebook(test_data["text"], total=len(test_data)):
        # Same truncate/pad scheme as KoreanTextDataset.__getitem__.
        test_tokens = list(preprocess_korean_text(test_text)[:max_length])
        test_tokens.extend([""] * (max_length - len(test_tokens)))

        indices = []
        for token in test_tokens:
            token_index = key_to_index.get(token)
            if token_index is None:
                # use the index of the <unk> token for out-of-vocabulary words
                token_index = key_to_index['<unk>']
            indices.append(token_index)

        batch = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
        test_logits = model(batch)

        # Flatten first so this works whether the model returns logits of
        # shape (1, num_classes) or (num_classes,); .item() yields a plain
        # int for indexing the class list.
        predicted = test_logits.flatten().argmax().item()
        labels.append(class_names[predicted])

In [None]:
# Attach the human-readable class description to each test row for manual
# inspection; the submission below uses the numeric labels instead.
test_data['label'] = [idx_to_class[label] for label in labels]
test_data

In [None]:
# Write the predictions in submission format (ID, label).
# NOTE(review): IDs are generated as 0..n-1 rather than read from the test
# file; confirm this matches the IDs in test.csv.
submission_data = pd.DataFrame({'ID': range(len(test_data)), 'label': labels})
submission_data.to_csv('../submissions/discrimination/submission.csv', index=False)
print('submission completed!')
submission_data.head()