## KDT AI: Classifying Prejudice and Discrimination Texts
https://www.kaggle.com/competitions/kdtai-2/overview

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from torchinfo import summary

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt

import re
from konlpy.tag import Mecab

import sys
sys.path.append('..')

from python_scripts import data_setup, engine


In [2]:
# Human-readable description of every label id; the position in the list is the id.
idx_to_class = dict(enumerate([
    'Origin(출신차별)',
    'Physical(외모차별) 외모(신체, 얼굴) 및 장애인 차별 발언을 포함합니다.',
    'Politics(정치성향차별)',
    'Profanity(혐오욕설) 욕설,저주,혐오 단어, 비속어 및 기타 혐오 발언을 포함합니다.',
    'Age(연령차별)',
    'Gender(성차별) 성별 또는 성적 취향에 대한 차별 발언을 포함합니다.',
    'Not Hate Speech(해당사항없음)',
]))

# Fixed number of tokens per sample; shorter texts are padded and longer ones truncated.
max_length = 200

In [3]:
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [4]:
# Load the labelled training set; the path is relative to the notebook's directory.
train_data = pd.read_csv('../data/Discrimination/train.csv')
# Display the frame (columns shown in the recorded output: ID, text, label).
train_data

Unnamed: 0,ID,text,label
0,0,유소영비호감 성형아줌마,1
1,1,나오지마라 썅,3
2,2,식상하고 지긋지긋했는데 잘 끝나네 오예 소리벗고 빤스질러~~!!!,6
3,3,성희롱 당할 얼굴이 아닌데?ㅋㅋㅋ,5
4,4,"""끝까지 해보자~쪽파리 원숭이 자한 쓰레기당""",0
...,...,...,...
65858,65858,ㅋ ㅋ 쇼~~~ 도 적당히,6
65859,65859,"""이젠 전라도 종것들 음식 불매다. 태양광 중금속 환경오염물로 만든 식품 사절이다""",0
65860,65860,조센징들은 참 피곤하게 산다,0
65861,65861,"""문빠 다모아서 빨갱이한테보내고 행복하게살라고""",2


In [5]:
# Length of the longest raw text, measured in characters (not tokens).
max(map(len, train_data['text']))

306

In [8]:
# Particles dropped right after tokenization (optional; adjust if needed).
_KOREAN_STOP_WORDS = frozenset(["은", "는", "이", "가", "을", "를", "에", "의", "로", "으로", "에서"])

# URL / mention strippers. The original author notes that URLs and e-mail addresses
# do not really occur in this competition's data, so these matter little here.
_URL_PATTERN = re.compile(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*")
_MENTION_PATTERN = re.compile(r"@(\w+)")

# Everything outside the Hangul jamo / syllable ranges (punctuation, Latin, digits, ...).
_NON_HANGUL_PATTERN = re.compile(r"[^\u3131-\u3163\uac00-\ud7a3]+")


def preprocess_korean_text(text):
    """Tokenize a Korean sentence into cleaned morphemes.

    Steps, in order:
      1. strip URLs and ``@mentions`` from the raw text;
      2. split the text into morphemes with Mecab (another tokenizer could be
         substituted here);
      3. drop morphemes that are in the stop-word list;
      4. remove every non-Hangul character from each remaining morpheme;
      5. drop morphemes that became empty.

    Args:
        text: The raw input string.

    Returns:
        A new list of non-empty token strings.
    """
    text = _URL_PATTERN.sub("", text)
    text = _MENTION_PATTERN.sub("", text)

    # Build the Mecab tagger once and reuse it. This function is called for every
    # row of the corpus and again on every dataset access, so constructing a new
    # tagger per call is pure overhead.
    tagger = getattr(preprocess_korean_text, "_tagger", None)
    if tagger is None:
        tagger = preprocess_korean_text._tagger = Mecab()

    tokens = []
    for morph in tagger.morphs(text):
        # The stop-word test runs BEFORE character stripping, as in the original
        # pipeline, so e.g. "은!" is kept (as "은") while a bare "은" is dropped.
        if morph in _KOREAN_STOP_WORDS:
            continue
        morph = _NON_HANGUL_PATTERN.sub("", morph)
        if morph:
            tokens.append(morph)

    return tokens

In [9]:
# Smoke test: tokenize one short sentence and inspect the resulting tokens.
preprocess_korean_text('나는 지금 뭐하고 있느냐?')

['나', '지금', '뭐하고', '있느냐']

In [10]:
# Two-way vocabulary built from the training texts:
#   word_index_to_key: list, index -> token
#   word_key_to_index: dict, token -> index
# Indices are assigned in order of first appearance.
word_index_to_key = []
word_key_to_index = {}

# Iterate the text column directly instead of `train_data.iloc[i]['text']`, which
# materialises a whole row Series for every sample just to read one field.
for text in tqdm_notebook(train_data['text'], 'Making word maps'):
    for token in preprocess_korean_text(text):
        if token not in word_key_to_index:
            word_key_to_index[token] = len(word_index_to_key)
            word_index_to_key.append(token)

# The out-of-vocabulary marker takes the last index. The tokenizer never emits an
# empty string, so the "" padding used when encoding samples is not in the
# vocabulary and therefore also resolves to '<unk>'.
word_key_to_index['<unk>'] = len(word_index_to_key)
word_index_to_key.append('<unk>')

Making word maps:   0%|          | 0/65863 [00:00<?, ?it/s]

In [11]:
# Vocabulary size, including the trailing '<unk>' entry.
len(word_index_to_key)

36746

In [12]:
class KoreanTextDataset(Dataset):
    """Map-style dataset that turns rows of a text/label DataFrame into tensors.

    Each item is ``(indices, label)``: ``indices`` is a 1-D integer tensor of
    exactly ``max_length`` vocabulary indices and ``label`` is the row's value in
    the ``"label"`` column wrapped in a 0-dim tensor.

    Args:
        data: DataFrame with a ``"text"`` and a ``"label"`` column.
        preprocess_korean_text: Callable mapping a raw string to a list of tokens.
        max_length: Number of tokens per sample; longer token lists are truncated,
            shorter ones are padded with empty strings.
        vocab: Optional token -> index mapping that contains a ``'<unk>'`` key.
            When omitted, the module-level ``word_key_to_index`` is looked up at
            access time (the original behaviour).

    Attributes:
        idx_to_class / class_names: sorted unique label values found in ``data``.
        class_to_idx: mapping from label value to its position in ``idx_to_class``.
    """

    def __init__(self, data, preprocess_korean_text, max_length=100, vocab=None):
        self.data = data
        self.max_length = max_length
        self.preprocess_korean_text = preprocess_korean_text
        self.vocab = vocab
        self.idx_to_class = sorted(data['label'].unique())
        self.class_to_idx = {label: i for i, label in enumerate(self.idx_to_class)}
        self.class_names = self.idx_to_class

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional access: samplers hand out positions in [0, len), which only
        # coincide with index *labels* for a default RangeIndex. `.iloc` also raises
        # IndexError past the end, as the sequence protocol expects.
        text = self.data["text"].iloc[index]
        label = self.data["label"].iloc[index]

        # Tokenize, then truncate or pad to exactly `max_length` tokens. A new list
        # is built so the list returned by the preprocessing callable is not mutated.
        tokens = list(self.preprocess_korean_text(text))[:self.max_length]
        tokens = tokens + [""] * (self.max_length - len(tokens))

        # Map tokens to vocabulary indices. Unknown tokens, including the ""
        # padding, fall back to the '<unk>' index. No pre-trained embeddings are
        # involved at this stage.
        mapping = self.vocab if self.vocab is not None else word_key_to_index
        indices = [
            mapping[token] if token in mapping else mapping['<unk>']
            for token in tokens
        ]

        return torch.tensor(indices), torch.tensor(label)

In [13]:
# Wrap the full training frame; every sample is padded/truncated to `max_length` tokens.
train_dataset = KoreanTextDataset(
    data=train_data,
    preprocess_korean_text=preprocess_korean_text,
    max_length=max_length
)

# Split into train / validation subsets with a fixed seed. In the recorded run this
# produced 65797 and 66 samples, i.e. almost everything is used for training.
# NOTE(review): split_dataset lives in the project-local data_setup module; presumably
# split_size is the training fraction — confirm against its implementation.
train_dataset_sub, val_dataset_sub = data_setup.split_dataset(
    dataset=train_dataset,
    split_size=0.999,
    seed=42
)

[INFO] Splitting dataset of length 65863 into splits of size: 65797 and 66


In [14]:
class Attention(nn.Module):
    """Additive attention over a sequence of encoder states.

    Every time step of ``encoder_outputs`` is concatenated with the query vector,
    passed through ``tanh(Linear)`` and reduced to a scalar score; the scores are
    normalised with a softmax along the sequence axis (dim 0).
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Projects [query ; encoder state] (2 * hidden_size) back to hidden_size.
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        # Collapses each projected vector to one unnormalised score.
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights.

        Args:
            hidden: query tensor of shape [batch, hidden_size].
            encoder_outputs: tensor of shape [seq_len, batch, hidden_size].

        Returns:
            Tensor of shape [seq_len, batch, 1] whose values sum to 1 over dim 0.
        """
        seq_len = encoder_outputs.size(0)

        # Broadcast the query along the sequence axis (a view, no data copy).
        query = hidden.unsqueeze(0).expand(seq_len, -1, -1)

        combined = torch.cat((query, encoder_outputs), dim=2)
        scores = self.v(torch.tanh(self.attn(combined))).squeeze(2)

        weights = F.softmax(scores, dim=0)
        return weights.unsqueeze(2)

In [15]:
class RNN_LSTM_attention(nn.Module):
    """Bidirectional LSTM text classifier with attention pooling.

    Pipeline: frozen embedding -> dropout -> multi-layer bidirectional LSTM ->
    attention over all time steps (queried by the last layer's final hidden
    states) -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits.
        LSTM_layers: Number of stacked LSTM layers.
        dropout: Dropout probability (embeddings, between LSTM layers, before fc).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, LSTM_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The embedding table is excluded from gradient updates.
        # NOTE(review): no pre-trained vectors are loaded in this class, so unless
        # weights are loaded afterwards the table stays at its random initialisation.
        for param in self.embedding.parameters():
            param.requires_grad = False

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=LSTM_layers,
            bidirectional=True,
            # Inter-layer dropout only exists with more than one layer; passing a
            # non-zero value for a single layer just triggers a warning.
            dropout=dropout if LSTM_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Classify a batch of index sequences.

        Args:
            text: integer tensor of shape [batch, seq_len].

        Returns:
            Logits of shape [batch, output_dim], including when batch == 1.
        """
        # embedded = [batch, seq_len, emb_dim]
        embedded = self.dropout(self.embedding(text))

        # The LSTM is sequence-first (batch_first is not set), hence the permute.
        # lstm_outputs = [seq_len, batch, hid_dim * 2]
        # hidden       = [num_layers * 2, batch, hid_dim]
        lstm_outputs, (hidden, _) = self.lstm(embedded.permute(1, 0, 2))

        # The last two entries of `hidden` are the forward and backward final states
        # of the top layer; concatenated they form the query: [batch, hid_dim * 2]
        h = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # attention_weights = [seq_len, batch, 1]
        attention_weights = self.attention(h, lstm_outputs)

        # Weighted sum over time:
        # [batch, hid*2, seq_len] @ [batch, seq_len, 1] -> [batch, hid*2, 1] -> [batch, hid*2]
        context_vector = torch.bmm(
            lstm_outputs.permute(1, 2, 0),
            attention_weights.permute(1, 0, 2),
        ).squeeze(2)

        # No squeeze(0) here: it used to drop the batch dimension for batches of
        # size 1, returning [output_dim] instead of [1, output_dim].
        out = self.fc(self.dropout(context_vector))
        # out = [batch, output_dim]

        return out


In [16]:
# Hyper-parameter grids handed to engine.HP_tune_train (one value each here).
learning_rate_list = [1e-4] # Original note: each LR is trained back-to-back for 10 epochs (epochs_list below is currently [1])
weight_decay_list = [1e-4]
epochs_list = [1]
batch_size_list = [64]

In [17]:
# Sorted unique label values collected by the dataset, and how many there are.
class_names, num_classes = train_dataset.class_names, len(train_dataset.class_names)
class_names, num_classes

([0, 1, 2, 3, 4, 5, 6], 7)

In [18]:
# Instantiate the classifier. vocab_size covers every vocabulary entry including
# '<unk>', and output_dim matches the number of label values.
# The model is not moved to `device` in this cell.
model = RNN_LSTM_attention(
    vocab_size=len(word_index_to_key),
    embedding_dim=100,
    hidden_dim=512,
    output_dim=num_classes,
    LSTM_layers=2,
    dropout=0.2
)

In [19]:
# Print the per-layer parameter counts (torchinfo).
summary(model)

Layer (type:depth-idx)                   Param #
RNN_LSTM_attention                       --
├─Embedding: 1-1                         (3,674,600)
├─LSTM: 1-2                              8,814,592
├─Dropout: 1-3                           --
├─Attention: 1-4                         --
│    └─Linear: 2-1                       2,098,176
│    └─Linear: 2-2                       1,024
├─Linear: 1-5                            7,175
Total params: 14,595,567
Trainable params: 10,920,967
Non-trainable params: 3,674,600

In [20]:
# Train via the project-local engine helper using the grids defined above.
# NOTE(review): HP_tune_train is defined in python_scripts/engine.py, which is not
# visible here; the meaning of each flag below should be confirmed against it.
# The recorded run of this cell was interrupted manually (KeyboardInterrupt).
tuning_results = engine.HP_tune_train(
    model=model,
    model_generator=None,
    model_weights=None,
    model_name='LSTM_attention_discrimination',
    train_dataset=train_dataset_sub,
    test_dataset=val_dataset_sub,  # the 66-sample hold-out split serves as the test set
    class_names=class_names,
    learning_rate_list=learning_rate_list,
    weight_decay_list=weight_decay_list,
    epochs_list=epochs_list,
    batch_size_list=batch_size_list,
    is_tensorboard_writer=False,
    device=device,
    gradient_accumulation_num=1,
    saving_max=False,
    metric_learning=False
)

LSTM_attention_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1:   0%|          | 0/1 [00:00<?, ?it/s]

train:   0%|          | 0/1029 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# Persist only the parameters (state_dict), not the whole module object.
# NOTE(review): this writes the "_full_3" checkpoint, while the next cell loads
# "_full_5" — confirm which checkpoint is the intended one.
torch.save(model.state_dict(), '../models/discrimination/LSTM_attention_discrimination_full_3.pth')

In [46]:
# Load a previously saved checkpoint into the model.
# - Forward slashes work on every OS. The old backslash path only resolved on
#   Windows and relied on invalid escape sequences ("\m", "\d", "\L").
# - map_location remaps the stored tensors onto the active device, so a checkpoint
#   written on a CUDA machine can still be loaded on a CPU/MPS-only machine.
loaded_weight = torch.load(
    '../models/discrimination/LSTM_attention_discrimination_full_5.pth',
    map_location=device,
)
model.load_state_dict(loaded_weight)

<All keys matched successfully>

In [47]:
test_data = pd.read_csv('../data/Discrimination/test.csv')

# Number of test texts scored per forward pass. Batching replaces the former
# one-row-at-a-time loop, which ran one LSTM pass per row.
inference_batch_size = 128


def _encode_for_inference(text):
    """Tokenize `text` and return exactly `max_length` vocabulary indices.

    Mirrors the encoding used for training samples: truncate to `max_length`, pad
    with empty strings, and map unknown tokens (including padding) to '<unk>'.
    """
    tokens = preprocess_korean_text(text)[:max_length]
    tokens = tokens + [""] * (max_length - len(tokens))
    unk_index = word_key_to_index['<unk>']
    return [word_key_to_index.get(token, unk_index) for token in tokens]


labels = []

# Make sure the parameters live on the same device as the inputs, and switch
# dropout off for inference.
model.to(device)
model.eval()

with torch.inference_mode():
    for start in tqdm_notebook(range(0, len(test_data), inference_batch_size)):
        batch_texts = test_data['text'].iloc[start:start + inference_batch_size]
        batch_indices = torch.tensor(
            [_encode_for_inference(text) for text in batch_texts]
        ).to(device)

        # reshape keeps a [batch, num_classes] layout even if the model returns a
        # flat vector for a final batch of size 1.
        batch_logits = model(batch_indices).reshape(len(batch_texts), -1)

        for class_position in batch_logits.argmax(dim=1).cpu().tolist():
            labels.append(class_names[class_position])

  0%|          | 0/13491 [00:00<?, ?it/s]

In [48]:
# Attach the human-readable class description to every test row for eyeballing.
# This adds a 'label' column to test_data in place; the submission file below is
# built from the numeric `labels` list instead.
test_data['label'] = [idx_to_class[label] for label in labels]
test_data

Unnamed: 0,ID,text,label
0,0,"""솔직히 우리나라 청년들도 불쌍하고 아재들도 불쌍하고 노인들도 불쌍하다. 나라가 참""",Not Hate Speech(해당사항없음)
1,1,그만 보고싶네요 .늙은애들은.,Age(연령차별)
2,2,"""더러운 개신교벌레 새퀴""","Profanity(혐오욕설) 욕설,저주,혐오 단어, 비속어 및 기타 혐오 발언을 포..."
3,3,근데전태수씨 사망이유가뭔가요그어떤기사에도 나오질않네요,Not Hate Speech(해당사항없음)
4,4,"""태극기부대와 틀닭바퀴충들에게 순시리는 국모다. ㅉㅉ""",Politics(정치성향차별)
...,...,...,...
13486,13486,"""ㅅㅂ저년 한국인 아니라서 법적으로 어떻게 못하는거아님?OOO아 ㅅㅂ 느그나라로 꺼져""","Profanity(혐오욕설) 욕설,저주,혐오 단어, 비속어 및 기타 혐오 발언을 포..."
13487,13487,"""틀딱 택시기사 왔는가""",Age(연령차별)
13488,13488,"""틀딱들의 흔한 망상중 하나죠ㅋㅋ""",Age(연령차별)
13489,13489,"""빨갱이 새....끼 역시 잔인하노 능지처참은 중국놈인 니놈이고""",Politics(정치성향차별)


In [49]:
# Build the submission from the IDs that actually ship with the test file rather
# than assuming they are 0..n-1; `labels` is in the same row order as test_data.
submission_data = pd.DataFrame({'ID': test_data['ID'].to_numpy(), 'label': labels})
submission_data.to_csv('../submissions/discrimination/submission_LSTM_attention_discrimination_full_5.csv', index=False)
print('submission completed!')
submission_data.head()

submission completed!


Unnamed: 0,ID,label
0,0,6
1,1,4
2,2,3
3,3,6
4,4,2
