In [1]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from collections import Counter
import json
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import nltk.data
from annoy import AnnoyIndex
import sys

In [2]:
# package
import sys
sys.path.append("../")
 
from utils.tokenizer import Tokenize
from utils.vectorizer import Vectorize
from utils.processing import Preprocessing, check_profanity

# `Namespace 정의`

In [5]:
args = Namespace(
    raw_dataset_csv="../train.csv",
    preprocessed_csv="../preprocessed_train.csv",
    train_proportion=0.8,
    val_proportion=0.2,
    # 날짜와 경로 정보
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="../model_storage/toxicity_with_splits",
    # 모델 하이퍼파라미터
    embedding_size=400,
    max_seq_length = 1000,
    rnn_hidden_size=256,
    # 훈련 하이퍼파라미터
    seed=1337,
    num_epochs=5,#5,
    learning_rate=1e-3,
    batch_size=64,
    early_stopping_criteria=5,
    # 실행 옵션
    cuda='cuda',
    expand_filepaths_to_save_dir=True,
    vocab_max_length = 1000
)

# `Preprocessing`

In [6]:
preprocessing = Preprocessing(args.raw_dataset_csv)
toxicity_df = preprocessing.preprocess_comment()
toxicity_df.to_csv(args.preprocessed_csv, index=False)

# `Tokenizer(tokenize + vocabulary)`

In [7]:
comments = toxicity_df['comment'].values
tokenize = Tokenize()
tokenized_sents, vocab = tokenize.doc_tokenize(comments, train=True)
vocab_len = len(vocab)
vocab_len
# 22s

5871

# `Vectorize`

In [8]:
max_length = max(len(item) for item in tokenized_sents)
max_length

1405

In [9]:
vectorize = Vectorize(tokenized_sents, vocab, max_length)
out_vector, length = vectorize.vectorizer()

In [10]:
out_vector = np.array(out_vector)
out_vector.shape

(79785, 1405)

# `Dataset`

In [11]:
class dataset(Dataset):
    def __init__(self, out_vector, max_length, toxicity_df):
        self.out_vector = out_vector
        self.target = toxicity_df['toxicity']
        self.max_length = max_length
        
        self.x_data = self.out_vector
        self.y_target = self.target

        self.len = len(self.y_target)

    def __getitem__(self, index):
        x_data = self.x_data[index]
        y_target = self.y_target[index]

        x_data = torch.tensor(x_data)
        y_target = torch.tensor(y_target)

        ## 여기 수정해야함 </s>(idx:2) 이거 없는 코멘트는 맨 끝 인덱스로 - 수정함
        length = torch.where(x_data == 2)[0]
        if len(length) == 0:
            length = args.max_seq_length#torch.tensor([0])
        else:
            length = length.item()

        return {'x_data': x_data.to(args.cuda),
                'y_target': y_target.to(args.cuda),
                'x_length': length}
       

    def __len__(self):
        return self.len


In [12]:
class dataset(Dataset):
    def __init__(self, out_vector, max_length, toxicity_df, is_test=False):
        self.out_vector = out_vector
        self.max_length = max_length
        self.is_test = is_test
        
        if not is_test:
            self.target = toxicity_df['toxicity']
            self.y_target = torch.tensor(self.target)
        else:
            self.y_target = None

        self.x_data = torch.tensor(self.out_vector)
        self.len = len(self.x_data)

    def __getitem__(self, index):
        x_data = self.x_data[index]

        x_data = torch.tensor(x_data)

        length = torch.where(x_data == 2)[0]
        if len(length) == 0:
            length = self.max_length
        else:
            length = length.item()

        if self.is_test:
            return {'x_data': x_data.to(args.cuda),
                    'x_length': length}
        else:
            y_target = self.y_target[index]
            y_target = torch.tensor(y_target)
            return {'x_data': x_data.to(args.cuda),
                    'y_target': y_target.to(args.cuda),
                    'x_length': length}
       

    def __len__(self):
        return self.len


# `Model`

In [13]:
class SentimentRNN(nn.Module):
    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, output_dim, bidirectional, drop_prob=0.5):
        super(SentimentRNN, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        self.bidirectional = bidirectional

        # 임베딩 및 LSTM 레이어
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=no_layers, batch_first=True, bidirectional=self.bidirectional)

        # 드롭아웃 레이어
        self.dropout = nn.Dropout(drop_prob)
        
        # 완전 연결 및 시그모이드 레이어
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    
    def attention_net(self, lstm_output, final_state): # lstm_output - (batch size, maxlen, hidden*2), final_state - (1, batch size, hidden*2)
        hidden = final_state.squeeze(0) # (batch size, hidden*2)
        attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2) # (batch size, maxlen)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2) # (batch size, hidden*2)

        return new_hidden_state
    
    def forward(self, x_in, hidden):
        batch_size = x_in.size(0)

        # 임베딩 및 LSTM 출력
        embeds = self.embedding(x_in)
        lstm_out, hidden = self.lstm(embeds, hidden)
        attn_output = self.attention_net(lstm_out, lstm_out.transpose(0, 1)[-1])

        # 드롭아웃과 완전 연결 레이어
        attn_output = attn_output.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(attn_output)
        out = self.fc(out)

        # 시그모이드 활성화 함수
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # 배치의 마지막 레이블 가져오기

        # 시그모이드 출력과 히든 상태 반환
        return sig_out, hidden

    def init_hidden(self, batch_size):
        # 히든 상태 초기화
        dim = self.no_layers
        if self.bidirectional :
            dim = self.no_layers * 2
            
        h0 = torch.zeros((dim, batch_size, self.hidden_dim)).to(args.cuda)
        c0 = torch.zeros((dim, batch_size, self.hidden_dim)).to(args.cuda)
        return h0, c0


# `Prepare`

In [14]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [15]:
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

In [16]:
# df = pd.read_csv('./data/toxicity_with_splits.csv')
df = toxicity_df

df_dataset = dataset(out_vector, max_length, df)

dataset_size = len(df_dataset)
train_size = int(dataset_size * args.train_proportion)
val_size = dataset_size - train_size

# train_dataset, validation_dataset = random_split(df_dataset, [train_size, val_size])

train_dataloader = DataLoader(dataset=df_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False)
# val_dataloader = DataLoader(dataset=validation_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False)

len(df_dataset)#, len(validation_dataset)

79785

In [17]:

# classifier = commentClassifier(embedding_size=args.embedding_size, 
#                                num_embeddings=len(vocab),
#                                num_classes=1,
#                                rnn_hidden_size=args.rnn_hidden_size,)

no_layers = 2
classifier = SentimentRNN(no_layers, vocab_len, args.rnn_hidden_size, args.embedding_size, bidirectional=True, output_dim=1).to(args.cuda)
classifier = classifier.to('cuda')

loss_func = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=50, T_mult=2, eta_min=0)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                            mode='min', factor=0.5,
#                                            patience=1)

print(classifier)

SentimentRNN(
  (embedding): Embedding(5871, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


# `Train`

In [18]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score


epoch_train_loss = []
epoch_val_loss = []
epoch_f1_score = []
epoch_roc_auc_score = []

for epoch_index in range(args.num_epochs):

    # ----------------------------------------------train
    train_loss = []
    train_preds = []
    train_labels = []
    train_target_labels = []
    
    clip = 5
    classifier.train()

    batch_index = 0
    train_loop = tqdm(train_dataloader, leave=True)
    for batch_dict in train_loop:
        optimizer.zero_grad()
        
        h = classifier.init_hidden(batch_dict['x_data'].shape[0])
        y_pred, _ = classifier(x_in=batch_dict['x_data'], hidden=h)
        loss = loss_func(y_pred.squeeze(), batch_dict['y_target'].float())
        loss.backward()
        nn.utils.clip_grad_norm_(classifier.parameters(), clip)
        optimizer.step()
        # acc_t = accuracy_score(y_pred, batch_dict['y_target'])
        # acc_t = accuracy_score(y_pred.detach().cpu().numpy(), batch_dict['y_target'].detach().cpu().numpy())

        y_label = torch.round(y_pred).squeeze()
        y_target = batch_dict['y_target']
        
        train_loss.append(loss.item())
        train_preds.extend(y_pred.detach().cpu().numpy())
        train_labels.extend(y_label.detach().cpu().numpy())
        train_target_labels.extend(y_target.detach().cpu().numpy())

        batch_index += 1
        train_loop.set_postfix(batch_loss=loss.item())
        
    # -- train scoring --
    train_f1score = f1_score(train_labels, train_target_labels)
    train_roc_auc = roc_auc_score(train_labels, train_target_labels)
    
    
    print("train --- f1 score: {:.4f}, roc auc: {:.4f}, loss: {:.4f}".format(train_f1score, train_roc_auc, np.mean(train_loss)))
        
    # ----------------------------------------------valid
    # val_loss = []
    # val_preds = []
    # val_labels = []
    # val_target_labels = []
    
    # classifier.eval()

    # batch_index = 0
    # val_loop = tqdm(val_dataloader, leave=True)
    # for batch_dict in val_loop:
    #     optimizer.zero_grad()
        
    #     h = classifier.init_hidden(batch_dict['x_data'].shape[0])
    #     y_pred, _ = classifier(x_in=batch_dict['x_data'], hidden=h)
    #     loss = loss_func(y_pred.squeeze(), batch_dict['y_target'].float())

    #     y_label = torch.round(y_pred).squeeze()
    #     y_target = batch_dict['y_target']
        
    #     val_loss.append(loss.item())
    #     val_preds.extend(y_pred.detach().cpu().numpy())
    #     val_labels.extend(y_label.detach().cpu().numpy())
    #     val_target_labels.extend(y_target.detach().cpu().numpy())


    #     val_loop.set_postfix(batch_loss=loss.item())
    
    # # -- valid scoring --
    # val_f1score = f1_score(val_labels, val_target_labels)
    # val_roc_auc = roc_auc_score(val_labels, val_target_labels)
    
    # print("Epoch : ", epoch_index)
    # print("train --- f1 score: {:.4f}, roc auc: {:.4f}, loss: {:.4f}".format(train_f1score, train_roc_auc, np.mean(train_loss)))
    # print("valid --- f1 score: {:.4f}, roc auc: {:.4f}, loss: {:.4f}".format(val_f1score, val_roc_auc, np.mean(val_loss)))
    
    
    
    # epoch_train_loss.append(np.mean(train_loss))
    # epoch_val_loss.append(np.mean(val_loss))
    
    # epoch_f1_score.append(val_f1score)
    # epoch_roc_auc_score.append(val_roc_auc)

  x_data = torch.tensor(x_data)
  y_target = torch.tensor(y_target)
  3%|▎         | 42/1247 [00:09<04:29,  4.48it/s, batch_loss=0.441]


KeyboardInterrupt: 

# `Test inference`

In [None]:
# valid_path = '../2-Toxicity/validation.csv'
test_path = '../2-Toxicity/test_for_inference.csv'

# preprocessing
preprocessing = Preprocessing(test_path)
test_toxicity_df = preprocessing.preprocess_comment()
# toxicity_df.to_csv('../2-Toxicity/preprocessed_test.csv', index=False)

# tokenize
comments = test_toxicity_df['comment'].values
tokenize = Tokenize()
tokenized_sents, _ = tokenize.doc_tokenize(comments, train=True)
# vocab_len = len(_)
# vocab_len

In [None]:
# vectorize
max_length = max(len(item) for item in tokenized_sents)
vectorize = Vectorize(tokenized_sents, vocab, max_length)
out_vector, length = vectorize.vectorizer()
out_vector = np.array(out_vector)
out_vector.shape

(31915, 1343)

In [None]:
test_dataset = dataset(out_vector, max_length, test_toxicity_df, is_test=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False)

In [None]:
test_preds = []
test_labels = []
test_target_labels = []

classifier.eval()

batch_index = 0
test_loop = tqdm(test_dataloader, leave=True)
for batch_dict in test_loop:
    optimizer.zero_grad()
    
    h = classifier.init_hidden(batch_dict['x_data'].shape[0])
    y_pred, _ = classifier(x_in=batch_dict['x_data'], hidden=h)

    y_label = torch.round(y_pred).int().squeeze()
    test_preds.extend(y_pred.detach().cpu().numpy())
    test_labels.extend(y_label.detach().cpu().numpy())


    test_loop.set_postfix()

# -- valid scoring --
# val_f1score = f1_score(val_labels, val_target_labels)
# val_roc_auc = roc_auc_score(val_labels, val_target_labels)

# print("Epoch : ", epoch_index)
# print("valid --- f1 score: {:.4f}, roc auc: {:.4f}, loss: {:.4f}".format(val_f1score, val_roc_auc, np.mean(val_loss)))


  x_data = torch.tensor(x_data)
100%|██████████| 499/499 [00:33<00:00, 14.97it/s]


In [None]:
temp_preds = test_preds.copy()
temp_labels = test_labels.copy()

In [None]:
# count = 2 , p = 0.3

p = 0.3

for idx, comment in enumerate(comments) :
    if check_profanity(comment) :
        if (temp_preds[idx] + p) <=  1 :
            temp_preds[idx] += p
            
answer = np.round(temp_preds)

In [None]:
sample_submit = pd.read_csv('../2-Toxicity/sample_submission.csv')
sample_submit['probability'] = temp_preds
sample_submit['pred'] = temp_labels

In [None]:
sample_submit.to_csv('../2-Toxicity/submission.csv', index=False)