In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [2]:
filepath_train = os.getenv('HOME')+'/aiffel/dktc/data/train.csv'

train = pd.read_csv(filepath_train)

train.head()

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [3]:
len(train)

3950

In [4]:
train.isnull().sum()

idx             0
class           0
conversation    0
dtype: int64

In [5]:
train['class'].drop_duplicates()

0          협박 대화
2      기타 괴롭힘 대화
3          갈취 대화
5    직장 내 괴롭힘 대화
Name: class, dtype: object

In [6]:
# 텍스트 클렌징 함수
def clean_text(text):
    # 불필요한 특수 문자, 숫자 제거 (한글, 영문, 공백 제외)
    text = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z\s]", "", text)
    text = re.sub(r"\n", " ", text)
    return text

train['conversation'] = train['conversation'].apply(clean_text)

In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [8]:
#bert 모델, vocab 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /aiffel/aiffel/dktc/.cache/kobert_v1.zip
using cached model. /aiffel/aiffel/dktc/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [9]:
train.loc[(train['class'] == "협박 대화"), 'label'] = 0  # 협박 대화 => 0
train.loc[(train['class'] == "갈취 대화"), 'label'] = 1  # 갈취 대화 => 1
train.loc[(train['class'] == "직장 내 괴롭힘 대화"), 'label'] = 2  # 직장 내 괴롭힘 대화 => 2
train.loc[(train['class'] == "기타 괴롭힘 대화"), 'label'] = 3  # 기타 괴롭힘 대화 => 3
train.loc[(train['class'] == "일반 대화"), 'label'] = 4  # 일반 대화 => 4

data_list = []
for content, label in zip(train['conversation'], train['label'])  :
    temp = []
    temp.append(content)
    temp.append(str(int(label)))

    data_list.append(temp)

In [10]:
train.drop(columns = 'idx', inplace = True)

In [11]:
data_list[0]

['지금 너 스스로를 죽여달라고 애원하는 것인가  아닙니다 죄송합니다  죽을 거면 혼자 죽지 우리까지 사건에 휘말리게 해 진짜 죽여버리고 싶게  정말 잘못했습니다  너가 선택해 너가 죽을래 네 가족을 죽여줄까  죄송합니다 정말 잘못했습니다  너에게는 선택권이 없어 선택 못한다면 너와 네 가족까지 모조리 죽여버릴거야  선택 못하겠습니다 한번만 도와주세요  그냥 다 죽여버려야겠군 이의 없지  제발 도와주세요',
 '0']

In [12]:
dataset_train, dataset_test = train_test_split(data_list, test_size = 0.2, random_state = 42)

In [28]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        
        # 레이블이 없는 경우를 처리
        if label_idx is not None:
            self.labels = [np.int32(i[label_idx]) for i in dataset]
        else:
            self.labels = None

    def __getitem__(self, i):
        if self.labels is not None:
            return self.sentences[i] + (self.labels[i], )
        else:
            return self.sentences[i]

    def __len__(self):
        return len(self.sentences)

In [14]:
## Setting parameters
max_len = 130
batch_size = 32
warmup_ratio = 0.2
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [15]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
#BERTDataset 클래스 이용, TensorDataset으로 만들어주기
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model. /aiffel/aiffel/dktc/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [16]:
# 배치 및 데이터로더 설정 (num_workers 최적화)
# num_workers는 시스템에 맞춰 적절한 값을 설정 (예: CPU 코어 수에 맞춰 4나 8 정도로 설정)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=8, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=8, shuffle=False)



In [17]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=5,
                 dr_rate=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, segment_ids):
        # attention mask는 token_ids에서 패딩(0)이 아닌 부분을 1로 설정
        attention_mask = (token_ids != 0).float()
        
        # BERT 모델에서 pooler 출력 사용
        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device))
        
        # 드롭아웃 적용
        if self.dr_rate:
            out = self.dropout(pooler)
        
        return self.classifier(out)

In [18]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [19]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [20]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [21]:
from sklearn.metrics import f1_score, precision_score, recall_score

# 성능 지표 계산 함수
def calc_metrics(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    pred_labels = max_indices.cpu().numpy()
    true_labels = Y.cpu().numpy()
    
    accuracy = (pred_labels == true_labels).sum() / len(true_labels)
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    
    return accuracy, f1, precision, recall

In [23]:
for e in range(num_epochs):
    # 각 에포크 시작 시 성능 지표 초기화
    train_acc = 0.0
    train_f1, train_precision, train_recall = 0.0, 0.0, 0.0  # 성능 지표 초기화
    test_acc, val_f1, val_precision, val_recall = 0.0, 0.0, 0.0, 0.0  # Validation 성능 지표 초기화
    
    model.train()
    
    for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        
        out = model(token_ids, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        
        # 성능 지표 계산
        batch_acc, batch_f1, batch_precision, batch_recall = calc_metrics(out, label)
        train_acc += batch_acc
        train_f1 += batch_f1
        train_precision += batch_precision
        train_recall += batch_recall
        
        if batch_id % log_interval == 0:
            print(f"Epoch {e+1} Batch {batch_id+1}: Loss {loss.data.cpu().numpy()}, Accuracy {train_acc/(batch_id+1)}, F1 {train_f1/(batch_id+1)}, Precision {train_precision/(batch_id+1)}, Recall {train_recall/(batch_id+1)}")
    
    print(f"Epoch {e+1}: Train Accuracy {train_acc/len(train_dataloader)}, F1 {train_f1/len(train_dataloader)}, Precision {train_precision/len(train_dataloader)}, Recall {train_recall/len(train_dataloader)}")
    
    # Validation 성능 계산
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            
            out = model(token_ids, segment_ids)
            batch_acc, batch_f1, batch_precision, batch_recall = calc_metrics(out, label)
            test_acc += batch_acc
            val_f1 += batch_f1
            val_precision += batch_precision
            val_recall += batch_recall
        
        print(f"Epoch {e+1}: Validation Accuracy {test_acc/len(test_dataloader)}, F1 {val_f1/len(test_dataloader)}, Precision {val_precision/len(test_dataloader)}, Recall {val_recall/len(test_dataloader)}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/99 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1 Batch 1: Loss 1.6723501682281494, Accuracy 0.25, F1 0.1846153846153846, Precision 0.1875, Recall 0.25


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Epoch 1: Train Accuracy 0.5083122895622896, F1 0.49024323397438996, Precision 0.5468266290820959, Recall 0.5083122895622896


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/25 [00:00<?, ?it/s]



Epoch 1: Validation Accuracy 0.7807954545454545, F1 0.7802289581237511, Precision 0.8244404935342434, Recall 0.7807954545454545


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/99 [00:00<?, ?it/s]



Epoch 2 Batch 1: Loss 0.5941647887229919, Accuracy 0.84375, F1 0.847885101010101, Precision 0.8797348484848485, Recall 0.84375
Epoch 2: Train Accuracy 0.829124579124579, F1 0.8287978409494354, Precision 0.8578396047633459, Recall 0.829124579124579


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/25 [00:00<?, ?it/s]



Epoch 2: Validation Accuracy 0.8695454545454545, F1 0.8679291340708641, Precision 0.8880900419025417, Recall 0.8695454545454545


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/99 [00:00<?, ?it/s]



Epoch 3 Batch 1: Loss 0.20815126597881317, Accuracy 0.9375, F1 0.9302083333333333, Precision 0.9488636363636364, Recall 0.9375
Epoch 3: Train Accuracy 0.9029882154882154, F1 0.9021810088746306, Precision 0.9155529459901303, Recall 0.9029882154882154


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/25 [00:00<?, ?it/s]



Epoch 3: Validation Accuracy 0.8727272727272727, F1 0.8729235794665641, Precision 0.8897269674769674, Recall 0.8727272727272727


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/99 [00:00<?, ?it/s]



Epoch 4 Batch 1: Loss 0.1419556438922882, Accuracy 0.96875, F1 0.969326923076923, Precision 0.9732142857142857, Recall 0.96875
Epoch 4: Train Accuracy 0.9531776094276094, F1 0.9532035098098544, Precision 0.9596055369177213, Recall 0.9531776094276094


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/25 [00:00<?, ?it/s]



Epoch 4: Validation Accuracy 0.8757954545454545, F1 0.87516361033855, Precision 0.8877358405483406, Recall 0.8757954545454545


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/99 [00:00<?, ?it/s]



Epoch 5 Batch 1: Loss 0.08512581884860992, Accuracy 0.9375, F1 0.9375, Precision 0.9375, Recall 0.9375
Epoch 5: Train Accuracy 0.9791666666666666, F1 0.9790149231653622, Precision 0.981278775762677, Recall 0.9791666666666666


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, _, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/25 [00:00<?, ?it/s]



Epoch 5: Validation Accuracy 0.8789772727272727, F1 0.8786155846184492, Precision 0.890780629093129, Recall 0.8789772727272727


In [32]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook

# test.json 파일 경로 설정
filepath_test = os.getenv('HOME')+'/aiffel/dktc/data/test.json'

# test.json 파일을 DataFrame으로 읽기 (pandas를 사용하는 경우)
test_data = pd.read_json(filepath_test)

# 각 열에 있는 'text' 데이터를 추출하여 하나의 리스트로 만듭니다.
test_list = []

for column in test_data.columns:
    conversation = test_data[column]['text']
    test_list.append([conversation])  # 각 대화를 리스트에 추가

# 텍스트 클렌징 함수 (train에서 사용한 것과 동일하게 적용)
def clean_text(text):
    text = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z\s]", "", text)
    text = re.sub(r"\n", " ", text)
    return text

# 전처리 적용
test_list = [[clean_text(conversation[0])] for conversation in test_list]

# 토큰화 및 데이터셋 변환 (BERTDataset 클래스 및 토크나이저는 이미 정의되어 있다고 가정)
data_test = BERTDataset(test_list, 0, None, tok, max_len, True, False)

# 테스트 데이터 로더 생성
test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=4)

# 모델 예측
model.eval()
predictions = []

with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        
        # 모델 예측
        out = model(token_ids, segment_ids)
        pred = torch.argmax(out, dim=1).cpu().numpy()  # 가장 높은 확률의 클래스를 예측
        predictions.extend(pred)

# 숫자 레이블을 클래스명으로 변환하는 매핑
label_mapping = {
    0: "0",
    1: "1",
    2: "2",
    3: "3",
    4: "4"
}

# 예측된 숫자 레이블을 클래스명으로 변환
predicted_classes = [label_mapping[pred] for pred in predictions]

# 예측 결과를 DataFrame으로 변환
submission = pd.DataFrame({
    'idx': test_data.columns,  # 각 대화에 해당하는 열 이름 (예: 't_000', 't_001' 등)
    'target': predicted_classes  # 예측된 클래스명
})

# 제출 파일로 저장
submission.to_csv('./mysubmission.csv', index=False)
print("Results saved to mysubmission.csv")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/16 [00:00<?, ?it/s]

Results saved to mysubmission.csv
