In [2]:
!pip install transformers datasets torch
!pip install transformers accelerate

from datasets import load_dataset
from transformers import ElectraForTokenClassification, ElectraTokenizerFast, AdamW, get_scheduler
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss



Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
# 데이터셋 로드
ds = load_dataset("humane-lab/K-HATERS")

# 레이블 매핑
label_mapping = {
    "normal": 0,
    "offensive": 1,
    "L1_hate": 2,
    "L2_hate": 3
}

# 토크나이저 및 라벨 생성
tokenizer = ElectraTokenizerFast.from_pretrained("beomi/KcELECTRA-base-v2022")

# 기존 방식 0 or 1 tag
# 노멀:0, 유해:1로 처리하는 함수 정의
# def create_token_labels(texts, rationales, tokenizer):
#     tokenized_texts = tokenizer(texts, truncation=True, padding=True, return_offsets_mapping=True)
#     labels = []

#     for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
#         token_labels = [0] * len(tokenized_texts['input_ids'][i])
#         for span in rationale_spans:
#             start, end = span
#             for idx, (offset_start, offset_end) in enumerate(tokenized_texts['offset_mapping'][i]):
#                 if offset_start >= start and offset_end <= end:
#                     token_labels[idx] = 1
#         labels.append(token_labels)

#     tokenized_texts.pop('offset_mapping')
#     tokenized_texts['labels'] = labels
#     return tokenized_texts

# # 위 함수를 통해 학습 데이터를 split함.
# train_data = create_token_labels(ds['train']['text'], ds['train']['offensiveness_rationale'], tokenizer)
# validation_data = create_token_labels(ds['validation']['text'], ds['validation']['offensiveness_rationale'], tokenizer)
# test_data = create_token_labels(ds['test']['text'], ds['test']['offensiveness_rationale'], tokenizer)

# 변경 방식 with BIO tag
def create_bio_labels(texts, rationales, tokenizer):
    tokenized_texts = tokenizer(
        texts, truncation=True, padding=True, return_offsets_mapping=True
    )
    labels = []

    for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
        # 모든 토큰에 'O'로 초기화
        token_labels = ["O"] * len(tokenized_texts["input_ids"][i])

        # 유해 스팬과 오프셋 매핑 비교
        for span in rationale_spans:
            start, end = span  # 유해 스팬의 시작과 끝
            for idx, (offset_start, offset_end) in enumerate(tokenized_texts["offset_mapping"][i]):
                if offset_start >= start and offset_end <= end:
                    if token_labels[idx] == "O":
                        token_labels[idx] = "B"  # 스팬의 시작 토큰
                    else:
                        token_labels[idx] = "I"  # 스팬 내부의 토큰

        # BIO 라벨 추가
        labels.append(token_labels)

    # offset_mapping은 학습에 필요 없으므로 제거
    tokenized_texts.pop("offset_mapping")
    tokenized_texts["labels"] = labels
    return tokenized_texts


# BIO 레이블 데이터 생성
train_data = create_bio_labels(ds['train']['text'], ds['train']['offensiveness_rationale'], tokenizer)
validation_data = create_bio_labels(ds['validation']['text'], ds['validation']['offensiveness_rationale'], tokenizer)
test_data = create_bio_labels(ds['test']['text'], ds['test']['offensiveness_rationale'], tokenizer)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.93k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

val.jsonl:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/172158 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizerFast'.


In [4]:
#pytorch 데이터로 변환하는 함수 정의

# class CustomDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __len__(self):
#         return len(self.encodings['input_ids'])

#     def __getitem__(self, idx):
#         return {
#             'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
#             'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
#             'labels': torch.tensor(self.encodings['labels'][idx], dtype=torch.long)
#         }

bio_mapping = {'O': 0, 'B': 1, 'I': 2}

class CustomBIO_Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor([bio_mapping[label] for label in self.encodings['labels'][idx]], dtype=torch.long)
        }

# 데이터셋 변환
train_dataset = CustomBIO_Dataset(train_data)
validation_dataset = CustomBIO_Dataset(validation_data)
test_dataset = CustomBIO_Dataset(test_data)


# # 학습 데이터셋 생성 to pytorch
# train_dataset = CustomDataset(train_data)
# validation_dataset = CustomDataset(validation_data)
# test_dataset = CustomDataset(test_data)

# # 데이터 로더 생성
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# validation_loader = DataLoader(validation_dataset, batch_size=16)
# test_loader = DataLoader(test_dataset, batch_size=16)


In [5]:
# 라벨 확인해보자.
for i in range(5):
    print("Text:", ds['train']['text'][i])
    print("Labels:", train_data['labels'][i])


Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Labels: ['O', 'O', 'O', 'B', 'B', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

레이블이 잘 안 붙는다.

In [6]:
# 디버깅
# 오프셋과 토큰 확인
for i in range(3):
    tokenized = tokenizer(ds["train"]["text"][i], return_offsets_mapping=True)
    print(f"Text: {ds['train']['text'][i]}")
    print("Tokens and Offsets:")
    for token, offset in zip(
        tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),
        tokenized["offset_mapping"]
    ):
        print(f"Token: {token}, Offset: {offset}")
    print("-" * 50)


Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Tokens and Offsets:
Token: [CLS], Offset: (0, 0)
Token: 하나도, Offset: (0, 3)
Token: 모르는, Offset: (4, 7)
Token: 얼, Offset: (8, 9)
Token: ##라, Offset: (9, 10)
Token: 쉭, Offset: (11, 12)
Token: 끼, Offset: (13, 14)
Token: ##가, Offset: (14, 15)
Token: 설치는, Offset: (16, 19)
Token: ##꼬, Offset: (19, 20)
Token: ##라서, Offset: (20, 22)
Token: ##니가, Offset: (22, 24)
Token: 무릇, Offset: (25, 27)
Token: 텅, Offset: (28, 29)
Token: ##빈, Offset: (29, 30)
Token: 백, Offset: (31, 32)
Token: 정한, Offset: (33, 35)
Token: ##테, Offset: (35, 36)
Token: 칼, Offset: (37, 38)
Token: ##자루, Offset: (38, 40)
Token: 쥐, Offset: (41, 42)
Token: ##여, Offset: (42, 43)
Token: ##준, Offset: (43, 44)
Token: 형국, Offset: (45, 47)
Token: 민, Offset: (48, 49)
Token: 좃, Offset: (50, 51)
Token: 당, Offset: (52, 53)
Token: 애, Offset: (54, 55)
Token: 세, Offset: (56, 57)
Token: ##들, Offset: (57, 58)
Token: 속은, Offset: (59, 61

오프셋 정보는 문제가 없어보임.

In [7]:
import random

def random_check_bio_labels(dataset, tokenizer, processed_data, sample_size=5):
    for _ in range(sample_size):
        idx = random.randint(0, len(dataset["text"]) - 1)  # 랜덤 인덱스 선택
        text = dataset["text"][idx]
        rationale_spans = dataset["offensiveness_rationale"][idx]
        tokenized = tokenizer(text, truncation=True, padding=True, return_offsets_mapping=True)
        labels = processed_data["labels"][idx]

        print(f"Text: {text}")
        print("Tokens, Offsets, and BIO Labels:")
        for token, offset, label in zip(
            tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),
            tokenized["offset_mapping"],
            labels,
        ):
            print(f"Token: {token}, Offset: {offset}, Label: {label}")
        print("-" * 50)


In [32]:
# 랜덤으로 샘플 5개 확인
random_check_bio_labels(
    dataset=ds["train"],
    tokenizer=tokenizer,
    processed_data=train_data,
    sample_size=5
)


Randomly Checking BIO Tagging...
Text: 선진국은 원래 저런다니까. 개돼지들 천국인 민주당 페미 지지국 한국은 아직 개도국이야 국민수준이
Tokens, Offsets, and BIO Labels:
Token: [CLS], Offset: (0, 0), Label: O
Token: 선진국, Offset: (0, 3), Label: O
Token: ##은, Offset: (3, 4), Label: O
Token: 원래, Offset: (5, 7), Label: O
Token: 저런, Offset: (8, 10), Label: O
Token: ##다니, Offset: (10, 12), Label: O
Token: ##까, Offset: (12, 13), Label: O
Token: ., Offset: (13, 14), Label: O
Token: 개돼지들, Offset: (15, 19), Label: O
Token: 천국, Offset: (20, 22), Label: O
Token: ##인, Offset: (22, 23), Label: O
Token: 민주당, Offset: (24, 27), Label: O
Token: 페미, Offset: (28, 30), Label: O
Token: 지지, Offset: (31, 33), Label: O
Token: ##국, Offset: (33, 34), Label: O
Token: 한국은, Offset: (35, 38), Label: O
Token: 아직, Offset: (39, 41), Label: O
Token: 개도국, Offset: (42, 45), Label: B
Token: ##이야, Offset: (45, 47), Label: O
Token: 국민, Offset: (48, 50), Label: O
Token: ##수준이, Offset: (50, 53), Label: O
Token: [SEP], Offset: (0, 0), Label: O
---------------------------

랜덤하게 인덱스 뽑아서 확인해보니 태깅이 잘 되어 있는 것으로 확인함.

In [15]:
from transformers import ElectraForTokenClassification

model_name = "beomi/KcELECTRA-base-v2022"
model = ElectraForTokenClassification.from_pretrained(model_name, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_bio",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs_bio',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [11]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.3377,0.331917


Epoch,Training Loss,Validation Loss
1,0.3377,0.331917
2,0.3266,0.343825
3,0.3341,0.357677


TrainOutput(global_step=32280, training_loss=0.3305576454455672, metrics={'train_runtime': 7092.6993, 'train_samples_per_second': 72.818, 'train_steps_per_second': 4.551, 'total_flos': 7.828398001624109e+16, 'train_loss': 0.3305576454455672, 'epoch': 3.0})

In [13]:
# save
from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/MyDrive/yaife/detector_bio/saved_model"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model & Tokenizer saved")

Mounted at /content/drive
Model & Tokenizer saved


In [16]:
def evaluate_bio_model(model, data_loader, tokenizer, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            labels = labels.cpu().numpy()

            all_preds.extend(preds.flatten())
            all_labels.extend(labels.flatten())

    from sklearn.metrics import classification_report
    print(classification_report(all_labels, all_preds, target_names=["O", "B", "I"]))

evaluate_bio_model(model, DataLoader(test_dataset, batch_size=16), tokenizer, device)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           O       0.91      0.18      0.30   1680962
           B       0.08      0.12      0.10    179038
           I       0.00      0.00      0.00         0

    accuracy                           0.17   1860000
   macro avg       0.33      0.10      0.13   1860000
weighted avg       0.83      0.17      0.28   1860000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
def extract_bio_spans_from_text(text, model, tokenizer, device):

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True
    ).to(device)
    offset_mapping = inputs.pop("offset_mapping")[0]

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().cpu().numpy())
    harmful_spans = []

    #BIO 태그 후처리
    current_span = ""
    current_offsets = None

    for token, pred, (start, end) in zip(tokens, predictions, offset_mapping.tolist()):
        # Special tokens 무시
        if token in tokenizer.all_special_tokens:
            continue

        # B 태그: 새로운 스팬 시작
        if pred == 1:  # B
            if current_span:  # 이전 스팬 저장
                harmful_spans.append((current_span, current_offsets))
            current_span = token.lstrip("##")
            current_offsets = (start, end)

        # I 태그: 이전 스팬에 추가
        elif pred == 2 and current_span:  # I
            current_span += token.lstrip("##")
            current_offsets = (current_offsets[0], end)

        # O 태그: 스팬 종료
        else:
            if current_span:
                harmful_spans.append((current_span, current_offsets))
                current_span = ""
                current_offsets = None

    # 남은 스팬 저장
    if current_span:
        harmful_spans.append((current_span, current_offsets))

    # 최종 결과
    return [(text[start:end], (start, end)) for current_span, (start, end) in harmful_spans]


In [33]:
# test 1
input_text = "너는 정말 재수없어. 한남이야."

harmful_spans = extract_bio_spans_from_text(input_text, model, tokenizer, device)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 너는 정말 재수없어. 한남이야.
Harmful Spans: [('너는', (0, 2)), ('정말 재수없어', (3, 10)), ('.', (10, 11)), ('이야', (14, 16))]


In [34]:
# test 2
input_text = "너는 정말 최악이야. 무식한 행동 좀 하지마."

harmful_spans = extract_bio_spans_from_text(input_text, model, tokenizer, device)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 너는 정말 최악이야. 무식한 행동 좀 하지마.
Harmful Spans: [('너는', (0, 2)), ('정말', (3, 5)), ('최악', (6, 8)), ('이야.', (8, 11))]


In [35]:
# test3
input_text = "너 한남이야? 꼴깝떨고 있네 "

harmful_spans = extract_bio_spans_from_text(input_text, model, tokenizer, device)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 너 한남이야? 꼴깝떨고 있네 
Harmful Spans: [('너', (0, 1)), ('? 꼴깝떨고 있네', (6, 15))]


In [36]:
# test4
input_text = "너 페미야? 꼴깝떨고있네 "

harmful_spans = extract_bio_spans_from_text(input_text, model, tokenizer, device)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 너 페미야? 꼴깝떨고있네 
Harmful Spans: [('떨고', (9, 11)), ('있네', (11, 13))]
