In [1]:
!pip install transformers datasets torch
!pip install transformers accelerate

from datasets import load_dataset
from transformers import ElectraForTokenClassification, ElectraTokenizerFast, AdamW, get_scheduler
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss





In [2]:
from transformers import DataCollatorWithPadding

ds = load_dataset("humane-lab/K-HATERS")

tokenizer = ElectraTokenizerFast.from_pretrained("beomi/KcELECTRA-base-v2022")
# Tokenizer를 기반으로 DataCollator 생성 (다이나믹 패딩 적용)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 변경 방식 with BIO tag - try1
# def create_bio_labels(texts, rationales, tokenizer):
#     tokenized_texts = tokenizer(
#         texts, truncation=True, padding=True, return_offsets_mapping=True
#     )
#     labels = []

#     for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
#         # 모든 토큰에 'O'로 일단 초기화
#         token_labels = ["O"] * len(tokenized_texts["input_ids"][i])

#         # 유해 스팬과 오프셋 매핑 비교
#         for span in rationale_spans:
#             start, end = span  # 유해 스팬의 시작과 끝
#             for idx, (offset_start, offset_end) in enumerate(tokenized_texts["offset_mapping"][i]):
#                 if offset_start >= start and offset_end <= end:
#                     if token_labels[idx] == "O":
#                         token_labels[idx] = "B"
#                     else:
#                         token_labels[idx] = "I"

#         labels.append(token_labels)

#     # offset_mapping은 학습에 필요 없으므로 제거
#     tokenized_texts.pop("offset_mapping")
#     tokenized_texts["labels"] = labels
#     return tokenized_texts

# try 2
# def create_bio_labels(texts, rationales, tokenizer):
#     tokenized_texts = tokenizer(
#         texts, truncation=True, padding=True, return_offsets_mapping=True
#     )
#     labels = []

#     for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
#         # 모든 토큰에 'O'로 초기화
#         token_labels = ["O"] * len(tokenized_texts["input_ids"][i])
#         previous_tag = "O"  # 이전 토큰의 태그 상태를 저장

#         for span in rationale_spans:
#             start, end = span  # 유해 스팬의 시작과 끝
#             for idx, (offset_start, offset_end) in enumerate(tokenized_texts["offset_mapping"][i]):
#                 # Special tokens ([CLS], [SEP] 등)는 태깅하지 않음
#                 if offset_start == 0 and offset_end == 0:
#                     continue

#                 # 유해 스팬과 오프셋 비교
#                 if offset_start >= start and offset_end <= end:
#                     # B와 I 태깅 구분
#                     if previous_tag in ["O", "B"]:  # 이전에 'O'거나 새로운 시작이면 'B'
#                         token_labels[idx] = "B"
#                     else:
#                         token_labels[idx] = "I"
#                     previous_tag = token_labels[idx]
#                 else:
#                     previous_tag = "O"  # 범위 바깥이면 초기화

#         labels.append(token_labels)

#     # offset_mapping은 학습에 필요 없으므로 제거
#     tokenized_texts.pop("offset_mapping")
#     tokenized_texts["labels"] = labels
#     return tokenized_texts

# try3
def create_bio_labels(texts, rationales, tokenizer):
    tokenized_texts = tokenizer(
        texts, truncation=True, padding=True, return_offsets_mapping=True
    )
    labels = []

    for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
        # 모든 토큰에 'O'로 초기화
        token_labels = ["O"] * len(tokenized_texts["input_ids"][i])
        previous_tag = "O"  # 이전 태그 상태를 저장

        for span in rationale_spans:
            start, end = span  # 유해 스팬의 시작과 끝
            for idx, (offset_start, offset_end) in enumerate(tokenized_texts["offset_mapping"][i]):
                # Special tokens ([CLS], [SEP] 등)는 태깅하지 않음
                if offset_start == 0 and offset_end == 0:
                    continue

                # 유해 스팬과 오프셋 비교
                if offset_start >= start and offset_end <= end:
                    # B와 I 태깅 구분
                    if token_labels[idx] == "O":
                        if previous_tag == "O":  # 이전 태그가 'O'이면 'B' 태깅
                            token_labels[idx] = "B"
                        else:  # 이전 태그가 'B' 또는 'I'이면 'I' 태깅
                            token_labels[idx] = "I"
                    previous_tag = token_labels[idx]
                else:
                    previous_tag = "O"  # 범위 바깥이면 초기화

        labels.append(token_labels)

    # offset_mapping은 학습에 필요 없으므로 제거
    tokenized_texts.pop("offset_mapping")
    tokenized_texts["labels"] = labels
    return tokenized_texts




# BIO 레이블 데이터 생성
train_data = create_bio_labels(ds['train']['text'], ds['train']['offensiveness_rationale'], tokenizer)
validation_data = create_bio_labels(ds['validation']['text'], ds['validation']['offensiveness_rationale'], tokenizer)
test_data = create_bio_labels(ds['test']['text'], ds['test']['offensiveness_rationale'], tokenizer)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizerFast'.


In [3]:

bio_mapping = {'O': 0, 'B': 1, 'I': 2}

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor([bio_mapping[label] for label in self.encodings['labels'][idx]], dtype=torch.long)
        }

# # 학습 데이터셋 생성 to pytorch
train_dataset = CustomDataset(train_data)
validation_dataset = CustomDataset(validation_data)
test_dataset = CustomDataset(test_data)

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)  # 동적 패딩 적용
validation_loader = DataLoader(validation_dataset, batch_size=16, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=16)


In [4]:
# 라벨 확인해보자.
for i in range(1, 100, 3):
    print("Text:", ds['train']['text'][i])
    print("Labels:", train_data['labels'][i])


Text: 120시간 발언때도 마치 근로자들 120시간을 강제노역 시키자는 얘기로 왜곡하고 기사를 쓰더니~ 이번에도 무슨 서민들 불량식품 먹이자는 내용으로 왜곡하는구나~ 참~ 수준이~~
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

레이블이 잘 안 붙는다.

In [5]:
# 디버깅
# 오프셋과 토큰 확인
for i in range(3):
    tokenized = tokenizer(ds["train"]["text"][i], return_offsets_mapping=True)
    print(f"Text: {ds['train']['text'][i]}")
    print("Tokens and Offsets:")
    for token, offset in zip(
        tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),
        tokenized["offset_mapping"]
    ):
        print(f"Token: {token}, Offset: {offset}")
    print("-" * 50)


Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Tokens and Offsets:
Token: [CLS], Offset: (0, 0)
Token: 하나도, Offset: (0, 3)
Token: 모르는, Offset: (4, 7)
Token: 얼, Offset: (8, 9)
Token: ##라, Offset: (9, 10)
Token: 쉭, Offset: (11, 12)
Token: 끼, Offset: (13, 14)
Token: ##가, Offset: (14, 15)
Token: 설치는, Offset: (16, 19)
Token: ##꼬, Offset: (19, 20)
Token: ##라서, Offset: (20, 22)
Token: ##니가, Offset: (22, 24)
Token: 무릇, Offset: (25, 27)
Token: 텅, Offset: (28, 29)
Token: ##빈, Offset: (29, 30)
Token: 백, Offset: (31, 32)
Token: 정한, Offset: (33, 35)
Token: ##테, Offset: (35, 36)
Token: 칼, Offset: (37, 38)
Token: ##자루, Offset: (38, 40)
Token: 쥐, Offset: (41, 42)
Token: ##여, Offset: (42, 43)
Token: ##준, Offset: (43, 44)
Token: 형국, Offset: (45, 47)
Token: 민, Offset: (48, 49)
Token: 좃, Offset: (50, 51)
Token: 당, Offset: (52, 53)
Token: 애, Offset: (54, 55)
Token: 세, Offset: (56, 57)
Token: ##들, Offset: (57, 58)
Token: 속은, Offset: (59, 61

In [6]:
# BIO 태깅 확인 함수
def debug_bio_tagging(texts, rationales, tokenizer, max_samples=5):
    """
    BIO 태깅 결과 디버깅 함수.
    - texts: 입력 텍스트 리스트
    - rationales: 유해성 스팬 정보
    """
    tokenized_texts = tokenizer(
        texts, truncation=True, padding=True, return_offsets_mapping=True
    )
    for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
        if i >= max_samples:  # 최대 샘플 수 초과 시 중단
            break
        print(f"Text: {text}")
        print("Tokens, Offsets, and BIO Labels:")

        token_labels = ["O"] * len(tokenized_texts["input_ids"][i])  # 초기화
        for span in rationale_spans:
            start, end = span
            for idx, (offset_start, offset_end) in enumerate(tokenized_texts["offset_mapping"][i]):
                if offset_start >= start and offset_end <= end:
                    if token_labels[idx] == "O":
                        token_labels[idx] = "B"
                    else:
                        token_labels[idx] = "I"

        tokens = tokenizer.convert_ids_to_tokens(tokenized_texts["input_ids"][i])
        offsets = tokenized_texts["offset_mapping"][i]
        for token, offset, label in zip(tokens, offsets, token_labels):
            print(f"Token: {token}, Offset: {offset}, Label: {label}")
        print("-" * 50)

# BIO 태깅 디버깅 실행
debug_bio_tagging(ds["train"]["text"], ds["train"]["offensiveness_rationale"], tokenizer)


Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Tokens, Offsets, and BIO Labels:
Token: [CLS], Offset: (0, 0), Label: O
Token: 하나도, Offset: (0, 3), Label: O
Token: 모르는, Offset: (4, 7), Label: O
Token: 얼, Offset: (8, 9), Label: B
Token: ##라, Offset: (9, 10), Label: B
Token: 쉭, Offset: (11, 12), Label: B
Token: 끼, Offset: (13, 14), Label: B
Token: ##가, Offset: (14, 15), Label: O
Token: 설치는, Offset: (16, 19), Label: O
Token: ##꼬, Offset: (19, 20), Label: O
Token: ##라서, Offset: (20, 22), Label: O
Token: ##니가, Offset: (22, 24), Label: O
Token: 무릇, Offset: (25, 27), Label: O
Token: 텅, Offset: (28, 29), Label: O
Token: ##빈, Offset: (29, 30), Label: O
Token: 백, Offset: (31, 32), Label: O
Token: 정한, Offset: (33, 35), Label: O
Token: ##테, Offset: (35, 36), Label: O
Token: 칼, Offset: (37, 38), Label: O
Token: ##자루, Offset: (38, 40), Label: O
Token: 쥐, Offset: (41, 42), Label: O
Token: ##여, Offset: (42, 43), Label: O
Token: ##준, Of

오프셋 정보는 문제가 없어보임.

In [7]:
import random

def random_check_bio_labels(dataset, tokenizer, processed_data, sample_size=5):
    for _ in range(sample_size):
        idx = random.randint(0, len(dataset["text"]) - 1)  # 랜덤 인덱스 선택
        text = dataset["text"][idx]
        rationale_spans = dataset["offensiveness_rationale"][idx]
        tokenized = tokenizer(text, truncation=True, padding=True, return_offsets_mapping=True)
        labels = processed_data["labels"][idx]

        print(f"Text: {text}")
        print("Tokens, Offsets, and BIO Labels:")
        for token, offset, label in zip(
            tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),
            tokenized["offset_mapping"],
            labels,
        ):
            print(f"Token: {token}, Offset: {offset}, Label: {label}")
        print("-" * 50)


In [8]:
# 랜덤으로 샘플 5개 확인
random_check_bio_labels(
    dataset=ds["train"],
    tokenizer=tokenizer,
    processed_data=train_data,
    sample_size=5
)


Text: 청와대? 청개구리들이 와글거리는 대가리 치매걸린 것들 모인곳?
Tokens, Offsets, and BIO Labels:
Token: [CLS], Offset: (0, 0), Label: O
Token: 청와대, Offset: (0, 3), Label: O
Token: ?, Offset: (3, 4), Label: O
Token: 청, Offset: (5, 6), Label: O
Token: ##개, Offset: (6, 7), Label: O
Token: ##구리, Offset: (7, 9), Label: O
Token: ##들이, Offset: (9, 11), Label: O
Token: 와, Offset: (12, 13), Label: O
Token: ##글, Offset: (13, 14), Label: O
Token: ##거리는, Offset: (14, 17), Label: O
Token: 대가리, Offset: (18, 21), Label: B
Token: 치매, Offset: (22, 24), Label: B
Token: ##걸린, Offset: (24, 26), Label: O
Token: 것들, Offset: (27, 29), Label: O
Token: 모인, Offset: (30, 32), Label: O
Token: ##곳, Offset: (32, 33), Label: O
Token: ?, Offset: (33, 34), Label: O
Token: [SEP], Offset: (0, 0), Label: O
--------------------------------------------------
Text: 우리가 우리나라 지키는 훈련하겠다는데 이래라 저래라 혀바닥 나불대지마라~~ 자금성 정문에다가 똥싸버리기전에
Tokens, Offsets, and BIO Labels:
Token: [CLS], Offset: (0, 0), Label: O
Token: 우리가, Offset: (0, 3), Label: O
Token: 우리나라, Of

랜덤하게 인덱스 뽑아서 확인해보니 태깅이 잘 되어 있는 것으로 확인함.

In [9]:
from transformers import ElectraForTokenClassification

model_name = "beomi/KcELECTRA-base-v2022"
model = ElectraForTokenClassification.from_pretrained(model_name, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_bio",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs_bio',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [11]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.0089,0.016494
2,0.0075,0.016998
3,0.0056,0.02031


TrainOutput(global_step=32280, training_loss=0.007739237065356608, metrics={'train_runtime': 6878.0916, 'train_samples_per_second': 75.09, 'train_steps_per_second': 4.693, 'total_flos': 7.828398001624109e+16, 'train_loss': 0.007739237065356608, 'epoch': 3.0})

In [12]:
# save
from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/MyDrive/yaife/detector_bio/saved_model"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model & Tokenizer saved")

Mounted at /content/drive
Model & Tokenizer saved


In [13]:
def evaluate_bio_model(model, data_loader, tokenizer, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            labels = labels.cpu().numpy()

            all_preds.extend(preds.flatten())
            all_labels.extend(labels.flatten())

    from sklearn.metrics import classification_report
    print(classification_report(all_labels, all_preds, target_names=["O", "B", "I"]))

evaluate_bio_model(model, DataLoader(test_dataset, batch_size=16), tokenizer, device)


              precision    recall  f1-score   support

           O       1.00      1.00      1.00   1846465
           B       0.66      0.63      0.65      9066
           I       0.61      0.47      0.53      4469

    accuracy                           0.99   1860000
   macro avg       0.76      0.70      0.72   1860000
weighted avg       0.99      0.99      0.99   1860000



In [14]:
def extract_bio_spans_from_text(text, predictions, tokens, offsets):

    harmful_spans = []
    current_span = ""
    current_offsets = None

    for token, pred, (start, end) in zip(tokens, predictions, offsets):
        # 서브워드 접두어 제거
        if token.startswith("##"):
            token = token[2:]

        if pred == 1:  # B 태그
            # 현재 스팬이 존재하면 저장
            if current_span:
                harmful_spans.append((current_span, current_offsets))
            current_span = token
            current_offsets = (start, end)

        elif pred == 2 and current_span:  # I 태그
            # 스팬 이어붙이기
            current_span += token
            current_offsets = (current_offsets[0], end)

        else:  # O 태그
            # 현재 스팬 저장 후 초기화
            if current_span:
                harmful_spans.append((current_span, current_offsets))
                current_span = ""
                current_offsets = None

    # 마지막 스팬 저장
    if current_span:
        harmful_spans.append((current_span, current_offsets))

    return harmful_spans


In [15]:
# load
import torch
from google.colab import drive
drive.mount('/content/drive')

from transformers import ElectraForTokenClassification, ElectraTokenizerFast

load_path = "/content/drive/MyDrive/yaife/detector_bio/saved_model"

# 모델과 토크나이저 로드
model = ElectraForTokenClassification.from_pretrained(load_path)
tokenizer = ElectraTokenizerFast.from_pretrained(load_path)

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [21]:
def extract_bio_spans_from_text(text, predictions, tokens, offsets):

    harmful_spans = []
    current_span = ""
    current_offsets = None

    for token, pred, (start, end) in zip(tokens, predictions, offsets):
        # 서브워드 접두어 제거
        if token.startswith("##"):
            token = token[2:]

        if pred == 1:  # B 태그
            # 현재 스팬이 존재하면 저장
            if current_span:
                harmful_spans.append((current_span, current_offsets))
            current_span = token
            current_offsets = (start, end)

        elif pred == 2 and current_span:  # I 태그
            # 스팬 이어붙이기
            current_span += token
            current_offsets = (current_offsets[0], end)

        else:  # O 태그
            # 현재 스팬 저장 후 초기화
            if current_span:
                harmful_spans.append((current_span, current_offsets))
                current_span = ""
                current_offsets = None

    # 마지막 스팬 저장
    if current_span:
        harmful_spans.append((current_span, current_offsets))

    return harmful_spans


In [22]:
#test0
# 입력 텍스트
input_text = "너는 정말 재수없어. 한남이야."

# 텍스트 전처리
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    return_offsets_mapping=True
).to(device)

offsets = inputs.pop("offset_mapping")[0].tolist()  # Offset 추출
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # 토큰 리스트

# 모델 예측
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()  # BIO 태그 예측

# BIO 스팬 추출
harmful_spans = extract_bio_spans_from_text(input_text, predictions, tokens, offsets)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)


Input Text: 너는 정말 재수없어. 한남이야.
Harmful Spans: [('한남', (12, 14))]


In [23]:
#test1
# 입력 텍스트
input_text = "그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니"

# 텍스트 전처리
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    return_offsets_mapping=True
).to(device)

offsets = inputs.pop("offset_mapping")[0].tolist()  # Offset 추출
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # 토큰 리스트

# 모델 예측
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()  # BIO 태그 예측

# BIO 스팬 추출
harmful_spans = extract_bio_spans_from_text(input_text, predictions, tokens, offsets)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니
Harmful Spans: [('처', (4, 5)), ('빨', (5, 6)), ('개소리', (32, 35))]


개선된 메소드 정의 (유해 단어를 단일 스팬으로 묶기)

In [26]:
def extract_bio_spans_from_text2(text, predictions, tokens, offsets):
    spans = []
    current_span = None
    for token, pred, offset in zip(tokens, predictions, offsets):
        if pred == 1:  # "B" 태그
            if current_span:  # 이전 스팬을 저장
                spans.append(current_span)
            current_span = [token, offset]
        elif pred == 2 and current_span:  # "I" 태그
            current_span[0] += token.replace("##", "")  # 토큰 이어붙임
            current_span[1] = (current_span[1][0], offset[1])  # 범위 확장
        else:
            if current_span:  # 현재 스팬을 종료
                spans.append(current_span)
                current_span = None
    if current_span:
        spans.append(current_span)

    # 스팬을 텍스트와 범위로 변환
    return [(span[0], tuple(span[1])) for span in spans]


In [28]:
#test1
# 입력 텍스트
input_text = "그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니"

# 텍스트 전처리
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    return_offsets_mapping=True
).to(device)

offsets = inputs.pop("offset_mapping")[0].tolist()  # Offset 추출
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # 토큰 리스트

# 모델 예측
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()  # BIO 태그 예측

# BIO 스팬 추출
harmful_spans = extract_bio_spans_from_text2(input_text, predictions, tokens, offsets)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)

Input Text: 그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니
Harmful Spans: [('처', (4, 5)), ('##빨', (5, 6)), ('개소리', (32, 35))]


딱히 기대했던 효과는 아직 안 나타남..

In [29]:
# 테스트 함수 정의
def preprocess_and_predict(text, model, tokenizer, device):

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True
    ).to(device)

    offsets = inputs.pop("offset_mapping")[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()

    return tokens, offsets, predictions


In [30]:
# test
input_text = "그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니"

tokens, offsets, predictions = preprocess_and_predict(input_text, model, tokenizer, device)

harmful_spans = extract_bio_spans_from_text2(input_text, predictions, tokens, offsets)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)


Input Text: 그 때 처빨았던 과거의 행적이 욕을 먹는 건데 익1은 뭔 개소리하니
Harmful Spans: [('처', (4, 5)), ('##빨', (5, 6)), ('개소리', (32, 35))]
