In [1]:
import pandas as pd
import numpy as np

In [2]:
# 데이터 로딩
training_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/training.csv')
validation_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/validation.csv')

In [3]:
# 데이터에 모든 라벨을 1로 설정
training_data['label'] = 1
validation_data['label'] = 1

In [4]:
# 데이터 확인
print(training_data.head())
print(validation_data.head())

              Origin Address     ex Add  \
0  서울특별시 양천구 신월동 남부순환로75길 35      밀레홈파크   
1       서울특별시 송파구 잠실동 잠실로 62  트리지움삼백십칠동   
2     서울특별시 구로구 온수동 경인로 15-7       성원연립   
3   서울특별시 은평구 녹번동 녹번로1길 19-9       주영빌트   
4      서울특별시 서초구 방배동 청두곶9길 4      덕산그린빌   

                                         Text Origin  label  
0            [밀레홈파크] 뒤쪽에 세탁소 아저씨 잘하세요 거기 정말 단골집이거든요.      1  
1             네 가까운 [트리지움317동] 앞 공영 주차장에 가시면 하나 있어요.      1  
2                 네 가까운 [성원연립] 앞 공영 주차장에 가시면 하나 있어요.      1  
3  안녕하세요 저희 주차장은 [주영빌트] 옆에 있습니다 다만 주차장이 협소한 관계로 만...      1  
4  택배기사입니다 오늘 오후에 택배 반품 수거하려고 하는데 위치가 [덕산그린빌] 앞이 ...      1  
              Origin Address     ex Add                          Text Origin  \
0  서울특별시 마포구 공덕동 마포대로 173-15    래미안공덕오차  [래미안공덕5차] 옆에 있는 마트 이번 주까지 특별 할인한대요.   
1  서울특별시 중랑구 중화동 상봉중앙로5나길 23     대명연립아동   [대명연립아동] 옆에 있는 마트 이번 주까지 특별 할인한대요.   
2    서울특별시 동작구 상도동 국사봉12길 36     상도첼시이차         새로 옮긴 회사가 [상도첼시2차] 근처라고 했었죠.   
3    서울특별시 은평구 구산동 연서로15길 30      만민하늘애    확실하진 않

In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
import torch

class AddressDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        ex_address = row['ex Add']
        origin_address = row['Origin Address']

        inputs = self.tokenizer.encode_plus(
            ex_address,
            origin_address,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }

In [6]:
# 데이터셋 생성
MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
train_dataset = AddressDataset(training_data, tokenizer, MAX_LEN)
val_dataset = AddressDataset(validation_data, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

In [7]:
from transformers import AdamW
from tqdm import tqdm
import numpy as np

# Ko-BERT 모델과 토크나이저 불러오기
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)

# 손실 함수와 옵티마이저 정의
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 학습 함수 정의
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [9]:
# 학습 루프
EPOCHS = 4
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = model.to(device)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        None,
        len(training_data)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_loader,
        loss_fn,
        device,
        len(validation_data)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')

Epoch 1/4
----------


100%|██████████| 4840/4840 [11:17<00:00,  7.14it/s]


Train loss 0.0012275685970740777 accuracy 0.9999741692174564
Val   loss 3.457063030885568e-06 accuracy 1.0
Epoch 2/4
----------


100%|██████████| 4840/4840 [11:28<00:00,  7.03it/s]


Train loss 1.7366279781122347e-06 accuracy 1.0
Val   loss 2.384308798764699e-07 accuracy 1.0
Epoch 3/4
----------


100%|██████████| 4840/4840 [11:28<00:00,  7.03it/s]


Train loss 1.947757087345372e-07 accuracy 1.0
Val   loss 0.0 accuracy 1.0
Epoch 4/4
----------


100%|██████████| 4840/4840 [11:28<00:00,  7.03it/s]


Train loss 1.4497843538816759e-08 accuracy 1.0
Val   loss 0.0 accuracy 1.0


In [12]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    return accuracy, precision, recall, f1

def print_metrics(model, data_loader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy, precision, recall, f1 = compute_metrics(all_labels, all_preds)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

# 예시로 평가 지표 출력 (학습 후 호출)
print_metrics(model, val_loader, device)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [13]:
def predict_address(ex_address):
    model.eval()
    inputs = tokenizer.encode_plus(
        ex_address,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

    _, preds = torch.max(outputs.logits, dim=1)
    return preds.item()

# 예시
ex_address_example = "대치2동"
predicted_origin_address = predict_address(ex_address_example)
print(f'Predicted Origin Address: {predicted_origin_address}')

Predicted Origin Address: 1
