In [1]:
import os
import requests
import torch
import torch.nn.functional as F
from collections import OrderedDict
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark     = False

BASE_DIR         = "/home/kong/urlbert/url_bert/urlbert2"
TOKENIZER_DIR    = os.path.join(BASE_DIR, "bert_tokenizer")
CONFIG_DIR       = os.path.join(BASE_DIR, "bert_config")
CHECKPOINT_PATH  = os.path.join(BASE_DIR,"finetune/phishing/checkpoints/modelx_URLBERT_80.pth")  # or modelx_URLBERT_80.pth

In [3]:
def get_header_info(url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    }
    try:
        # HEAD 요청 시도
        resp = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        if resp.status_code == 405:
            resp = requests.get(url, headers=headers, timeout=5)
        h = resp.headers
        important = {k: h.get(k, "") for k in ("Server","Content-Type","Set-Cookie","Location","Date")}
        header_str = "; ".join(f"{k}: {v}" for k,v in important.items() if v)
        return header_str or "EMPTY"
    except Exception:
        return "NOHEADER"

In [4]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# ─── 2) 토크나이저·모델 로드 ─────────────────────────────────────
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_DIR, local_files_only=True)

# 1) classification-ready config 생성
config = AutoConfig.from_pretrained(
    CONFIG_DIR,
    num_labels=2,
    vocab_size=5000
)

# 2) 분류 모델 + 저장된 head 통째로 로드
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=None,      # 로컬 모델만 사용
    config=config,
    state_dict=torch.load(CHECKPOINT_PATH, map_location="cpu")
)

model.to(DEVICE).eval()


  state_dict=torch.load(CHECKPOINT_PATH, map_location="cpu")
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at None and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(5000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [5]:
def classify_url(url: str):
    # a) 헤더 정보 붙이기
    header = get_header_info(url)
    text   = url + " [SEP] " + header

    # b) 토크나이즈
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids      = enc["input_ids"].to(DEVICE)
    attention_mask = enc["attention_mask"].to(DEVICE)

    # c) 모델 예측
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs  = F.softmax(logits, dim=-1).cpu().squeeze().tolist()

    # probs[0] = benign 확률, probs[1] = malicious 확률
    return {"benign": probs[0], "malicious": probs[1]}

In [6]:
if __name__ == "__main__":
    
    while True:
        url = input("\nURL을 입력하세요 (종료는 엔터만): ").strip()
        if not url:
            break
        scores = classify_url(url)
        print(f"→ 정상   (benign)    : {scores['benign']*100:6.2f}%")
        print(f"→ 악성   (malicious) : {scores['malicious']*100:6.2f}%")

→ 정상   (benign)    :  45.34%
→ 악성   (malicious) :  54.66%
→ 정상   (benign)    :  45.16%
→ 악성   (malicious) :  54.84%
