# In [None]:
import pandas as pd

# Load the CSV file.
df = pd.read_csv("darkpatterns.csv")

# Pattern category -> name of the phrase function used inside the predicate.
category_to_function = {
    'Social Proof': 'social_proof_phrase',
    'Misdirection': 'misdirection_phrase',
    'Urgency': 'urgency_phrase',
    'Forced Action': 'forced_action_phrase',
    'Obstruction': 'obstruction_phrase',
    'Sneaking': 'sneaking_phrase',
    'Scarcity': 'scarcity_phrase',
}

# Page location -> screen name used inside the predicate.
page_to_screen = {
    'Home Page': 'screen1',
    'Product Page': 'screen2',
    'Cart Page': 'screen3',
    'Checkout Process': 'screen4',
    'Other Page': 'screen5',
}

# Build one (pattern text, predicate string) pair per CSV row.
samples = []
for _, record in df.iterrows():
    pattern_text = record['Pattern String']
    # Unmapped pages / categories fall back to placeholder names.
    screen_name = page_to_screen.get(record['Where in website?'], 'screenX')
    phrase_name = category_to_function.get(record['Pattern Category'], 'unknown_phrase')
    samples.append((pattern_text, f"contains_phrase({screen_name}, {phrase_name})"))

# Preview the last five samples.
for sample in samples[-5:]:
    print(sample)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

# Split the (text, predicate) pairs into two parallel lists.
# str() coerces every text value to a string before tokenization.
texts = [str(pair[0]) for pair in samples]
labels = [pair[1] for pair in samples]

# Encode the predicate strings as integer class ids.
label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(labels)

# Tokenizer matching the pretrained BERT checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset class
class PredicateDataset(Dataset):
    """Tokenized texts paired with integer class labels.

    All texts are tokenized once, at construction time, with the module-level
    ``tokenizer``; the resulting tensors are kept in memory and indexed per
    sample.

    Args:
        texts: Iterable of strings to tokenize.
        labels: Sequence of integer class ids, one per text (list, NumPy
            array, ...).
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
        # Pin the dtype: nn.CrossEntropyLoss expects int64 class indices, but
        # torch.tensor() would otherwise inherit e.g. int32 from a NumPy array.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded inputs and the label of sample ``idx`` as a dict."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'token_type_ids': self.encodings['token_type_ids'][idx],
            'labels': self.labels[idx]
        }

# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    label_ids,
    test_size=0.2,
    random_state=42,
)

# Tokenized datasets for each split.
train_dataset = PredicateDataset(train_texts, train_labels)
val_dataset = PredicateDataset(val_texts, val_labels)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)

# Model definition
class PredicateClassifier(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    Args:
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the classification logits computed from BERT's pooled output."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.classifier(encoded.pooler_output)

# Initialization: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output class per distinct label known to the encoder.
model = PredicateClassifier(num_classes=len(label_encoder.classes_))
model = model.to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Validation evaluation function
def evaluate_model(model, val_loader):
    """Compute the classification accuracy of ``model`` over ``val_loader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled while iterating. Each batch must be a dict with the keys
    'input_ids', 'attention_mask', 'token_type_ids' and 'labels'; tensors are
    moved to the module-level ``device`` before the forward pass.

    Returns:
        The fraction of correctly predicted labels, or 0 when the loader
        yields no samples.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            segments = batch['token_type_ids'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, mask, segments)
            # The predicted class is the index of the largest logit in each row.
            hits = torch.argmax(logits, dim=1) == targets
            n_correct += hits.sum().item()
            n_seen += targets.size(0)

    if n_seen > 0:
        return n_correct / n_seen
    return 0

# Training loop: 30 epochs, with a validation pass after each one.
for epoch in range(30):
    model.train()
    for batch in train_loader:
        # Move every tensor of the batch to the target device.
        input_ids, attention_mask, token_type_ids, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
        )

        # Clear stale gradients, then do one forward/backward/update step.
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask, token_type_ids), labels)
        loss.backward()
        optimizer.step()

    val_accuracy = evaluate_model(model, val_loader)
    print(f"✅ Epoch {epoch+1} completed - Validation Accuracy: {val_accuracy:.2f}")

# Prediction function
def predict_predicate(text):
    """Return the predicate label the model predicts for ``text``.

    ``text`` is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` in eval mode with gradients disabled. The
    index of the largest logit of the first row is mapped back to its label
    string via ``label_encoder``.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        class_scores = model(**model_inputs)
        best_index = torch.argmax(class_scores, dim=1).cpu().numpy()[0]
    return label_encoder.inverse_transform([best_index])[0]

# Prediction example
test_text = "Are you sure you want to cancel"
predicted_predicate = predict_predicate(test_text)
print("🧠 예측된 Predicate:", predicted_predicate)
