In [None]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

def tokenize_data(train_x, train_y):
    input_ids = []
    attention_masks = []
    labels = []

    for x, y in zip(train_x, train_y):
        encoded_dict = tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            max_length=64,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(tokenizer.encode(y, add_special_tokens=True))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels


In [2]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(tokenizer.vocab),
)


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [None]:
import pandas as pd

df = pd.read_csv('merged.csv').dropna()

input_ids, attention_masks, labels = tokenize_data(df['Title'], df['Content'])


RuntimeError: torch.cat(): expected a non-empty list of Tensors

In [45]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW

# 훈련 데이터 로더 생성
batch_size = 32
train_data = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=2e-5)

# 훈련 루프
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_masks, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()



In [28]:
class TextPairDataset(Dataset):
    def __init__(self, X_texts, Y_texts):
        self.X_texts = X_texts
        self.Y_texts = Y_texts
    
    def __len__(self):
        return len(self.X_texts)
    
    def __getitem__(self, idx):
        return self.X_texts[idx], self.Y_texts[idx]

# Define the DataLoader
batch_size = 32
dataset = TextPairDataset(X_texts, Y_texts)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print(dataloader.__len__())


29


In [30]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2, keepdim=True)
        loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) + 
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

# Define the device, model, optimizer, and loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseKoBERT().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = ContrastiveLoss()



In [32]:
# Training loop
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    try:
        for x, y in dataloader:
        
            optimizer.zero_grad()
            x, y = x.to(device), y.to(device)
            output1, output2 = model(x, y)
            
            # Assuming labels are 1 for each pair since they are matching pairs
            labels = torch.ones(batch_size, 1).to(device)
            
            loss = criterion(output1, output2, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    except KeyError:
            print(f"Error at batch index: {i}")
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss/len(dataloader)}")


KeyError: 609