In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import BertTokenizer, BertModel
import faiss
import numpy as np


In [3]:
device = torch.device("cuda:0")
bertTokenizer = BertTokenizer.from_pretrained('monologg/kobert')
bertModel = BertModel.from_pretrained('monologg/kobert')

In [4]:

class MyKoBERT(nn.Module):
    def __init__(self):
        super(MyKoBERT, self).__init__()
        self.bert = bertModel
        self.tokenizer = bertTokenizer

        self.fc = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)

    def forward_one(self, x):
        inputs = self.tokenizer(x, return_tensors="pt", truncation=True, padding=True, max_length=128)
        outputs = self.bert(**inputs)
        return self.fc(outputs['last_hidden_state'][:, 0, :])

    def forward(self, x1, x2):
        out1 = self.forward_one(x1)
        out2 = self.forward_one(x2)
        return out1, out2

In [5]:
#parameters
MAX_LEN = 300
BATCH_SIZE = 64

NUM_EPOCHS = 10
LEARNING_RATE = 1e-5

In [14]:
import pandas as pd
file_path = '../crawling/file_content_list.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
df = df.dropna(subset=['Content'])
X_texts = df['Title']
Y_texts = df['Content']
print(X_texts.shape, Y_texts.shape)


(906,) (906,)


In [None]:
class TextPairDataset(Dataset):
    def __init__(self, X_texts, Y_texts):
        self.X_texts = X_texts
        self.Y_texts = Y_texts
    
    def __len__(self):
        return len(self.X_texts)
    
    def __getitem__(self, idx):
        print(idx)
        print(self.X_texts.shape, self.Y_texts.shape)
        return self.X_texts[idx], self.Y_texts[idx]

# Define the DataLoader
dataset = TextPairDataset(X_texts, Y_texts)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

print(dataloader.__len__())


15


In [None]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        
        euclidean_distance = np.dot(output1, output2) / (np.linalg.norm(output1) * np.linalg.norm(output2))
        loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) + 
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

# Define the device, model, optimizer, and loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyKoBERT().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = ContrastiveLoss()



In [None]:
# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0

    # Embed Y texts

    #Y_vectors = [model.forward_one(y_text).detach().numpy() for y_text in Y_texts]
    #Y_vectors = np.vstack(Y_vectors)

    # Build the Faiss index
    #d = Y_vectors.shape[1]
    #index = faiss.IndexFlatL2(d)
    #index.add(Y_vectors)

   # try:
    for x, y in dataloader:
        print(x)
        print(y)
    
        optimizer.zero_grad()
        x, y = x.to(device), y.to(device)
        output1, output2 = model(x, y)
        
        # Assuming labels are 1 for each pair since they are matching pairs
        labels = torch.ones(batch_size, 1).to(device)
        
        loss = criterion(output1, output2, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    #except KeyError:
    #        print(f"Error at batch index: ")
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(dataloader)}")


KeyError: 698