In [1]:
import regex as re
import random
import json
import pandas as pd
import numpy as np
import os

from itertools import combinations
from itertools import product

import torch
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW

from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
data_path = "/kaggle/input/vnlaw-llm-rag/data/"

MAX_LEN = 512
OVERLAP = 50

base_model = "FacebookAI/xlm-roberta-base"

files = os.listdir(data_path)
data_stored = []

In [3]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModel.from_pretrained(base_model)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [4]:
# khi split chunking không quá quan tâm đến độ dài
# mỗi chunk phải là một đoạn hoặc một câu hoàn chỉnh
# nếu là bảng thì phải tách riêng

def preprocess_phrase1_chunking(text):
    s = text
    # s = re.sub('CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM', '', s)
    # s = re.sub('Độc lập - Tự do - Hạnh phúc', '', s)
    s = re.sub('[- ]+', ' ', s)
    s = re.sub('[…]+', '…', s)
    # s = re.sub('')
    s = s.strip()
    return s

def count_tokens(text):
    return len(tokenizer.encode(text))

def last_words(text, n=10):
    lst = text.split(" ")[-n:]
    return " ".join(lst)

# Simple chunking

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=MAX_LEN,
    chunk_overlap=OVERLAP,
    length_function=count_tokens,
    is_separator_regex=False,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ]
)

In [5]:
# data_stored = []

# for file in files:
#     print(file)
#     field = []

#     with open(data_path + file, 'r', encoding='utf-8') as f:
#         data = json.load(f)

#     for i, item in enumerate(data):
#         name = item['name']
#         # print((name, len(item['content'])))
#         content = item['content']
#         content = "\n".join(content)
#         chunks = text_splitter.split_text(content)
#         field.append(chunks)
#         # if i == 10:
#         #     break
        
#     data_stored.append(field)

# import json
# with open('data_stored.json', 'w', encoding='utf-8') as f:
#     json.dump(data_stored, f, ensure_ascii=False, indent=4)

In [6]:
import json
with open('/kaggle/input/vnlaw-llm-rag-chunked/data_stored.json', 'r', encoding='utf-8') as f:
    data_stored = json.load(f)

# Pairing

In [7]:
def get_very_positive_pairs_chunks(data_stored, max_pairs= 500000):
    pairs = []
    for i in range(len(data_stored)):
        for j in range(len(data_stored[i])):
            chunks = data_stored[i][j]
            for k in range(len(chunks)):
                for l in range(k+1, len(chunks)):
                    pairs.append(((i,j,k), (i,j,l), 0.98))
    return random.sample(pairs, k=min(max_pairs, len(pairs)))

def get_low_positive_pairs_chunks(data_stored):
    pairs = []
    for i in range(len(data_stored)):
        l = len(data_stored[i])
        for j in range(l):
            for k in range(j+1, l):
                a = random.randrange(len(data_stored[i][j]))
                b = random.randrange(len(data_stored[i][k]))
                pairs.append(((i, j, a) , (i,k,b), 0.6))
    return pairs

def get_negative_pairs(data_stored, mean_per_field_pair=6746):
    pairs = []
    l = len(data_stored)
    for i in range(l):
        for j in range(i+1, l):
            indices1 = [(i, a, b) for a in range(len(data_stored[i])) for b in range(len(data_stored[i][a]))]
            indices2 = [(j, a, b) for a in range(len(data_stored[j])) for b in range(len(data_stored[j][a]))]

            a = np.random.choice(len(indices1), mean_per_field_pair)
            b = np.random.choice(len(indices2), mean_per_field_pair)

            for x, y in zip(a, b):
                pairs.append((indices1[x], indices2[y], 0))
    return pairs

In [9]:
pairs_doc = 0
for i in range(len(data_stored)):
    l = len(data_stored[i])
    pairs_doc += (l*(l-1))//2
pairs_doc

540439

In [10]:
very_positive_pairs = get_very_positive_pairs_chunks(data_stored, max_pairs=100000)
len(very_positive_pairs)

100000

In [11]:
low_positive_pairs = get_low_positive_pairs_chunks(data_stored)
len(low_positive_pairs)

540439

In [12]:
l = len(data_stored)
pairs_simularity_field_level = (l*(l-1))//2

mean_per_field_pair = len(low_positive_pairs) // pairs_simularity_field_level
mean_per_field_pair

6928

In [13]:
negative_pairs = get_negative_pairs(data_stored, mean_per_field_pair)
len(negative_pairs)

540384

# Training

In [16]:
# sentence1 = []
# sentence2 = []
# scores = []
# for pair in pairs:
#     (i, j, k), (x, y, z), score = pair
#     sentence1.append(data_stored[i][j][k])
#     sentence2.append(data_stored[x][y][z])
#     scores.append(score)

In [14]:
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        pair = self.pairs[idx]
        (i,j,k), (x,y,z), score = self.pairs[idx]
        sentence1 = data_stored[i][j][k]
        sentence2 = data_stored[x][y][z]
        token1 = tokenizer(sentence1, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        token2 = tokenizer(sentence2, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        return token1, token2, score


import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# loss_function = torch.nn.CosineEmbeddingLoss()
loss_function = torch.nn.MSELoss()

In [15]:
dataset = CustomDataset(very_positive_pairs + low_positive_pairs + negative_pairs)
dataloader = DataLoader(dataset, batch_size=20, shuffle=True)

In [24]:
for batch in dataloader:
    token1, token2, score = batch
    # outputs1 = model(token1['input_ids'].squeeze(1).to(device), token1['attention_mask'].squeeze(1).to(device))
    # outputs2 = model(token2['input_ids'].squeeze(1).to(device), token2['attention_mask'].squeeze(1).to(device))
    optimizer.zero_grad()
    outputs1 = model(token1['input_ids'].squeeze(1).to(device))
    outputs2 = model(token2['input_ids'].squeeze(1).to(device))
    similarity = torch.cosine_similarity(outputs1.pooler_output, outputs2.pooler_output)
    loss = loss_function(similarity, score.float().to(device))
    
    loss.backward()
    optimizer.step()

    print("-----")
    print(loss.item())

    show = np.stack((score.numpy(), similarity.detach().cpu().numpy()), axis=1)
    print(show)

-----
0.012937244959175587
[[ 0.6         0.55222321]
 [ 0.          0.1534567 ]
 [ 0.98        0.97582364]
 [ 0.6         0.41192156]
 [ 0.6         0.71191072]
 [ 0.          0.06501953]
 [ 0.         -0.05213815]
 [ 0.         -0.07870634]
 [ 0.6         0.73164392]
 [ 0.6         0.68700576]
 [ 0.6         0.60953546]
 [ 0.          0.02708413]
 [ 0.         -0.09442641]
 [ 0.98        0.73198652]
 [ 0.         -0.03936351]
 [ 0.6         0.58674079]
 [ 0.          0.11693544]
 [ 0.          0.21505739]
 [ 0.          0.07156838]
 [ 0.6         0.69452614]]
-----
0.005471714772284031
[[ 0.6         0.54938197]
 [ 0.6         0.45346615]
 [ 0.          0.13016409]
 [ 0.98        0.91003329]
 [ 0.6         0.56563079]
 [ 0.98        0.91658288]
 [ 0.         -0.00715538]
 [ 0.98        0.9559691 ]
 [ 0.          0.08679263]
 [ 0.          0.04601531]
 [ 0.          0.05985593]
 [ 0.6         0.56249571]
 [ 0.         -0.08658137]
 [ 0.6         0.53765059]
 [ 0.         -0.10308494]


KeyboardInterrupt: 

In [37]:
model.eval()
with torch.no_grad():
    outputs1 = model(token1['input_ids'].squeeze(1).to(device), token1['attention_mask'].squeeze(1).to(device))
    outputs2 = model(token2['input_ids'].squeeze(1).to(device), token2['attention_mask'].squeeze(1).to(device))
    # similarity = torch.nn.functional.cosine_similarity(outputs1['last_hidden_state'][:,0,:], outputs2['last_hidden_state'][:,0,:])
    loss = loss_function(outputs1.pooler_output, outputs2.pooler_output, score.float().to(device))
    print(loss.item())
    loss.backward()
    optimizer.step()
    

torch.Size([10, 768])

In [18]:
model.save_pretrained("trained_embedding_model",)

# Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

base_model_id = "FacebookAI/xlm-roberta-base"

In [None]:
model = AutoModel.from_pretrained('FacebookAI/xlm-roberta-base')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

In [None]:
model.eval()

In [None]:
for batch in train_loader:
    sentence1, sentence2, scores = batch
    token1 = tokenizer(sentence1, return_tensors='pt', padding=True)
    token2 = tokenizer(sentence2, return_tensors='pt', padding=True)
    break

In [None]:
with torch.no_grad():
    embed1 = model(**token1)
    embed2 = model(**token2)

In [None]:
sentence1[1]

In [None]:
len(token1['input_ids'][1])

In [None]:
tokenizer.decode(token1['input_ids'][1][503])

In [None]:
x = tokenizer.decode(token1['input_ids'][1])

In [None]:
len(x.split())

In [None]:
torch.cosine_similarity(embed1, embed2)