In [None]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "3"
import json

# Path to the corpus file. The chunking cell below iterates over
# data['data'] and reads the 'title' and 'content' keys of each entry.
p = 'data_title_content.json'

# Use a context manager so the file handle is closed deterministically, and
# decode explicitly as UTF-8 (the JSON interchange encoding) rather than
# relying on the platform default, which can mis-decode non-ASCII text.
with open(p, encoding='utf-8') as f:
    data = json.load(f)

#### Create data chunks

In [2]:
from transformers import AutoTokenizer
from langchain_core.documents import Document
from tqdm import tqdm
import re


# Upper bound on the number of tokenizer tokens packed into one chunk.
MAX_CHUNK_TOKENS = 512
# Fraction of a finished chunk's tokens that may be repeated at the start of
# the next chunk (counted in whole sentences).
OVERLAP_RATIO = 0.25
# One "sentence": a run of non-terminator characters followed by any trailing
# terminators (CJK/ASCII punctuation and line/tab whitespace). It is used with
# re.split, where the capturing group keeps the matched sentences in the result.
# Written as a raw string so that `\?` is not an invalid string escape; the
# regex engine interprets \n, \t and \r itself, so the pattern is unchanged.
END_OF_SENTENCE = r"([^。。\n\t\r!！？\?]+[。。\n\t\r!！？\?]*)"


# Tokenizer of the Mistral-7B-Instruct model; the chunking loop below uses it
# to tokenize each sentence so chunk sizes can be measured in tokens.
# `cache_dir` points the Hugging Face loader at a local "cache_dir" folder.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", cache_dir="cache_dir")
def _chunk_spans(sentence_lens, max_tokens, overlap_ratio):
    """Group consecutive sentences into overlapping, token-bounded chunks.

    Args:
        sentence_lens: token count of each sentence, in document order.
        max_tokens: token budget per chunk. A single sentence longer than the
            budget still becomes a chunk of its own.
        overlap_ratio: fraction of a finished chunk's tokens that may be
            repeated (as whole trailing sentences) at the start of the next one.

    Returns:
        A list of half-open ``(start, end)`` sentence-index spans, one per chunk.
    """
    spans = []
    total = len(sentence_lens)
    start = 0
    while start < total:
        # Greedily take sentences while the budget allows; the first sentence
        # is always taken so oversize sentences cannot stall the loop.
        end = start
        n_tokens = 0
        while end < total and (end == start or n_tokens + sentence_lens[end] <= max_tokens):
            n_tokens += sentence_lens[end]
            end += 1
        spans.append((start, end))
        if end == total:
            break

        # Walk backwards over the chunk's trailing sentences to pick the overlap.
        overlap_budget = n_tokens * overlap_ratio
        overlap = 0
        next_start = end
        # `next_start - 1 > start` guarantees forward progress even when the
        # chunk holds zero tokens or overlap_ratio >= 1.
        while next_start - 1 > start:
            candidate = overlap + sentence_lens[next_start - 1]
            if candidate > overlap_budget:
                break
            # Keep room for at least one new sentence; otherwise the next chunk
            # would be a pure suffix of this one (a redundant duplicate).
            if candidate + sentence_lens[end] > max_tokens:
                break
            overlap = candidate
            next_start -= 1
        start = next_start
    return spans


docs = []
postID = 0  # counts posts that were chunked; skipped posts do not consume an ID
dataID = 0  # running ID across every emitted chunk
for d in tqdm(data['data']):
    title, content = d['title'], d['content']
    # Entries whose content is not a string are skipped.
    if not isinstance(content, str):
        continue

    content = title + '\n' + content
    # re.split with a capturing group returns the sentences interleaved with
    # empty strings / unmatched leftovers; drop only the empty pieces.
    sentences = [s for s in re.split(END_OF_SENTENCE, content) if s]
    content_tokens = tokenizer(sentences, add_special_tokens=False)['input_ids']
    sentence_lens = [len(tokens) for tokens in content_tokens]

    spans = _chunk_spans(sentence_lens, MAX_CHUNK_TOKENS, OVERLAP_RATIO)
    for seqNUM, (start, end) in enumerate(spans):
        # Build the text from the original sentence strings instead of decoding
        # the token ids: a tokenize -> decode round trip is not guaranteed to
        # reproduce the input exactly, and slicing avoids the decode cost.
        doc = Document(
            page_content=''.join(sentences[start:end]),
            metadata={'data_id': dataID, 'post_id': postID, 'seq_num': seqNUM},
        )
        docs.append(doc)
        dataID += 1

    postID += 1

  from .autonotebook import tqdm as notebook_tqdm
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
100%|██████████| 50022/50022 [11:51<00:00, 70.33it/s] 


#### Create and save embeddings

In [224]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


def __load_model_embedding():
    """Create the HuggingFaceEmbeddings wrapper for the "BAAI/bge-m3" model.

    The encoder is asked for normalized embeddings and is fed one text per
    batch. No ``model_kwargs`` are passed, so device selection is left to the
    library default (e.g. pass ``model_kwargs={"device": "cuda"}`` to pin it).
    Model files are cached under the local "cache_dir" folder.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    encode_options = {'normalize_embeddings': True, 'batch_size': 1}
    embedder = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        encode_kwargs=encode_options,
        cache_folder='cache_dir',
    )
    return embedder
    
# Load the embedding model once, then embed every chunk in `docs` and build
# the FAISS index from the resulting vectors.
embeddings = __load_model_embedding()
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# Persist the index to a local folder named after the chunking settings
# (512-token chunks, 25% overlap, sentence-boundary splitting).
db.save_local(folder_path="bge-chunk-512-overlap-25%-boundary")

