In [2]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


In [None]:
# 確保資料夾存在且可寫入
persist_directory = 'zh-tw-MGF_Profile'
# 使用 HuggingFace 嵌入
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# 加載文本並創建向量數據庫
loader = TextLoader('en-MFG/en-MGF_contents.txt')
documents = loader.load()
text_spliter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_chunks = text_spliter.split_documents(documents)

# 創建 Chroma 的儲存方式
vectordb = Chroma.from_documents(documents=text_chunks,
                                  embedding=embedding,  # 使用 embed_query 方法
                                  persist_directory=persist_directory)
vectordb.persist()
vectordb = None

In [7]:
###PDF
import fitz  # PyMuPDF
from langchain.vectorstores import Chroma
from langchain.schema import Document

persist_directory = 'zh-tw-MGF_Technology'
# 加载 PDF 文件并提取文本
def load_pdf(file_path):
    doc = fitz.open(file_path)
    text_chunks = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_chunks.append(page.get_text())
    return text_chunks

pdf_file_path = 'zh-tw-MGF_Technology_output.pdf'

text_chunks = load_pdf(pdf_file_path)

documents = [Document(page_content=chunk) for chunk in text_chunks]

# 創建 Chroma 的儲存方式
vectordb = Chroma.from_documents(documents=documents,
                                  embedding=embedding,  # 使用 embed_query 方法
                                  persist_directory=persist_directory)
vectordb.persist()
vectordb = None

In [5]:
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)
retriever = vectordb.as_retriever()

In [10]:
from transformers import BertModel, BertTokenizer
import torch

# 加載模型和分詞器
model_name = "/user_data/NLP_model/bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# 打開並讀取文本文件
with open('en-MFG/en-MGF_contents.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# 將文本分割成較小的塊
max_length = 512
tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

# 獲取嵌入
embeddings_list = []
for i in range(0, len(tokens['input_ids'][0]), max_length):
    input_chunk = {key: val[:, i:i + max_length] for key, val in tokens.items()}
    with torch.no_grad():
        outputs = model(**input_chunk)
    embeddings_list.append(outputs.last_hidden_state)

# 將所有嵌入塊拼接在一起
embeddings = torch.cat(embeddings_list, dim=1)

# 打印嵌入向量的形狀
print(embeddings.shape)

torch.Size([1, 512, 768])


In [18]:
sentence = 'I really enjoyed this movie a lot.'

tokens = tokenizer.tokenize(sentence)
tokens = ['[CLS]'] + tokens + ['[SEP]']

T=15
padded_tokens = tokens + ['[PAD]' for _ in range(T-len(tokens))]
print("Padded tokens are \n {} ".format(padded_tokens))
attn_mask = [ 1 if token != '[PAD]' else 0 for token in padded_tokens  ]
print("Attention Mask are \n {} ".format(attn_mask))
seg_ids = [0 for _ in range(len(padded_tokens))]
sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
token_ids = torch.tensor(sent_ids).unsqueeze(0) 
attn_mask = torch.tensor(attn_mask).unsqueeze(0) 
seg_ids   = torch.tensor(seg_ids).unsqueeze(0)

print(token_ids)
print(attn_mask)
print(seg_ids)
output = model(token_ids, attention_mask=attn_mask,token_type_ids=seg_ids)
last_hidden_state, pooler_output = output[0], output[1]

print(last_hidden_state.shape) #hidden states of each token
print(pooler_output.shape) #hidden states of [cls] (forward one linear layer and Tanh activation)

Padded tokens are 
 ['[CLS]', 'I', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'] 
Attention Mask are 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] 
tensor([[ 101,  146, 1541, 4927, 1142, 2523,  170, 1974,  119,  102,    0,    0,
            0,    0,    0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
torch.Size([1, 15, 768])
torch.Size([1, 768])
