In [None]:
# Split the text corpus into chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the entire corpus file into a single string.
with open('kitei.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Splitter configured for chunk_size=100 with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
)
texts = text_splitter.split_text(text)

In [None]:
# Quick look at the split result: container type, chunk count, first two chunks.
divider = "--------"
print(type(texts))
print(divider)
print(len(texts))
for chunk_index in (0, 1):
    print(divider)
    print(texts[chunk_index])

In [None]:
# Embedding model used to vectorize the passages.
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE(review): the E5 model card recommends "query: " / "passage: " input
# prefixes; nothing in this notebook adds them -- confirm whether that matters here.
embeddings = HuggingFaceEmbeddings(
    model_name='intfloat/multilingual-e5-large',
    model_kwargs={'device': 'cpu'},  # run the embedding model on CPU
)

In [None]:
from langchain_community.vectorstores import FAISS

# Build the vector store from the text chunks and save it to disk.
db = FAISS.from_texts(texts=texts, embedding=embeddings)
db.save_local(folder_path='kitei.db')

In [None]:
from langchain_community.vectorstores import FAISS

# Load the previously saved vector store.
# NOTE: allow_dangerous_deserialization=True opts in to deserialization that the
# library itself labels dangerous -- only load folders this notebook wrote.
db = FAISS.load_local(
    folder_path='kitei.db',
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Sanity-check the store with a one-word query: hit count, hit type, top hit text.
similarity_sample = db.similarity_search("勤務")
divider = "--------"
print(len(similarity_sample))
print(divider)
first_hit = similarity_sample[0]
print(type(first_hit))
print(divider)
print(first_hit.page_content)

In [None]:
# Build the retriever on top of the FAISS store (used later by the QA chain).
retriever = db.as_retriever()   # number of retrieved documents: 4 (default)

In [None]:
# Prepare the text-generation model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

# Load the tokenizer and the causal LM (weights requested in bfloat16).
model_name = "sbintuitions/sarashina2.2-0.5b-instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Generation settings forwarded to the pipeline.
# NOTE(review): sampling is enabled with a very low temperature and a high
# repetition penalty -- confirm these values are intentional.
generation_options = {
    "max_new_tokens": 128,
    "do_sample": True,
    "temperature": 0.01,
    "repetition_penalty": 2.0,
}
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

# Fix the random seed after the pipeline is built.
set_seed(123)

In [None]:
## # Baseline without RAG (this whole cell is disabled)
## # NOTE(review): `chat_pipeline` is not defined in the visible cells; the
## # pipeline built in the model cell is named `pipe` -- adjust before re-enabling.
## 
## # User input
## user_input = [{"role": "user", "content": "勤務時間は何時から何時までですか？"}]
## 
## # Generate responses with the model
## responses = chat_pipeline(user_input, max_length=50, do_sample=True, num_return_sequences=3,)
## 
## # Show the responses
## for i, response in enumerate(responses, 1):
##     print(f"Response {i}: {response['generated_text']}")

In [None]:
# Prepare the prompt.
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} and {question}.
# It ends with the "システム:" marker, which the example-run cell later uses
# to cut the answer out of the generated text.
template = """
ユーザー:以下のテキストを参照して、それに続く質問に答えてください。

{context}

{question}

システム:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template_format="f-string",
    template=template,
)

In [None]:
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline

# Wrap the transformers pipeline so the chain can use it as its LLM.
hf_llm = HuggingFacePipeline(pipeline=pipe)

# Retrieval QA chain: "stuff" chain type, custom prompt, source documents
# returned alongside the answer, verbose logging on.
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
    verbose=True,
)

In [None]:
# Example run: send each question through the QA chain and print the question
# followed by the part of the generated text that comes after "システム:".
import re

# Captures everything after the first "システム:" marker; DOTALL lets the
# capture span multiple lines.
pattern = re.compile(r'システム:(.*)', re.DOTALL)


def extract_answer(result_text):
    """Return the text following the first "システム:" marker in result_text.

    Args:
        result_text: The string stored under the 'result' key of a chain response.

    Returns:
        The captured text after the marker, or result_text unchanged when the
        marker is absent (previously this case raised AttributeError on None).
    """
    found = pattern.search(result_text)
    if found is None:
        return result_text
    return found.group(1)


questions = [
    "勤務時間は何時から何時までですか？",
    "副業は可能ですか？",
    "リモートワークは可能ですか？",
    "出張での宿泊費はいくら出ますか？",
]

for question_index, q in enumerate(questions):
    # Separator goes between consecutive answers, not after the last one.
    if question_index > 0:
        print("--------------------------")
        print()
    # `ans` stays a notebook-level name: the next cell prints the last raw result.
    ans = qa.invoke(q)
    ans0 = extract_answer(ans['result'])
    print(q)
    print(ans0)
    if question_index < len(questions) - 1:
        print()


In [None]:
# Print the unprocessed 'result' text of the most recent qa.invoke call, for inspection.
print(ans['result'])