In [1]:
import os

# Load the GigaChat credentials from a local file so the secret itself is
# never written into the notebook, then expose it through the environment.
with open('api_key.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline (added by most editors)
    # would otherwise become part of the credential string.
    key = f.read().strip()

# Fail early with a clear message instead of a confusing error from the API client.
if not key:
    raise ValueError("api_key.txt is empty: expected the GigaChat credentials")

os.environ['GIGACHAT_API_ACCESS_KEY'] = key

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter 
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Corpus for the retrieval index: one JSON file plus three plain-text files.
file_paths = [
    'docs/test.json',
    'docs/raw_recomm_papers_1.txt',
    'docs/raw_recomm_papers_2.txt',
    'docs/raw_recomm_papers_3.txt',
]

# Embedding model and splitter are independent of the data, so set them up first.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Read every file as UTF-8 text and collect the resulting Document objects.
# NOTE(review): the JSON file is read as raw text too, so the 500-character
# chunks below can cut a record in the middle - confirm this is intended.
documents = []
for file_path in file_paths:
    documents += TextLoader(file_path, encoding='utf-8').load()

# Split into overlapping chunks and index them in a Chroma vector store.
docs = text_splitter.split_documents(documents)
db = Chroma.from_documents(documents=docs, embedding=embedding_function)
db

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


<langchain_community.vectorstores.chroma.Chroma at 0x24bf4d84c80>

In [3]:
# Inspect the Document objects loaded from file_paths.
# NOTE(review): this displays the full page_content of every file; a slice
# such as documents[:1] would keep the cell output short.
documents

[Document(metadata={'source': 'docs/test.json'}, page_content='[\n    {\n        "–Ω–∞–∑–≤–∞–Ω–∏–µ": "Acer Aspire 3 A317-51G",\n        "—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1600x900",\n        "–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä": "Intel Core i5 1.6 –ì–ì—Ü",\n        "–æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–∞—è_–ø–∞–º—è—Ç—å": "8 –ì–ë",\n        "–Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å": "SSD 256 –ì–ë",\n        "–≤–∏–¥–µ–æ–∫–∞—Ä—Ç–∞": "GeForce MX230",\n        "–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω–∞—è_—Å–∏—Å—Ç–µ–º–∞": "Linux",\n        "—Ü–µ–Ω–∞": "57000 ‚ÇΩ"\n    },\n    {\n      "–Ω–∞–∑–≤–∞–Ω–∏–µ": "ASUS TUF Gaming A15 FA507NV-LP025",\n      "—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1920x1080",\n      "—Ç–∏–ø_—ç–∫—Ä–∞–Ω–∞": "IPS",\n      "–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä": "AMD Ryzen 5 7535HS, 6 —è–¥–µ—Ä, 3.3 –ì–ì—Ü",\n      "–æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–∞—è_–ø–∞–º—è—Ç—å": "16 –ì–ë",\n      "–Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å": "SSD 512 –ì–ë",\n      "–≤–∏–¥–µ–æ–∫–∞—Ä—Ç–∞": "GeForce RTX 4060 –¥–ª—è –Ω–æ—É—Ç–±—É–∫–æ–≤, 8 –ì–ë",\n      "–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω–∞—è_—Å–∏—Å—Ç–µ–º–∞": "–±–µ–∑ –û–°",\n  

In [4]:
# Text of the first chunk produced by the splitter.
docs[0].page_content

'[\n    {\n        "–Ω–∞–∑–≤–∞–Ω–∏–µ": "Acer Aspire 3 A317-51G",\n        "—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1600x900",\n        "–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä": "Intel Core i5 1.6 –ì–ì—Ü",\n        "–æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–∞—è_–ø–∞–º—è—Ç—å": "8 –ì–ë",\n        "–Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å": "SSD 256 –ì–ë",\n        "–≤–∏–¥–µ–æ–∫–∞—Ä—Ç–∞": "GeForce MX230",\n        "–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω–∞—è_—Å–∏—Å—Ç–µ–º–∞": "Linux",\n        "—Ü–µ–Ω–∞": "57000 ‚ÇΩ"\n    },\n    {\n      "–Ω–∞–∑–≤–∞–Ω–∏–µ": "ASUS TUF Gaming A15 FA507NV-LP025",\n      "—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1920x1080",\n      "—Ç–∏–ø_—ç–∫—Ä–∞–Ω–∞": "IPS",'

In [5]:
# Text of the second chunk; compare with the first one to see the chunk overlap.
docs[1].page_content

'"—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1920x1080",\n      "—Ç–∏–ø_—ç–∫—Ä–∞–Ω–∞": "IPS",\n      "–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä": "AMD Ryzen 5 7535HS, 6 —è–¥–µ—Ä, 3.3 –ì–ì—Ü",\n      "–æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–∞—è_–ø–∞–º—è—Ç—å": "16 –ì–ë",\n      "–Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å": "SSD 512 –ì–ë",\n      "–≤–∏–¥–µ–æ–∫–∞—Ä—Ç–∞": "GeForce RTX 4060 –¥–ª—è –Ω–æ—É—Ç–±—É–∫–æ–≤, 8 –ì–ë",\n      "–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω–∞—è_—Å–∏—Å—Ç–µ–º–∞": "–±–µ–∑ –û–°",\n      "—Ä–∞—Å–∫–ª–∞–¥–∫–∞_–∫–ª–∞–≤–∏–∞—Ç—É—Ä—ã": "–∞–Ω–≥–ª–∏–π—Å–∫–∞—è/—Ä—É—Å—Å–∫–∞—è",\n      "—Ü–µ–Ω–∞": "104,999 ‚ÇΩ"\n    },\n    {\n      "–Ω–∞–∑–≤–∞–Ω–∏–µ": "ASUS Vivobook Go 15 E1504FA-BQ533",\n      "—Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ_—ç–∫—Ä–∞–Ω–∞": "1920x1080",'

In [6]:
# Wrap the vector store in a retriever (default settings) for use in the chain.
retriever = db.as_retriever()
# Smoke-test retrieval with a single short query.
retrieved_docs = retriever.invoke("–∏–≥—Ä—ã")

# Display the chunks returned for the query.
retrieved_docs

[Document(metadata={'source': 'docs/raw_recomm_papers_2.txt'}, page_content='i5 8–≥–æ –ø–æ–∫–æ–ª–µ–Ω–∏—è, —Å–µ–Ω—Å–æ—Ä–Ω—ã–π FullHD-—ç–∫—Ä–∞–Ω, SSD –Ω–∞ 256–≥–±, –±–∞—Ç–∞—Ä–µ—è –≤ —Å–æ—Å—Ç–æ—è–Ω–∏–∏ "–º—É—Ö–∞ –Ω–µ —Å–∏–¥–µ–ª–∞" –∏ 8–≥–± –æ–ø–µ—Ä–∞—Ç–∏–≤–∫–∏, –∫–æ—Ç–æ—Ä—É—é —è –Ω–∞—Ä–∞—Å—Ç–∏–ª –¥–æ 16–≥–±.'),
 Document(metadata={'source': 'docs/test.json'}, page_content='"—Ä–∞—Å–∫–ª–∞–¥–∫–∞_–∫–ª–∞–≤–∏–∞—Ç—É—Ä—ã": "–∞–Ω–≥–ª–∏–π—Å–∫–∞—è/—Ä—É—Å—Å–∫–∞—è",\n      "—Ü–µ–Ω–∞": "20,000 ‚ÇΩ"\n    }\n  ]'),
 Document(metadata={'source': 'docs/raw_recomm_papers_1.txt'}, page_content='–º–µ–º–æ–≤ –Ω–∞\xa0–ª—é–±–æ–π —Å–ª—É—á–∞–π –∂–∏–∑–Ω–∏–ö—É–¥–∞ —Å—Ö–æ–¥–∏—Ç—å –≤\xa0–ú–æ—Å–∫–≤–µ –∑–∏–º–æ–π¬´–ü–æ—Ç—Ä–∞—Ç–∏–ª –º–Ω–æ–≥–æ –¥–µ–Ω–µ–≥, –≤—Ä–µ–º–µ–Ω–∏ –∏\xa0—Å–∏–ª¬ª: 4\xa0–∏—Å—Ç–æ—Ä–∏–∏ –ø—Ä–æ\xa0–º–∞—à–∏–Ω—ã, –∫–æ—Ç–æ—Ä—ã–µ –¥–æ—Ä–æ–≥–æ —Å–æ–¥–µ—Ä–∂–∞—Ç—å–§–∏–Ω–∞–Ω—Å–æ–≤–æ–µ –ø—Ä–∞–≤–∏–ª–æ: —è\xa0–∏–Ω–≤–µ—Å—Ç–∏—Ä—É—é –º–µ–ª–æ—á—å –Ω–∞\xa0–ò–ò–°, —á—Ç–æ–±—ã –ø—Ä–µ–≤—Ä–∞—Ç–∏—Ç—å –µ–µ\xa0–≤\xa0–∑–Ω–∞—á–∏—Ç–

In [7]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models.gigachat import GigaChat
from langchain.chains import create_retrieval_chain

In [8]:
# Chat model client; credentials come from the environment variable set in the
# first cell.
# NOTE(review): verify_ssl_certs=False appears to turn off TLS certificate
# verification - confirm this is acceptable outside local experiments.
llm = GigaChat(
    credentials=os.environ['GIGACHAT_API_ACCESS_KEY'],
    model="GigaChat",
    verify_ssl_certs=False,
)

# Prompt template for the answering step. It has two placeholders:
# {context} and {input}.
# NOTE(review): the backslash line continuations keep the leading indentation
# of the following line inside the prompt text - confirm that is harmless.
prompt = ChatPromptTemplate.from_template(''' –¢—ã - –∫–æ–Ω—Å—É–ª—å—Ç–∞–Ω—Ç –≤ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–º –∏–Ω—Ç–µ—Ä–Ω–µ—Ç-–º–∞–≥–∞–∑–∏–Ω–µ \
                                          –û—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è. \
–ò—Å–ø–æ–ª—å–∑—É–π –ø—Ä–∏ —ç—Ç–æ–º –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –∏–∑ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞. –ï—Å–ª–∏ –≤ –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ –Ω–µ—Ç \
–∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –¥–ª—è –æ—Ç–≤–µ—Ç–∞, –ø–æ–ø—Ä–æ—Å–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è —É—Ç–æ—á–Ω–∏—Ç—å –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –¥–µ—Ç–∞–ª–∏.
–ö–æ–Ω—Ç–µ–∫—Å—Ç: {context}
–í–æ–ø—Ä–æ—Å: {input}
–û—Ç–≤–µ—Ç:'''
)


  llm = GigaChat(


In [9]:
# Answering step: passes the retrieved documents to the LLM through the prompt.
document_chain = create_stuff_documents_chain(llm, prompt)

In [10]:
# Full pipeline: the retriever supplies documents, document_chain produces the answer.
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [11]:
# First sample query.
# NOTE(review): the visible invoke() call uses question2, not this variable.
question = "–ö–∞–∫–æ–π –ª—É—á—à–µ –Ω–æ—É—Ç–±—É–∫ –ø–æ–¥–æ–π–¥—ë—Ç –¥–ª—è —É—á—ë–±—ã (c—Ç—É–¥–µ–Ω—Ç —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–≥–æ –≤—É–∑–∞ —Å —É–≥–ª—É–±–ª–µ–Ω–∏–µ–º –≤ –ò–ò –∏ –Ω–µ–π—Ä–æ—Å–µ—Ç–∏)?"

In [20]:
# Second sample query; this is the one passed to retrieval_chain.invoke below.
question2= "–ú–Ω–µ –Ω—É–∂–µ–Ω –Ω–æ—É—Ç–±—É–∫ –¥–æ 50.000—Ä –∏ –æ—Ç —Ñ–∏—Ä–º—ã Acer?"


In [21]:
# Run the full pipeline; the answer text is read from result['answer'] below.
result = retrieval_chain.invoke({'input': question2})

In [22]:
# Save the question/answer pair as Markdown.
# The question is read from the chain output (the retrieval chain passes the
# 'input' key through to its result) rather than from a separate notebook
# variable, so the saved question always belongs to the saved answer. The
# previous version wrote `question` although the answer came from `question2`.
with open('result.md', 'w', encoding='utf-8') as f:
    f.write(result['input'] + '\n\n')
    f.write(result['answer'])