In [1]:
!pip install -q gradio
!pip install -q langchain langchain-docling langchain-community langchain-huggingface
!pip install -q unsloth transformers torch accelerate docling
!pip install -q chromadb sentence-transformers
!pip install -q python-dotenv pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.5/187.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.1/158.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.1/86.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m74.7 MB/s[0m eta [3

In [2]:
import gradio as gr
from typing import List, Tuple, Generator, Optional, Dict, Any, Union
import time

import os
import torch
from pathlib import Path
import json
import logging

from langchain_docling import DoclingLoader
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, T5ForConditionalGeneration, T5Tokenizer
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

In [3]:
# Configure root logging: INFO and above go both to the cell output (stderr)
# and to a UTF-8 log file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('rag_system.log', encoding='utf-8')
    ],
    # basicConfig() does nothing when the root logger already has handlers,
    # which is common in hosted notebook kernels. force=True removes the
    # pre-installed handlers so this configuration actually takes effect, and
    # keeps re-running the cell from being silently ignored.
    force=True
)
logger = logging.getLogger(__name__)

# Disable tokenizer-level parallelism in the HuggingFace `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
def _summarize_dl_meta(dl_meta):
    """Reduce a nested ``dl_meta`` dict to flat ``page_number``/``section``/``source`` keys."""
    summary = {}

    # Page number: first doc item whose leading provenance entry has a truthy page_no.
    doc_items = dl_meta.get('doc_items')
    if isinstance(doc_items, list):
        for entry in doc_items:
            if not (isinstance(entry, dict) and 'prov' in entry):
                continue
            provenance = entry.get('prov', [])
            if provenance and isinstance(provenance[0], dict):
                page_no = provenance[0].get('page_no')
                if page_no:
                    summary['page_number'] = page_no
                    break

    # Section: the first heading, stringified.
    headings = dl_meta.get('headings')
    if isinstance(headings, list) and headings:
        summary['section'] = str(headings[0])

    # Source: the origin filename, when present and non-empty.
    origin = dl_meta.get('origin')
    if isinstance(origin, dict):
        origin_name = origin.get('filename')
        if origin_name:
            summary['source'] = origin_name

    return summary


def clean_metadata(metadata: Dict[str, Any]) -> Dict[str, Union[str, int, float, bool, None]]:
    """
    Flatten document metadata so that every value is a scalar.

    Scalars (str, int, float, bool, None) are kept as is. A dict stored under
    the key ``'dl_meta'`` is not copied; instead ``page_number``, ``section``
    and ``source`` are derived from it. Non-empty lists made only of
    str/int/float items are joined with ``', '``. Everything else is replaced
    by its ``str()`` representation.

    Args:
        metadata: Original metadata mapping.

    Returns:
        A new dict holding only scalar values.
    """
    scalar_types = (str, int, float, bool, type(None))
    result = {}

    for name, raw in metadata.items():
        if isinstance(raw, scalar_types):
            result[name] = raw
            continue

        if isinstance(raw, dict) and name == 'dl_meta':
            result.update(_summarize_dl_meta(raw))
            continue

        if isinstance(raw, list) and raw and all(isinstance(x, (str, int, float)) for x in raw):
            result[name] = ', '.join(map(str, raw))
            continue

        # Other dicts, empty or heterogeneous lists, and any other object.
        result[name] = str(raw)

    return result

In [5]:
class RAG:
    """
    Универсальная RAG система для работы с технической документацией.
    Поддерживает русский язык, OCR и различные форматы документов.
    """
    def __init__(
        self,
        model_id: str = "unsloth/Llama-3.2-1B-Instruct",
        embed_model_id: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        llm: Optional[Any] = None,
        embeddings: Optional[Any] = None
    ):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model_id = model_id
        self.embed_model_id = embed_model_id
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.device = device

        if llm:
            self.llm = llm
            self.logger.info("Используется предзагруженная LLM")
        else:
            self._init_llm()

        if embeddings:
            self.embeddings = embeddings
            self.logger.info("Используются предзагруженные эмбеддинги")
        else:
            self._init_embeddings()

        self.vectorstore = None
        self.retriever = None

    def _init_llm(self):
        """Инициализация языковой модели."""
        self.logger.info(f"Загрузка модели {self.model_id}...")

        tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
            device_map="auto" if self.device == "cuda" else None,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            top_p=0.95,
            repetition_penalty=1.15,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            truncation=True,
            max_length=2048
        )

        self.llm = HuggingFacePipeline(pipeline=pipe)
        self.logger.info("Модель загружена успешно!")


    def _init_embeddings(self):
        """Инициализация модели эмбеддингов."""
        self.logger.info(f"Загрузка модели эмбеддингов {self.embed_model_id}...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.embed_model_id,
            model_kwargs={'device': self.device}
        )
        self.logger.info("Модель эмбеддингов загружена успешно!")

    def load_document(self, file_path: str | List[str]) -> List[Any]:
        """
        Загрузка и обработка документов с использованием Docling.

        Args:
            file_path: Путь к файлу или список путей

        Returns:
            Список документов LangChain
        """
        if isinstance(file_path, str):
            file_path = [file_path]

        self.logger.info(f"Загрузка документов с VLM: {file_path}")

        try:
            converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(
                        pipeline_cls=VlmPipeline,
                    ),
                }
            )

            all_docs = []

            for path in file_path:
                self.logger.info(f"Обработка файла: {path}")

                result = converter.convert(source=path)
                doc = result.document

                markdown_content = doc.export_to_markdown()

                metadata = {
                    'source': path,
                    'filename': Path(path).name,
                    'format': 'pdf',
                    'converter': 'VLM Docling'
                }

                if hasattr(doc, 'metadata') and doc.metadata:
                    for key, value in doc.metadata.items():
                        if isinstance(value, (str, int, float, bool)):
                            metadata[key] = value

                sections = markdown_content.split('\n\n')

                for i, section in enumerate(sections):
                    if section.strip():
                        section_metadata = metadata.copy()
                        section_metadata['section_index'] = i

                        lines = section.strip().split('\n')
                        if lines and lines[0].startswith('#'):
                            section_metadata['section'] = lines[0].strip('#').strip()

                        langchain_doc = Document(
                            page_content=section.strip(),
                            metadata=section_metadata
                        )
                        all_docs.append(langchain_doc)

            self.logger.info(f"Загружено {len(all_docs)} секций из VLM Docling")

            cleaned_docs = []
            for doc in all_docs:
                cleaned_metadata = clean_metadata(doc.metadata)
                cleaned_doc = Document(
                    page_content=doc.page_content,
                    metadata=cleaned_metadata
                )
                cleaned_docs.append(cleaned_doc)

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
                length_function=len,
                separators=["\n\n", "\n", ".", " ", ""]
            )

            chunked_docs = []
            for doc in cleaned_docs:
                if len(doc.page_content) < 50:
                    continue

                chunks = text_splitter.split_text(doc.page_content)
                for i, chunk in enumerate(chunks):
                    chunk_metadata = doc.metadata.copy()
                    chunk_metadata['chunk_index'] = i
                    chunk_metadata['total_chunks'] = len(chunks)

                    chunked_docs.append(Document(
                        page_content=chunk,
                        metadata=chunk_metadata
                    ))

            self.logger.info(f"Создано {len(chunked_docs)} чанков документов")
            return chunked_docs

        except Exception as e:
            self.logger.error(f"Ошибка при загрузке через VLM Docling: {e}")
            self.logger.warning("Используем альтернативный загрузчик...")

            all_docs = []
            for path in file_path:
                try:
                    loader = PyPDFLoader(path)
                    docs = loader.load()

                    text_splitter = RecursiveCharacterTextSplitter(
                        chunk_size=self.chunk_size,
                        chunk_overlap=self.chunk_overlap
                    )

                    split_docs = text_splitter.split_documents(docs)
                    all_docs.extend(split_docs)

                except Exception as e2:
                    self.logger.error(f"Ошибка при загрузке {path}: {e2}")

            self.logger.info(f"Загружено {len(all_docs)} чанков через PyPDF")
            return all_docs


    def create_index(self, documents: List[Any], collection_name: str = "universal_rag"):
        """
        Создание векторного индекса из документов.

        Args:
            documents: Список документов LangChain
            collection_name: Название коллекции в векторной БД
        """
        self.logger.info("Создание векторного индекса...")

        filtered_docs = filter_complex_metadata(documents)

        self.vectorstore = Chroma.from_documents(
            documents=filtered_docs,
            embedding=self.embeddings,
            collection_name=collection_name,
            persist_directory="./chroma_db"
        )

        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        )

        self.logger.info("Индекс создан успешно!")

    def _create_prompt_template(self) -> PromptTemplate:
        """Создание шаблона промпта для RAG."""
        template = """Ты - помощник, отвечающий на вопросы строго на основе предоставленного контекста.

        Контекст из документации:
        {context}

        Вопрос пользователя: {question}

        Инструкции:
        1. Отвечай ТОЛЬКО на основе информации из контекста
        2. Если информация отсутствует в контексте, честно скажи "Информация не найдена в документе"
        3. Если вопрос не относится к теме документа, скажи "Вопрос не относится к содержанию документа"
        4. Указывай источники информации (номера страниц, если доступны)
        5. Отвечай на том же языке, на котором задан вопрос

        Ответ:"""

        return PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

    def answer_question(self, question: str, return_sources: bool = True) -> Dict[str, Any]:
        """
        Ответ на вопрос с использованием RAG.

        Args:
            question: Вопрос пользователя
            return_sources: Возвращать ли источники

        Returns:
            Словарь с ответом и метаданными
        """
        if not self.retriever:
            raise ValueError("Индекс не создан. Сначала загрузите документы и создайте индекс.")

        relevant_docs = self.retriever.get_relevant_documents(question)

        if not relevant_docs:
            return {
                "answer": "Информация не найдена в документе",
                "sources": [],
                "relevant_chunks": []
            }

        prompt = self._create_prompt_template()

        context_parts = []
        sources = []

        for i, doc in enumerate(relevant_docs):
            context_parts.append(f"[Фрагмент {i+1}]:\n{doc.page_content}")

            metadata = doc.metadata
            source_info = {
                "chunk_id": i + 1,
                "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
            }

            for page_key in ['page_number', 'page', 'page_no']:
                if page_key in metadata:
                    source_info['page_no'] = metadata[page_key]
                    break

            if 'section' in metadata:
                source_info['section'] = metadata['section']

            sources.append(source_info)

        context = "\n\n".join(context_parts)

        try:
            formatted_prompt = prompt.format(context=context, question=question)
            answer = self.llm.invoke(formatted_prompt)

            if isinstance(answer, str):
                answer = answer.strip()
                if "Ответ:" in answer:
                    answer = answer.split("Ответ:")[-1].strip()

        except Exception as e:
            self.logger.error(f"Ошибка при генерации ответа: {e}")
            answer = "Произошла ошибка при генерации ответа"

        result = {
            "answer": answer,
            "question": question,
            "sources": sources if return_sources else [],
            "relevant_chunks": [doc.page_content for doc in relevant_docs] if return_sources else []
        }

        return result

    def process_questions(self, questions: List[str]) -> List[Dict[str, Any]]:
        """
        Обработка списка вопросов.

        Args:
            questions: Список вопросов

        Returns:
            Список ответов с метаданными
        """
        results = []

        for i, question in enumerate(questions, 1):
            self.logger.info(f"Обработка вопроса {i}/{len(questions)}: {question}")
            result = self.answer_question(question)
            results.append(result)

            answer_preview = result['answer'][:200] + "..." if len(result['answer']) > 200 else result['answer']
            self.logger.info(f"Ответ: {answer_preview}")

        return results

In [6]:
logger.info("Предзагрузка моделей")

# Module-level slots for the shared models; filled in by preload_models().
global_llm = global_embeddings = None


def preload_models():
    """Load the LLM and the embedding model once and keep them in module globals.

    A throwaway RAG instance is built with default settings purely to trigger
    model loading; only its ``llm`` and ``embeddings`` are retained.
    """
    global global_llm, global_embeddings

    logger.info("Предзагрузка LLM...")
    warmup_rag = RAG()
    global_llm = warmup_rag.llm
    global_embeddings = warmup_rag.embeddings
    logger.info("Модели предзагружены")


preload_models()

def process_uploaded_file_and_question(file_path: str, question: str) -> str:
    """
    Index an uploaded file and answer one question about it.

    Args:
        file_path: Path of the uploaded file; empty/None when nothing was uploaded.
        question: The user's question.

    Returns:
        The answer followed by a list of source previews, or a human-readable
        message when the input is incomplete, no text could be extracted, or
        processing failed. Exceptions are logged and reported, not raised.
    """
    # Validate the inputs first. Without these checks a missing file surfaces
    # as an opaque "'NoneType' object is not iterable" error, and a blank
    # question triggers a pointless (and slow) document conversion.
    if not file_path:
        return "Пожалуйста, загрузите PDF-документ."
    if not question or not question.strip():
        return "Пожалуйста, введите вопрос."
    question = question.strip()

    try:
        # Reuse the preloaded models so only the index is rebuilt per request.
        rag_system = RAG(
            llm=global_llm,
            embeddings=global_embeddings
        )

        logger.info(f"Загрузка документа: {file_path}")
        documents = rag_system.load_document(file_path)

        # Both loaders can come back empty (e.g. unreadable file); there is
        # nothing to index or search in that case.
        if not documents:
            return "Не удалось извлечь текст из документа."

        logger.info("Создание векторного индекса...")
        rag_system.create_index(documents)

        logger.info(f"Обработка вопроса: {question}")
        result = rag_system.answer_question(question)

        response = result['answer']
        if result['sources']:
            response += "\n\n Источники:\n"
            for source in result['sources']:
                page_info = f" (страница {source['page_no']})" if 'page_no' in source else ""
                response += f"- {source['content']}{page_info}\n"

        return response

    except Exception as e:
        logger.exception(f"Ошибка обработки: {e}")
        return f" Критическая ошибка: {str(e)}"

# Gradio UI: a file upload and a question box side by side, a submit button,
# and a read-only answer box. Components are laid out in the order they are
# created inside the nested context managers.
with gr.Blocks(title="RAG для технической документации", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## RAG система для технической документации")
    gr.Markdown("Загрузите PDF-документ и задайте вопрос по его содержанию")

    with gr.Row():
        # Narrow column (relative scale 3): the document upload, limited to .pdf.
        with gr.Column(scale=3):
            file_input = gr.File(
                label="Загрузите PDF-документ",
                type="filepath",
                file_types=[".pdf"]
            )
        # Wide column (relative scale 7): the question text.
        with gr.Column(scale=7):
            question_input = gr.Textbox(
                label="Ваш вопрос",
                placeholder="Задайте вопрос о документе...",
                lines=3
            )

    submit_btn = gr.Button(" Получить ответ", variant="primary")

    # Output only: the user cannot edit the answer.
    answer_output = gr.Textbox(
        label="Ответ системы",
        interactive=False,
        lines=10
    )


    # On click, the handler receives the file input and the question (in that
    # order) and its return value is shown in the answer box.
    submit_btn.click(
        fn=process_uploaded_file_and_question,
        inputs=[file_input, question_input],
        outputs=answer_output
    )

logger.info("Запуск Gradio интерфейса")
# share=True requests a public share link in addition to the local server.
demo.launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0
  self.llm = HuggingFacePipeline(pipeline=pipe)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eeb14a7ad231e9d841.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


