# 14.1 内部知识库 Q&A 系统

完整实现一个文档问答系统（RAG）。

**预估成本**: ~$0.10

In [None]:
# Install the OpenAI SDK and ChromaDB; -q keeps pip's output quiet.
!pip install openai chromadb -q

In [None]:
import hashlib
import os
import re

import chromadb
from openai import OpenAI

# Module-level OpenAI client; the API key is taken from the OPENAI_API_KEY
# environment variable (os.environ.get returns None when it is unset).
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

## 知识库系统实现

In [None]:
class KnowledgeBase:
    """Minimal retrieval-augmented Q&A store backed by ChromaDB and OpenAI.

    Documents are split into chunks, each chunk is embedded with
    ``text-embedding-3-small`` and stored in a Chroma collection.  Questions
    are answered by retrieving the nearest chunks and passing them to
    ``gpt-4o-mini`` as context.
    """

    # Han ideographs (incl. Extension A) and Japanese kana.  These scripts
    # are written without spaces, so whitespace splitting alone would treat
    # a whole paragraph as a single "word".
    _CJK = r'\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff'
    # A token is one CJK character, or a run of other non-whitespace chars.
    _TOKEN_RE = re.compile(rf'[{_CJK}]|[^\s{_CJK}]+')

    def __init__(self, collection_name='docs'):
        """Create the OpenAI client and open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection to use.
        """
        self.client = OpenAI()
        self.chroma = chromadb.Client()
        self.collection = self.chroma.get_or_create_collection(collection_name)

    def add_document(self, content, source, metadata=None, chunk_size=500):
        """Chunk, embed and store a document.

        Chunk IDs are derived from ``source`` and the chunk index, so adding
        the same source again replaces the chunks stored under those IDs.
        Chunks left over from a previously longer version of the document
        (higher indices) are not removed.

        Args:
            content: Full text of the document.
            source: Identifier of the document (e.g. a file name).
            metadata: Optional extra metadata merged into every chunk.
            chunk_size: Maximum number of tokens per chunk (see
                ``_split_chunks`` for what counts as a token).
        """
        chunks = self._split_chunks(content, chunk_size)
        for i, chunk in enumerate(chunks):
            emb = self._embed(chunk)
            # md5 is used only to obtain a stable, fixed-length ID.
            chunk_id = hashlib.md5(f'{source}_{i}'.encode()).hexdigest()
            # upsert rather than add: add() does not overwrite an existing
            # ID, so re-indexing an edited document would keep stale text.
            self.collection.upsert(
                ids=[chunk_id],
                embeddings=[emb],
                documents=[chunk],
                metadatas=[{'source': source, 'chunk_index': i, **(metadata or {})}]
            )
        print(f'✓ 已添加: {source} ({len(chunks)} 片段)')

    def _split_chunks(self, text, size):
        """Split ``text`` into chunks of at most ``size`` tokens.

        A token is a whitespace-delimited word, except that every CJK
        character counts as its own token so that text written without
        spaces is still split.  Whitespace inside a chunk is collapsed to
        single spaces; no spaces are inserted between CJK characters.

        Args:
            text: Text to split.
            size: Maximum tokens per chunk; must be at least 1.

        Returns:
            List of chunk strings (empty when ``text`` has no tokens).

        Raises:
            ValueError: If ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError('size must be a positive integer')
        spans = [m.span() for m in self._TOKEN_RE.finditer(text)]
        chunks = []
        for first in range(0, len(spans), size):
            window = spans[first:first + size]
            start, end = window[0][0], window[-1][1]
            # Slice the original text so CJK runs stay contiguous, then
            # normalise whitespace the same way ' '.join(words) would.
            chunks.append(' '.join(text[start:end].split()))
        return chunks

    def _embed(self, text):
        """Return the embedding vector for ``text``."""
        r = self.client.embeddings.create(model='text-embedding-3-small', input=text)
        return r.data[0].embedding

    def search(self, question, top_k=3):
        """Return up to ``top_k`` stored chunks nearest to ``question``.

        Args:
            question: Query text.
            top_k: Maximum number of chunks to return.

        Returns:
            List of dicts with keys ``content``, ``source`` and ``distance``;
            empty when the collection holds no chunks.
        """
        available = self.collection.count()
        if available == 0:
            # Nothing to retrieve: skip the embedding call entirely.
            return []
        emb = self._embed(question)
        # Never request more neighbours than exist; depending on the Chroma
        # version that produces a warning or an error.
        results = self.collection.query(
            query_embeddings=[emb],
            n_results=min(top_k, available),
        )
        # Results are nested per query; only one query was sent.
        rows = zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
        return [
            {'content': doc, 'source': meta['source'], 'distance': dist}
            for doc, meta, dist in rows
        ]

    def ask(self, question, top_k=3):
        """Answer ``question`` using only the retrieved chunks as context.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve as context.

        Returns:
            Dict with ``answer`` (model reply text) and ``sources`` (one entry
            per retrieved chunk: its ``source`` and a 100-character
            ``snippet``).  When nothing is retrieved a fixed "not found"
            answer with an empty source list is returned without calling the
            chat model.
        """
        docs = self.search(question, top_k)
        if not docs:
            return {'answer': '知识库中未找到相关信息', 'sources': []}

        docs_text = '\n\n'.join(
            f"[文档{n}] {d['source']}\n{d['content']}"
            for n, d in enumerate(docs, start=1)
        )
        prompt = f'''基于以下文档回答问题。只使用文档内容，不要编造。

文档：
{docs_text}

问题：{question}

回答：'''

        response = self.client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[{'role': 'user', 'content': prompt}],
            temperature=0.3
        )

        return {
            'answer': response.choices[0].message.content,
            'sources': [{'source': d['source'], 'snippet': d['content'][:100]} for d in docs]
        }

# Smoke test: index one short document, then ask a question about it.
kb = KnowledgeBase()
kb.add_document(
    content='Nginx SSL配置：使用certbot生成证书，配置/etc/nginx/sites-available/default，重启nginx。',
    source='nginx-ssl.md',
)

# Show the generated answer followed by the chunks it was grounded on.
result = kb.ask(question='如何配置Nginx SSL？')
print(f"回答：{result['answer']}")
print(f"来源：{result['sources']}")