# Vorbereitungen

In [None]:
# Install/upgrade the packages this notebook uses; chromadb is pinned to 0.3.29.
!pip install -U langchain langchain-community langchainhub langchain-openai chromadb==0.3.29 wikipedia-api

In [None]:
# Read the OpenAI API key and expose it under the OPENAI_API_KEY environment variable.
import os

try:
    # Preferred source: the Colab `userdata` store (only importable on Google Colab).
    from google.colab import userdata
    OPENAI_KEY = userdata.get('OPENAI_KEY')
except Exception:
    # Any failure above (e.g. not running on Colab) falls back to the
    # OPENAI_KEY environment variable. `Exception` instead of a bare `except`
    # so KeyboardInterrupt / SystemExit are no longer swallowed.
    OPENAI_KEY = os.getenv('OPENAI_KEY')

if not OPENAI_KEY:
    # Fail here with a readable message instead of the opaque TypeError that
    # `os.environ[...] = None` would raise.
    raise RuntimeError(
        "No OpenAI key found: store it as Colab secret 'OPENAI_KEY' "
        "or set the environment variable OPENAI_KEY."
    )

os.environ['OPENAI_API_KEY'] = OPENAI_KEY


# Beispiel-Dokumente von Wikipedia

In [None]:
import wikipediaapi
from pathlib import Path

In [None]:
# Title of the Wikipedia article to download; also used as the local file name.
page_name = 'Matrix_(Film)'

# Wikipedia client returning plain-text ("WIKI") extracts.
# NOTE(review): the two positional arguments are presumably the user agent
# and the language code ('de') — confirm against the wikipedia-api docs.
wiki = wikipediaapi.Wikipedia('LangChain RAG', 'de', extract_format=wikipediaapi.ExtractFormat.WIKI)
page = wiki.page(page_name)

# write_text() does not create missing parent directories, so make sure the
# target folder exists (a fresh checkout / Colab session has no data/wiki/).
Path('data/wiki').mkdir(parents=True, exist_ok=True)
Path(f'data/wiki/{page_name}.txt').write_text(page.text)

# LangChain RAG

## Preprocessing - Dokument laden

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [None]:
# Point a TextLoader at the article file under data/wiki/ and load it into `docs`.
loader = TextLoader('data/wiki/Matrix_(Film).txt')
docs = loader.load()

In [None]:
# Number of documents returned by the loader.
len(docs)

## Preprocessing - Dokument chunken

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) — confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_chunks = text_splitter.split_documents(docs)

In [None]:
# Preview the first four chunks.
doc_chunks[:4]

## Chunks embedden und in Vektor-Datenbank speichern

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [None]:
# Build a Chroma vector store from the chunks, embedding them with the
# 'text-embedding-3-small' OpenAI embedding model.
# NOTE(review): no persistence arguments are passed — presumably the store
# lives in memory only; confirm against the Chroma docs.
vectorstore = Chroma.from_documents(documents=doc_chunks, embedding=OpenAIEmbeddings(model='text-embedding-3-small'))

In [None]:
# Search the vector store for the chunks that best match the query.
# No result count is passed, so the vector store's default applies.
query = "Wer sind die Hauptdarsteller*innen in Matrix?"
result_docs = vectorstore.similarity_search(query)

In [None]:
# Inspect the chunks returned by the similarity search.
result_docs

## Q&A - Passende Antwort mit LLM erzeugen lassen

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

In [None]:
# Chat model used to generate the answer.
# NOTE(review): no credentials are passed here — presumably the client picks
# up the OPENAI_API_KEY environment variable; confirm.
llm = ChatOpenAI(model='gpt-3.5-turbo')

### Prompt zusammenbauen

Wir bauen einen Prompt mit folgender Struktur:
- Aufgabe, die erledigt werden soll ("Anhand der folgenden Quellen beantworte die Frage ...")
- Kontext (die im vorherigen Schritt gefundenen Dokumente)
- Frage

In [None]:
# Render every retrieved chunk as a numbered "Source N" section fenced by
# '---' lines and followed by a blank line, then glue the sections together.
sources_prompt = ''.join(
    f'Source {number}\n---\n{doc.page_content}\n---\n\n'
    for number, doc in enumerate(result_docs, start=1)
)

In [None]:
# Show the assembled sources section.
print(sources_prompt)

In [None]:
# Final prompt: task instruction, the retrieved sources, the user's question,
# and a trailing "Answer:" cue for the model to continue from.
prompt = f'''Based on the following sources answer the question of the user.

Sources:
{sources_prompt}

Question: {query}

Answer:'''

In [None]:
# Show the complete prompt that will be sent to the model.
print(prompt)

### Prompt an OpenAI schicken und über Antwort freuen :)

In [None]:
# Send the prompt as a single human message. The argument is a list of
# message lists; here it contains exactly one conversation with one message.
result = llm.generate([[HumanMessage(content=prompt)]])

In [None]:
# Text of the first generation for the first (and only) submitted conversation.
result.generations[0][0].text