# Retrieval-augmented generation

## Build vector store

### Option 1: OpenAI Embeddings

In [2]:
from langchain_openai import OpenAIEmbeddings
# Option 1: OpenAI-hosted embedding model, constructed with default arguments only.
# NOTE(review): no API key is passed here, so it is presumably read from the
# environment — confirm before running on a fresh machine.
embeddings = OpenAIEmbeddings()

### Option 2: Manifestoberta Embeddings

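Manifestoberta needs to be wrapped in an `Embeddings` subclass to be compatible with LangChain. Note that running this cell overwrites the `embeddings` object created under Option 1, so run only one of the two options.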

In [3]:
from transformers import AutoModel, AutoTokenizer
from langchain_core.embeddings import Embeddings
import torch
from typing import List

class ManifestoBertaEmbeddings(Embeddings):
    """Embeddings using ManifestoBerta for use with LangChain.

    A text is embedded by running it through the encoder and averaging the
    last-layer hidden states of its tokens. The average is weighted by the
    tokenizer's attention mask so that padding tokens, which appear as soon as
    several texts of different length share a batch, do not dilute the result.

    Args:
        model_name: Hugging Face model id (or local path) to load tokenizer and
            encoder from. Defaults to the ManifestoBerta sentence model.
        batch_size: Number of texts sent through the model per forward pass in
            ``embed_documents``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    DEFAULT_MODEL_NAME = (
        "manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1"
    )

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME, batch_size: int = 16):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size

        # Load the tokenizer and model from the same checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: make sure dropout is disabled
        self.model.eval()

    def _forward(self, texts: List[str]):
        """Tokenize ``texts`` and run one forward pass without gradients.

        Args:
            texts: The texts to encode together as one padded batch.

        Returns:
            A tuple ``(last_hidden_states, attention_mask)`` as produced by the
            model and the tokenizer for this batch.
        """
        inputs = self.tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True, max_length=512
        )

        # Get model output (output_hidden_states must be True to access hidden_states)
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        return outputs.hidden_states[-1], inputs["attention_mask"]

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average token vectors per text, ignoring positions masked out as padding."""
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against a division by zero should a row contain no tokens
        token_counts = weights.sum(dim=1).clamp(min=1)
        return (hidden_states * weights).sum(dim=1) / token_counts

    def _embed(self, text: str, sentence_level=True) -> List:
        """Embed a single text using ManifestoBerta.

        Args:
            text: The text to embed.
            sentence_level: If True (default), return one vector for the whole
                text (mean over its tokens). If False, return the per-token
                vectors instead.

        Returns:
            A list of floats when ``sentence_level`` is True, otherwise a list
            with one list of floats per token.
        """
        hidden_states, attention_mask = self._forward([text])

        if sentence_level:
            embedding = self._masked_mean(hidden_states, attention_mask)
        else:
            embedding = hidden_states

        # The batch holds exactly one text, so return its single entry
        return embedding.cpu().tolist()[0]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts, processing them in batches of ``self.batch_size``.

        Args:
            texts: The texts to embed.

        Returns:
            One sentence-level vector per input text, in input order. An empty
            input yields an empty list.
        """
        texts = list(texts)
        vectors: List[List[float]] = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start : start + self.batch_size]
            hidden_states, attention_mask = self._forward(batch)
            vectors.extend(self._masked_mean(hidden_states, attention_mask).cpu().tolist())
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text with the same pooling used for documents."""
        return self._embed(text)


embeddings = ManifestoBertaEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaModel were not initialized from the model checkpoint at manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Create or load vector database

In [4]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import os

# Reuse the persisted Chroma store when its directory is present; otherwise
# build it from the manifesto PDFs and persist it to the same directory.
# NOTE(review): only the existence of "./chroma" is checked — nothing here
# verifies which embedding model the stored vectors were created with.
if os.path.exists("./chroma"):
    # Load the vector storage that already exists
    db = Chroma(persist_directory="./chroma", embedding_function=embeddings)

else:
    # file name and page number are stored as metadata when loading from directory
    loader = PyPDFDirectoryLoader("../data/manifestos/01_pdf_originals/")

    # alternative for quick testing:
    # loader = PyPDFDirectoryLoader("./dummy_pdf")

    docs = loader.load()

    # Cut the pages into chunks of up to 1000 characters with 100 characters overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = text_splitter.split_documents(docs)

    # Embed all chunks and write the store to disk
    db = Chroma.from_documents(splits, embeddings, persist_directory="./chroma")

## Retrieval and generation

TODO: Develop a solution to retrieve from multiple parties in a balanced way (e.g., using multiple retrievers).

In [20]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Chat model that generates the final answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", max_tokens=1000, temperature=0.7)

# Similarity search over the vector store; k=10 chunks are requested per query.
# Disabled experiment: restrict retrieval to a single manifesto via a Chroma
# "where" filter (https://docs.trychroma.com/usage-guide#using-where-filters):
#     "where": {"source": ""}  # TODO: add an exact file name here for testing purposes
retriever = db.as_retriever(search_type="similarity", search_kwargs=dict(k=10))

# Prompt template (in German) for the generation step. It tells the model to
# summarise and compare party positions for the 2024 European election, to
# answer only from the supplied context, and to give a short justification
# when the context does not allow an answer.
# Template variables: {context} (retrieved text) and {question} (user question).
question_prompt = ChatPromptTemplate.from_template("""
Du hilfst dabei, die politischen Positionen verschiedener Parteien zur Europawahl 2024 zusammenzufassen und zu vergleichen.
Beantworte die folgende Frage nur auf dem zur Verfügung gestellten Kontext.
Falls sich die Frage auf Basis des Kontexts nicht beantworten lässt, gib eine kurze Begründung an.
                                          
KONTEXT:
{context}

FRAGE: {question}

"""
)

### Basic chain

In [6]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Minimal RAG pipeline: the question is forwarded unchanged, and the retriever's
# result is handed to the prompt as "context" without any further formatting.
_basic_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = _basic_inputs | question_prompt | llm | StrOutputParser()

# Example question (German): approaches to the climate crisis proposed by the parties
chain.invoke(
    "Welche Ansätze zur Bewältigung der Klimakrise werden von den verschiedenen Parteien vorgeschlagen?"
)

### Slightly better chain

In [21]:
from langchain_core.runnables import RunnableParallel
from langchain_core.prompts import PromptTemplate, format_document

def _combine_documents(docs, document_separator="\n\n"):
    """Render retrieved documents into a single context string.

    Each document is formatted with a short German header that names its
    ``source`` followed by its ``page_content``; the formatted pieces are then
    joined with ``document_separator``.

    Args:
        docs: Iterable of documents to format.
        document_separator: String placed between two formatted documents.

    Returns:
        The concatenated, formatted documents as one string.
    """
    excerpt_template = "Ausschnitt aus dem Europawahlprogramm 2024 '{source}': \n {page_content}"
    excerpt_prompt = PromptTemplate.from_template(template=excerpt_template)

    formatted_excerpts = (format_document(document, excerpt_prompt) for document in docs)
    return document_separator.join(formatted_excerpts)

# Fan the incoming question out into the two prompt variables:
#   question -> passed through unchanged
#   context  -> retrieved for the question, then formatted by _combine_documents
_inputs = RunnableParallel(
    question=RunnablePassthrough(),
    context=RunnablePassthrough() | retriever | _combine_documents,
)

# Full pipeline: build inputs, fill the prompt, call the model, return plain text
chain = _inputs | question_prompt | llm | StrOutputParser()

# Example question (German): approaches to the climate crisis proposed by the parties
chain.invoke("Welche Ansätze zur Bewältigung der Klimakrise werden von den verschiedenen Parteien vorgeschlagen?")

'Die Linke setzt auf eine konsequente Energiewende und erneuerbare Energien, kritisiert jedoch konservative und liberale Parteien für ihre angebliche Verlangsamung der Energiewende zugunsten fossiler Energieträger. Die AfD hingegen zweifelt den menschengemachten Klimawandel an und setzt auf Anpassungsstrategien an vermeintliche natürliche Klimaveränderungen. Sie lehnt den Ausbau von Windkraftanlagen ab und betont den Erhalt von fossilen Energieträgern. Die Grünen hingegen setzen auf Naturschutzmaßnahmen zur Bewältigung der Klimakrise, wie die Renaturierung von Gewässern und den Ausbau erneuerbarer Energien.'