# Chat with contents in Magnolia CMS

An implementation of a LangChain loader for content stored in Magnolia CMS, together with a usage example that asks questions about that content.

## 1. Contents Loading

In [None]:
# !pip install langchain
# !pip install langchain-community

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json

def fetchMagnoliaContents(url, username, password, timeout=30):
    """Fetch the list of content nodes from a `Magnolia CMS` REST endpoint.

    It relies on the restEndpoints defined in Magnolia: an HTTP GET with
    basic authentication is sent to ``url`` and the ``results`` entry of the
    JSON response is returned.

    Args:
        url: URL of the Magnolia REST endpoint to query.
        username: User name used for HTTP basic authentication.
        password: Password used for HTTP basic authentication.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled server would block the
            call indefinitely.

    Returns:
        The value stored under the ``results`` key of the decoded JSON body.

    Raises:
        NameError: If the response status is not OK. The exception type is
            kept as-is so existing callers that catch it keep working.
        requests.exceptions.RequestException: On connection problems or when
            ``timeout`` expires.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the decoded JSON has no ``results`` entry.
    """
    response = requests.get(
        url=url,
        auth=HTTPBasicAuth(username, password),
        timeout=timeout,
    )
    if not response.ok:
        raise NameError(f'Failed to fetch {url} from Magnolia: {response.status_code}')

    # Force UTF-8 so that `response.text` is decoded as UTF-8 regardless of
    # the charset (if any) announced in the response headers.
    response.encoding = 'utf-8'
    content = json.loads(response.text)

    return content['results']


In [None]:
from langchain_core.documents import Document

def createDocumentFromMagnoliaContent(magnoliaContent, contentProperty, url) -> Document:
    """Create a langchain Document from a content node returned by Magnolia.

    It maps the schema of Magnolia restEndpoints onto a langchain Document:
    the markup stored in ``contentProperty`` is reduced to plain text and
    used as page content, while the node's system properties become the
    document metadata.

    Args:
        magnoliaContent: Mapping describing one content node. It must contain
            ``contentProperty`` plus the keys ``@id``, ``@name``, ``@path``,
            ``@nodeType``, ``mgnl:lastModified`` and ``mgnl:lastModifiedBy``.
        contentProperty: Name of the property holding the (HTML) text.
        url: Prefix prepended to the node's ``@path`` to build the ``url``
            metadata entry.

    Returns:
        A Document whose ``page_content`` is the plain text of the property.

    Raises:
        ImportError: If ``beautifulsoup4`` is not installed.
        KeyError: If a required key is missing from ``magnoliaContent``.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError as err:
        raise ImportError(
            "`beautifulsoup4` package not found, please run "
            "`pip install beautifulsoup4`"
        ) from err

    content = magnoliaContent[contentProperty]
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ between environments.
    plainTextContent = BeautifulSoup(content, "html.parser").get_text(
        " ", strip=True
    )

    return Document(
        page_content=plainTextContent,
        metadata={
            "id": magnoliaContent['@id'],
            "name": magnoliaContent['@name'],
            "path": magnoliaContent['@path'],
            "url": f'{url}{magnoliaContent["@path"]}',
            "nodeType": magnoliaContent['@nodeType'],
            # Each metadata key reads the Magnolia property of the same name
            # (the two values used to be swapped).
            "lastModified": magnoliaContent["mgnl:lastModified"],
            "lastModifiedBy": magnoliaContent["mgnl:lastModifiedBy"]
        }
    )

In [None]:
from typing import Iterator, List

def _lazy_load(url, contentProperty, username, password) -> Iterator[Document]:
    """Lazily yield one Document per content node fetched from ``url``.

    The endpoint is queried when the first item is requested; each returned
    node is then converted on demand, using ``contentProperty`` as the source
    of the document text.
    """
    yield from (
        createDocumentFromMagnoliaContent(
            magnoliaContent=node,
            contentProperty=contentProperty,
            url=url,
        )
        for node in fetchMagnoliaContents(
            url=url, username=username, password=password
        )
    )
        
def load(url, username, password, contentProperty) -> List[Document]:
    """Load all documents exposed by a restEndpoint of Magnolia CMS.

    url: the url of the endpoint to read from
    username: user name used to authenticate against the endpoint
    password: password used to authenticate against the endpoint
    contentProperty: the property used as source of text for embedding
    """
    documents = _lazy_load(
        url=url,
        contentProperty=contentProperty,
        username=username,
        password=password,
    )
    # Drain the lazy iterator into a concrete list.
    return [*documents]

# Read every node served by the local `tours` delivery endpoint; the `body`
# property of each node is the text source of the resulting documents.
docs = load(
    url="http://localhost:8080/.rest/delivery/tours/v1",
    contentProperty="body",
    username="superuser",
    password="superuser",
)

## 2. Contents Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Small chunks (size 180) with a short overlap (14) between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=14, chunk_size=180)

# Break the loaded documents into chunks ready for embedding.
chunks = splitter.split_documents(docs)

## 3. Contents Embedding and Storage

In [None]:
from langchain_core.embeddings import Embeddings

def createEmbeddingOllama() -> Embeddings:
    """
    Returns Embedding model using local models in Ollama

    The model used is `nomic-embed-text`; run `ollama pull nomic-embed-text`
    beforehand to pull it down.

    Raises:
        ImportError: If the `langchain-community` package is not installed.
    """
    # `OllamaEmbeddings` is not imported anywhere else in this notebook, so
    # it is imported here to keep the function from failing with NameError.
    try:
        from langchain_community.embeddings import OllamaEmbeddings
    except ImportError as err:
        raise ImportError(
            "`langchain-community` package not found, please run "
            "`pip install langchain-community`"
        ) from err

    return OllamaEmbeddings(
        model='nomic-embed-text'
    )

In [None]:
# !pip install -U langchain-openai

In [None]:
import getpass
import os
from langchain_openai import OpenAIEmbeddings

def createEmbeddingOpenAI() -> Embeddings:
    """
    Returns Embedding model using OpenAI models

    The API key is read from the `OPENAI_API_KEY` environment variable. Only
    when that variable is missing or empty is the user prompted for a key,
    which is then stored in the environment of the current process.
    """
    # Do not clobber a key that is already configured; this also lets the
    # function run in non-interactive sessions where prompting is impossible.
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass('To continue, specify an OpenAI API Key')

    return OpenAIEmbeddings(model='text-embedding-3-small')

In [None]:
# !pip install langchain-chroma

In [None]:
from langchain.vectorstores import Chroma

# Embedding model used to vectorise the chunks (OpenAI-backed here).
embedding = createEmbeddingOpenAI()

# Build the Chroma vector store from the chunks using that embedding model.
vectordb = Chroma.from_documents(embedding=embedding, documents=chunks)

## 4. Contents Retrieval

In [None]:
question = "Find tours for cycling"

# Ask the vector store for the 5 chunks most similar to the question.
similarDocs = vectordb.similarity_search(question, k=5)

# Print each hit as: name, a dashed underline of the same length, its text.
for doc in similarDocs:
    name, content = doc.metadata["name"], doc.page_content
    print(f"{name}")
    print("-" * len(name))
    print(f"{content}\n")