### Install required packages

In [7]:
%pip install -r ../requirement.txt

Collecting atlassian-python-api (from -r ../requirement.txt (line 2))
  Using cached atlassian_python_api-3.41.14-py3-none-any.whl.metadata (8.8 kB)
Collecting langchain (from -r ../requirement.txt (line 3))
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community (from -r ../requirement.txt (line 4))
  Downloading langchain_community-0.2.13-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_openai (from -r ../requirement.txt (line 5))
  Downloading langchain_openai-0.1.23-py3-none-any.whl.metadata (2.6 kB)
Collecting pymongo (from -r ../requirement.txt (line 7))
  Using cached pymongo-4.8.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (22 kB)
Collecting pytesseract (from -r ../requirement.txt (line 9))
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image (from -r ../requirement.txt (line 11))
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting docx2txt (from -r ../requirement.txt (l

## Import Secret

In [8]:
import os
from dotenv import load_dotenv

load_dotenv(override=True)

# Confluence
CONFLUENCE_TOKEN = os.getenv('CONFLUENCE_TOKEN')

# Database
COSMOSDB_VCORE_CONNECTION_STRING = os.getenv('COSMOSDB_VCORE_CONNECTION_STRING')
COSMOSDB_NAMESPACE = os.getenv('COSMOSDB_NAMESPACE')

# Azure OpenAI API
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
OPENAI_API_VERSION = os.getenv('OPENAI_API_VERSION')

# LLM
AZURE_OPENAI_LLM_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_LLM_DEPLOYMENT_NAME')
AZURE_OPENAI_LLM_MODEL_NAME = os.getenv('AZURE_OPENAI_LLM_MODEL_NAME')

# Embedding
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
AZURE_OPENAI_EMBEDDING_MODEL_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_MODEL_NAME')


<img src="images/confluence-Loader-vectorstore.png" alt="confluence loader" height="100">


## Pull Confluence Documents

In [9]:
from langchain.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
    url="https://strive.devops.t-systems.net/confluence/",
    token=CONFLUENCE_TOKEN
)

confluence_documents = loader.load(
    space_key="SDV", 
    include_attachments=False, 
    limit=1000,
    max_pages=1000,
)

print(confluence_documents[0])
print(f'{len(confluence_documents)} documents read from Confluence.')

Received runtime arguments {'space_key': 'SDV', 'include_attachments': False, 'limit': 1000, 'max_pages': 1000}. Passing runtime args to `load` is deprecated. Please pass arguments during initialization instead.


page_content='Introduction T-Systems brings a Global Presence, 5000 experts and a unique SDV technology acumen built in collaboration with car manufacturers over the past  years that enabled our team to design Cutting-Edge “Open Assets” T-Systems offers a unique SDV Solution Approach combining Services and “Open Assets” that is shortening time-to-market, matching any customer technology eco-systems, and empowering OEM with New Tech, IP and knowledge In collaboration with our sister company T-Mobile (DTAG), T-Systems bring together SDV’s 3 world, Back-end platform, Embedded Systems and Connectivity offering a unique End-to-End strategic prospective to OEM SDV Knowledge Base #303030 #303030 1 solid 40% Welcome to the SDV Knowledge Base 🤗 Here you can find valuable knowledge about our work and projects. The Purpose of this Knowledge Base is to make sure everyone gets the right knowledge at the right time. This can only function when everyone is contributing to knowledge sharing 🙏 So, feel

## Split Confluence Documents

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024, 
    chunk_overlap = 300,
    length_function = len)

splitted_documents = text_splitter.split_documents(confluence_documents)

print(f'{len(splitted_documents)} chunks generated.')
print(splitted_documents[0])

4151 chunks generated.
page_content='Introduction T-Systems brings a Global Presence, 5000 experts and a unique SDV technology acumen built in collaboration with car manufacturers over the past  years that enabled our team to design Cutting-Edge “Open Assets” T-Systems offers a unique SDV Solution Approach combining Services and “Open Assets” that is shortening time-to-market, matching any customer technology eco-systems, and empowering OEM with New Tech, IP and knowledge In collaboration with our sister company T-Mobile (DTAG), T-Systems bring together SDV’s 3 world, Back-end platform, Embedded Systems and Connectivity offering a unique End-to-End strategic prospective to OEM SDV Knowledge Base #303030 #303030 1 solid 40% Welcome to the SDV Knowledge Base 🤗 Here you can find valuable knowledge about our work and projects. The Purpose of this Knowledge Base is to make sure everyone gets the right knowledge at the right time. This can only function when everyone is contributing to knowl

## Setup Openai Embeddings

In [11]:
from langchain_openai import AzureOpenAIEmbeddings

openai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    openai_api_version=OPENAI_API_VERSION,
    model=AZURE_OPENAI_EMBEDDING_MODEL_NAME,
    embedding_ctx_length=8191,
    chunk_size= 1024 
)
print(openai_embeddings)

client=<openai.resources.embeddings.Embeddings object at 0x122d67590> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1223445c0> model='text-embedding-ada-002' dimensions=None deployment='embedding-test-function' openai_api_version='2023-07-01-preview' openai_api_base=None openai_api_type='azure' openai_proxy='' embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1024 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True azure_endpoint='https://conmob-openai-deployments.openai.azure.com/' azure_ad_token=None azure_ad_token_provider=None validate_base_url=True


<img src="images/vectorstore.png" alt="Example Image" width="72" height="72">

## Store Documents to COSMOSDB vCore

In [12]:
from langchain_community.vectorstores.azure_cosmos_db import (
    AzureCosmosDBVectorSearch,
    CosmosDBSimilarityType,
)
from pymongo import MongoClient

_indexName = 'km-index'
_dbName, _collectionName = COSMOSDB_NAMESPACE.split(".")

client: MongoClient = MongoClient(COSMOSDB_VCORE_CONNECTION_STRING)
collection = client[_dbName][_collectionName]

collection.database.drop_collection(_collectionName)

vectorstore = AzureCosmosDBVectorSearch.from_documents(
    splitted_documents,
    openai_embeddings,
    collection=collection,
    index_name=_indexName,
)

num_lists = 100
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS

vectorstore.create_index(num_lists, dimensions, similarity_algorithm)


  client: MongoClient = MongoClient(COSMOSDB_VCORE_CONNECTION_STRING)


{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

<img src="images/llm.png" alt="llm" width="100" height="100">

# RAG

In [15]:
import time
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Setup of the LLM and the chain
llm = AzureChatOpenAI(
    azure_deployment=AZURE_OPENAI_LLM_DEPLOYMENT_NAME, 
    model=AZURE_OPENAI_LLM_MODEL_NAME,
    temperature=0.8
)

chain = load_qa_with_sources_chain(
    llm, 
    chain_type="stuff"
)

query = "Does the context provide who the first man on the moon was?"
# query = "Who is the team lead of knowledge management?"
# query = "Who is Sirin Tiryaki?"
# query = "What is Knowledge Management?"
# query = "What is Knowledge Management? Explain in German."
# query = "Who is Jörg Tischler?"
# query = "Who is the Business Unit Lead in Connected Mobility at T-Systems?"
# query = "How can I book an external training?"
# query = "Explain SDV to a 10-year-old."
# query = "what are hypercubes?"
# query = "what are hypercubes?"

matching_docs = vectorstore.similarity_search(query, k=5)

answer = chain.run(input_documents=matching_docs, question=query)

print(f"Found {len(matching_docs)} matching documents.")
print(f"Answer: {answer}")

Found 5 matching documents.
Answer: The context does not provide information on who the first man on the moon was.
SOURCES:
