In [None]:

#########################################
######### CONFLUENCE & CHUNKING #########
#########################################

from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

from dotenv import load_dotenv
import os
import openai 
load_dotenv()
#
# Configure the module-level openai settings for an Azure deployment.
# The key, base URL and API version are read from environment variables after
# load_dotenv() has run; os.getenv returns None for any variable that is unset,
# so a missing value is not reported here.
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

In [None]:

#########################################
# Ingest Confluence Page 
#########################################
from langchain.document_loaders import ConfluenceLoader

# Confluence connection settings.
# The API token is a secret, so it is read from the environment (for example
# from the .env file picked up by load_dotenv()) instead of being committed
# with the notebook.
# NOTE: the token that used to be hardcoded on this line has been exposed and
# should be revoked and replaced.
CONFLUENCE_API_TOKEN = os.getenv("CONFLUENCE_API_TOKEN")
if not CONFLUENCE_API_TOKEN:
    raise RuntimeError(
        "CONFLUENCE_API_TOKEN is not set; add it to the environment or to the .env file"
    )
CONFLUENCE_BASE_URL = "https://wiki.softtech.com.tr/"
CONFLUENCE_SPACE_KEY = "SDO"

# Token-based loader for the wiki at CONFLUENCE_BASE_URL (cloud=False).
loader = ConfluenceLoader(
    url=CONFLUENCE_BASE_URL, token=CONFLUENCE_API_TOKEN, cloud=False
)

# Load every page of the space, attachments included.
docs = loader.load(space_key=CONFLUENCE_SPACE_KEY, include_attachments=True)

# Look at one page content and its metadata (guarded: an empty space would
# otherwise fail with IndexError on docs[-1]).
if docs:
    print("Content: \n ------- \n" + docs[-1].page_content)
    print("Metadatas: \n ------- \n" + str(docs[-1].metadata))
else:
    print("No pages were loaded from space " + CONFLUENCE_SPACE_KEY)



In [None]:
##################
# Split chunks 
##################
def my_custom_splitter(docs):
    """Split loaded pages into small chunks in two passes.

    Pass 1 splits each page on Markdown headers (``#``, ``##``, ``###``) and
    merges the page's own metadata into every resulting section.
    Pass 2 cuts the sections into chunks of at most 500 characters with a
    20-character overlap, trying paragraph breaks, line breaks, sentence ends
    and spaces (in that order) before falling back to a hard character cut.

    Args:
        docs: iterable of documents exposing ``page_content`` (text) and
            ``metadata`` (a mapping that supports the ``|`` merge operator).

    Returns:
        The chunk documents returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Markdown header levels and the metadata key each one is stored under.
    headers_to_split_on = [
        ("#", "Title 1"),
        ("##", "Subtitle-title 1"),
        ("###", "Subtitle-title 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # Split based on markdown and add original metadata.
    md_docs = []
    for doc in docs:
        sections = markdown_splitter.split_text(doc.page_content)
        for section in sections:
            # On a key clash the page-level metadata wins (right operand of |).
            section.metadata = section.metadata | doc.metadata
        md_docs.extend(sections)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        # Raw string: "\." is not a valid escape in a normal string literal.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        # The sentence separator is a regex look-behind. Without this flag the
        # separators are escaped and matched literally, so it would never fire.
        is_separator_regex=True,
    )

    return splitter.split_documents(md_docs)

In [None]:
# Run the two-stage splitter over every page loaded from Confluence.
chunks = my_custom_splitter(docs) 

In [None]:
def pretty_print(chunks):
    """Print each chunk's text and metadata, separated by ruler lines.

    For every chunk the text is printed first, then a line of 50 dashes, then
    the string form of its metadata. Consecutive chunks are separated by a
    line of 50 equals signs.

    Args:
        chunks: iterable of objects exposing ``page_content`` (str) and
            ``metadata``.
    """
    between_chunks = '\n' + '=' * 50 + '\n'
    inner_rule = '-' * 50

    rendered = []
    for item in chunks:
        rendered.append(item.page_content + '\n' + inner_rule + '\n' + str(item.metadata))

    print(between_chunks.join(rendered))

In [None]:
# Report how many chunks the splitter produced.
print("Number of cnk: " + str(len(chunks)))

# Preview the first three chunks together with their metadata.
pretty_print(chunks[:3])

In [None]:
 ##############################
 #### COLLECTION - Vector DB 
 ##############################

from langchain.vectorstores import Milvus
from langchain.embeddings import OpenAIEmbeddings 
from pymilvus import connections, utility, FieldSchema, Collection, CollectionSchema, DataType


# Milvus / embedding settings used by the cells below.
COLLECTION_NAME = "SDO"  # os.getenv("COLLECTION_NAME")
EMBEDDING_MODEL = os.getenv("OPENAI_ENGINE")
# Host and port can be overridden from the environment; the previous hardcoded
# values stay as defaults so existing setups keep working.
MILVUS_HOST = os.getenv("MILVUS_HOST", "23.236.50.189")
MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
DIMENSION = 1536  # length of the vectors stored in the 'embedding' field

# Open the connection under the "default" alias (pymilvus' default alias,
# spelled out here so the check below queries the same connection).
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
# has_connection is a method: referencing it without calling it is always
# truthy, so it must be called with the alias to really test the connection.
if connections.has_connection("default"):
    print("Connection estabilished")

In [None]:
######################################
### CREATE COLLECTION & INDEX ########
######################################
# Create the collection (schema + HNSW index) on first run; otherwise attach to
# the existing one. Either way `collection` ends up loaded for the cells below.
if not utility.has_collection(COLLECTION_NAME):
    print(f"Creating {COLLECTION_NAME} collection")
    # 1. define fields -- the order must match the column lists built by
    #    insert_chunks: pk, id, title, page_content, source, embedding.
    #    The keyword is `description`; the former `descrition` typo meant the
    #    field descriptions were never stored.
    # NOTE(review): 'id' is INT32 -- confirm Confluence page ids always fit.
    fields = [
        FieldSchema(name='pk', dtype=DataType.INT64, description='pk', is_primary=True, auto_id=False),
        FieldSchema(name='id', dtype=DataType.INT32, description='page id', is_primary=False, auto_id=False),
        FieldSchema(name='title', dtype=DataType.VARCHAR, description='titles', max_length=5000),
        FieldSchema(name='page_content', dtype=DataType.VARCHAR, description='page_content', max_length=5000),
        FieldSchema(name='source', dtype=DataType.VARCHAR, description='sources', max_length=5000),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='embedding vectors', dim=DIMENSION)
    ]
    # 2. build the collection schema from the field definitions
    schema = CollectionSchema(fields=fields, description='wiki softtech devops pages')
    # 3. reference the schema in a collection
    collection = Collection(name=COLLECTION_NAME, schema=schema, consistency_level="Strong")
    # 4. index the vector field (HNSW, L2 distance)
    INDEX_PARAM = {
        'metric_type': 'L2',
        'index_type': "HNSW",
        'params': {'M': 8, 'efConstruction': 64}
    }

    collection.create_index(field_name="embedding", index_params=INDEX_PARAM)

    # 5. load the collection
    collection.load()

    print(f"{COLLECTION_NAME} collection loaded")

else:
    # The collection already exists: attach to it and load it.
    collection = Collection(f"{COLLECTION_NAME}")
    collection.load()
    print(f"{COLLECTION_NAME} collection reloaded")


In [None]:
# NOTE(review): the message below says "Creating", but this cell only calls
# flush() and load() on the `collection` obtained in the previous cell --
# confirm whether the wording (or the cell itself) is still intended.
print(f"Creating {COLLECTION_NAME} collection")
collection.flush()
collection.load()

In [None]:
##########################
### INSERT Chunks ########
##########################
  
import numpy as np
from tqdm import tqdm
import time
 

def insert_chunks(chunks, embed, collection, batch_size=2, max_chunks=4,
                  pause_seconds=6, retry_wait_seconds=9, max_retries=5):
    """Embed chunks batch by batch and insert them into a collection.

    Each batch is sent to ``collection.insert`` as six parallel column lists:
    ``[pk, id, title, page_content, source, embedding]``. ``pk`` is the
    1-based position of the chunk within the selected chunks.

    When the embedding call is rate limited, only the current batch is retried.
    The previous implementation restarted the whole function, which inserted
    the already-written batches a second time.

    Args:
        chunks: sliceable sequence of chunk documents; each exposes
            ``page_content`` and a ``metadata`` mapping with the keys
            ``'id'``, ``'title'`` and ``'source'``.
        embed: callable taking a list of texts and returning one embedding per
            text, in the same order.
        collection: object whose ``insert`` method accepts the column lists.
        batch_size: number of chunks embedded and inserted per batch.
        max_chunks: only the first ``max_chunks`` chunks are processed (the
            default 4 keeps the original behaviour); pass ``None`` for all.
        pause_seconds: pause between two consecutive batches.
        retry_wait_seconds: pause before retrying a rate-limited batch.
        max_retries: how many times one batch is retried before giving up.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        openai.RateLimitError: if a batch is still rate limited after
            ``max_retries`` retries.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    selected = chunks if max_chunks is None else chunks[:max_chunks]
    total = len(selected)

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        # Column-oriented payload: pk, id, title, page_content, source, embedding.
        batch_data = [[], [], [], [], [], []]
        for i in tqdm(range(start, end)):
            chunk = selected[i]
            batch_data[0].append(np.int64(i + 1))
            batch_data[1].append(np.int32(chunk.metadata['id']))
            batch_data[2].append(chunk.metadata['title'])
            batch_data[3].append(chunk.page_content)
            batch_data[4].append(chunk.metadata['source'])

        # Embed this batch, retrying only this batch when rate limited.
        attempts = 0
        while True:
            try:
                embeddings = embed(list(batch_data[3]))  # send as a list
                break
            except openai.RateLimitError:
                attempts += 1
                if attempts > max_retries:
                    raise
                print(f"Rate limit exceeded. Retrying after {retry_wait_seconds} seconds...")
                time.sleep(retry_wait_seconds)
        batch_data[5].extend(embeddings)

        collection.insert(batch_data)
        print("End of batch, next start index: ", end)

        # Throttle between batches; no need to wait after the last one.
        if end < total:
            print(f"Waiting for {pause_seconds} seconds...")
            time.sleep(pause_seconds)

In [None]:
from openai import AzureOpenAI

# Azure OpenAI client used to create the embeddings.
# The API key is a secret and is read from the environment: AZURE_OPENAI_API_KEY
# first, then OPENAI_API_KEY (the variable already used for the module-level
# openai configuration at the top of the notebook).
# NOTE: the key that used to be hardcoded here has been exposed and should be
# revoked and replaced.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
if not AZURE_OPENAI_API_KEY:
    raise RuntimeError(
        "No Azure OpenAI key found; set AZURE_OPENAI_API_KEY (or OPENAI_API_KEY) "
        "in the environment or in the .env file"
    )

client = AzureOpenAI(
    api_version="2023-03-15-preview",
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint="https://softtech-openai-ynt.openai.azure.com",
)

In [None]:
def generate_embeddings(texts):
    """Return one embedding per input text, in input order.

    Every text is sent in its own ``client.embeddings.create`` request (a
    one-element ``input`` list) against the ``text-embedding-ada-002`` model,
    and the embedding of the first returned item is collected.

    Args:
        texts: iterable of strings to embed.

    Returns:
        A list with the ``embedding`` of each text.
    """
    vectors = []
    for text in texts:
        response = client.embeddings.create(input=[text], model="text-embedding-ada-002")
        vectors.append(response.data[0].embedding)
    return vectors


In [None]:
insert_chunks(chunks, generate_embeddings, collection)

In [None]:
# Flush and load the collection once the insert cell has run -- presumably so
# the newly inserted rows are persisted and visible to the search below
# (confirm against the pymilvus documentation).
collection.flush()
collection.load()

In [None]:
##################
# Query
##################
import textwrap

# Search parameters handed to collection.search() by search(): the same L2
# metric the index is created with, plus the "ef" search setting.
QUERY_PARAM = {
    "metric_type": "L2",
    "params": {"ef": 64},
}


def search(queries, top_k=5):
    """Run a vector similarity search for one or several query texts.

    The queries are embedded with ``generate_embeddings`` and searched against
    the ``embedding`` field of the module-level ``collection`` using
    ``QUERY_PARAM``.

    Args:
        queries: a single query or a list/tuple of queries. Anything that is
            not a list or tuple is treated as one query.
        top_k: maximum number of hits returned per query (default 5).

    Returns:
        Whatever ``collection.search`` returns; each hit carries the
        ``page_content`` output field.
    """
    # isinstance instead of an exact type comparison: list subclasses and
    # tuples are accepted as "several queries" as well.
    if not isinstance(queries, (list, tuple)):
        queries = [queries]
    return collection.search(
        generate_embeddings(list(queries)),
        anns_field='embedding',
        param=QUERY_PARAM,
        limit=top_k,
        output_fields=['page_content'],
    )

In [None]:
# Smoke test: run a single-query search for "Sonar" and print the raw result.
print(search("Sonar"))