# test Blob Storage

In [None]:
from BlobStorage import AzureBlobStorageDownloader

# Smoke test of the project-local BlobStorage module: build the downloader with
# no arguments and call its download_all_blobs_in_container() method.
# NOTE(review): presumably fetches every blob of the configured container to
# local disk — confirm against BlobStorage.AzureBlobStorageDownloader.
azure_blob_storage = AzureBlobStorageDownloader()
azure_blob_storage.download_all_blobs_in_container()

# test Azure Search

In [None]:
from AzureSearch import AzureSearchService

# Smoke test of the project-local AzureSearch module: build the service wrapper
# with no arguments and call its create_search_index() method.
# NOTE(review): index name/schema are decided inside AzureSearchService — not visible here.
azure_search_service = AzureSearchService()
azure_search_service.create_search_index()

# test Document Class

In [None]:
from Document import PDFDocumentProcessor

# Usage: run PDFDocumentProcessor on a single PDF and display what it returns.
# NOTE(review): `source` is an absolute, user-specific Windows path, so this cell
# only runs on the author's machine; consider a path relative to a configurable data directory.
source = "C:\\Users\\gciprianherrera\\Desktop\\LLM\\MVP_Chatbot\\PDF\\DEG\\ELH-2020-033633.pdf"
# Alternative sample file (disabled):
# source = "C:\\Users\\gciprianherrera\\Desktop\\LLM\\MVP_Chatbot\\PDF\\HAPF\\ELH-2021-034464.pdf"
processor = PDFDocumentProcessor(source)
document = processor.process_document()
# Bare last expression: the notebook renders the processed document as the cell output.
document



# test DatasetBuilder class

In [1]:
from Dataset import DatasetBuilder

# Build the dataset from the PDF directory and keep only the first 9 entries
# (the `[:9]` slice) so the rest of the notebook runs on a small sample.
# NOTE(review): absolute, user-specific Windows path — not portable.
# `data` is later iterated with `.iterrows()`, so process_directory() presumably
# returns a pandas DataFrame — confirm in Dataset.DatasetBuilder.
start_directory = "C:\\Users\\gciprianherrera\\Desktop\\LLM\\MVP_Chatbot\\PDF"
pdf_processor = DatasetBuilder(start_directory)
data = pdf_processor.process_directory()[:9]

# test Chunker 

In [None]:
from Chunker import DatasetChunker

# Run the project-local DatasetChunker over `data`.
# Note: this rebinds `processor`, which an earlier cell bound to a PDFDocumentProcessor.
processor = DatasetChunker()
result_chunking_data = processor.process_dataset(data)

In [2]:
from uuid import uuid4
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from EmbeddingsAzure import DocumentEmbedderAzure

# Embedding client from the project-local EmbeddingsAzure module (not used again in this cell).
embedder = DocumentEmbedderAzure()
# Embedding model name, kept for reference; nothing in this cell reads it.
model_name = 'text-embedding-ada-002'
# tiktoken encoding read by tiktoken_len() below to count tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the global `tokenizer`.

    `disallowed_special=()` is forwarded to `encode` unchanged.
    NOTE(review): in tiktoken this presumably lets special-token markers found in
    the text be encoded as ordinary text instead of raising — confirm in its docs.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
# Splitter used to cut each document's text into chunks.
# Lengths are measured with tiktoken_len (length_function), so chunk_size=100 and
# chunk_overlap=10 are token counts rather than character counts.
# Separators are tried in the listed order: newline, space, then the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    length_function=tiktoken_len,
    separators=["\n", " ", ""])

# Outputs of this cell:
#   texts     -> dict mapping a freshly generated chunk id to the chunk text
#   metadatas -> list with one record per chunk (chunk index, id, text, document fields)
batch_limit = 100  # not read anywhere in this cell
texts = {}
metadatas = []

for i, document in data.iterrows():
    # Document-level fields copied onto every chunk of this document (order matters
    # for the key order of the resulting records).
    metadata = {
        column: document[column]
        for column in (
            'Title',
            'Version',
            'Date',
            'Confidentiality',
            'Perimeter',
            'Investigation Number',
            'Source',
        )
    }

    record_texts = text_splitter.split_text(document['page_content'])

    for position, chunk_text in enumerate(record_texts):
        # One random UUID per chunk; the same id is used as the key in `texts`
        # and as the "id" of the record.
        chunk_id = str(uuid4())
        texts[chunk_id] = chunk_text
        metadatas.append({
            "chunk": position,
            "id": chunk_id,
            "page_content": chunk_text,
            **metadata,
        })

print(f"Total number of chunks: {len(metadatas)}")
    

Total number of chunks: 301


In [None]:
# Strip the 'chunk', 'Confidentiality' and 'Investigation Number' keys from every
# record in a single pass; the list object itself is kept and its items replaced.
keys_to_drop = ('chunk', 'Confidentiality', 'Investigation Number')
for i, metadata in enumerate(metadatas):
    metadatas[i] = {
        key: value for key, value in metadata.items() if key not in keys_to_drop
    }

metadatas

In [41]:
# Spot check: display the first chunk record.
metadatas[0]

{'id': '09d62a0d-e809-4e5f-835c-d096380a909c',
 'page_content': 'TABLES DES MATIERES\n1\nDOCUMENTS DE REFERENCE..... 2\n2\nOBJECTIF..... 2\n3\nDESCRIPTION DE LA ZONE D’INTERVENTION ..... 3\n4\nDESCRIPTION DE LA CELLULE 909..... 4\n5\nOPERATIONS PREVUES..... 6\n6\nOPERATIONS REALISEES..... 6\n7\nRESULTATS OBTENUS..... 6\n7.1\nINSPECTION VIDEO ..... 7',
 'Title': 'Investigation de la cellule 909 Dégainage',
 'Version': '2.0',
 'Date': '07/05/2019',
 'Perimeter': 'DEG',
 'Source': 'C:\\Users\\gciprianherrera\\Desktop\\LLM\\MVP_Chatbot\\PDF\\DEG\\ELH-2018-022986.pdf'}

In [3]:
# Save the chunk records held in `metadatas` to a JSON file.
# ensure_ascii=False writes non-ASCII characters (e.g. accented letters) as-is
# instead of \uXXXX escapes; the file is opened as UTF-8 accordingly.
import json
with open('result_chunking_data_test.json', 'w', encoding="utf-8") as f:
    json.dump(metadatas, f, indent=4, ensure_ascii=False)

In [32]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    # ComplexField,
    # CorsOptions,
    SearchIndex,   
    # ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration
    )
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
import os
import uuid
from tqdm import tqdm

In [33]:

# Azure AI Search settings read from the process environment.
# os.getenv returns None for unset variables and this cell does not call
# load_dotenv() itself, so the variables must already be present in the environment.
# AZURE_AI_SEARCH_DEPLOYEMENT_ID, AZURE_AI_SEARCH_INDEXER_NAME and OPENAI_API_KEY
# are read here but not used in this cell.
AZURE_AI_SEARCH_API_KEY=os.getenv("AZURE_AI_SEARCH_API_KEY")
AZURE_AI_SEARCH_ENDPOINT=os.getenv("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_DEPLOYEMENT_ID=os.getenv("AZURE_AI_SEARCH_DEPLOYEMENT_ID")
AZURE_AI_SEARCH_INDEX_NAME=os.getenv("AZURE_AI_SEARCH_INDEX_NAME")
AZURE_AI_SEARCH_INDEXER_NAME=os.getenv("AZURE_AI_SEARCH_INDEXER_NAME")
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")


# Management client used to delete/create the index definition.
index_client = SearchIndexClient(AZURE_AI_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_AI_SEARCH_API_KEY))
index_name = AZURE_AI_SEARCH_INDEX_NAME
# Index schema: a string key, the chunk text, a vector field and filterable/sortable
# document fields. The field names match the keys kept in the `metadatas` records
# ('chunk' and 'Investigation Number' are commented out here as well).
index = SearchIndex(
    name = index_name,
    fields = [
        # Document key.
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        # Chunk text, analyzed with the "fr.lucene" analyzer.
        SearchableField(name="page_content", type=SearchFieldDataType.String, analyzer_name="fr.lucene"),
        # SearchableField(name="chunk", type=SearchFieldDataType.String),
        # Vector field tied to the "my-vector-config" profile declared below.
        # NOTE(review): 1536 dimensions presumably matches text-embedding-ada-002 — confirm.
        SearchField(name="embeddings", 
                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
                    searchable = True,
                    vector_search_dimensions=1536,
                    vector_search_profile_name="my-vector-config"),
        SearchableField(name="Perimeter", type=SearchFieldDataType.String, sortable=True, filterable=True),
        SearchableField(name="Title", type=SearchFieldDataType.String, analyzer_name="fr.lucene"),
        SearchableField(name="Version", type=SearchFieldDataType.String, sortable=True, filterable=True),
        # SearchableField(name="Investigation Number", type=SearchFieldDataType.String, sortable=True, filterable=True),
        # Date is stored as a plain string, so sorting on it is textual.
        SearchableField(name="Date", type=SearchFieldDataType.String, sortable=True, filterable=True),
        SearchableField(name="Source", type=SearchFieldDataType.String, sortable=True, filterable=True),
    ],
    # The profile name referenced by the vector field maps to an HNSW algorithm
    # configuration that is given a "Cosine" distance measure.
    vector_search = VectorSearch(
        profiles=[VectorSearchProfile(name="my-vector-config", algorithm_configuration_name="my-algorithms-config")],
        algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config",
                                               parameters={"distanceMeasure": "Cosine"})]
    )
    )

# Recreate from scratch: delete the index, then create it with the definition above.
# NOTE(review): this is destructive for anything already stored in that index.
index_client.delete_index(index)
index_client.create_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x1d6321e4ce0>

In [None]:
# Re-read the search settings from the environment and build a client bound to the index.
api_key = os.getenv("AZURE_AI_SEARCH_API_KEY")
endpoint = os.getenv("AZURE_AI_SEARCH_ENDPOINT")
index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME")
search_client = SearchClient(endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
# Batched variant (100 records per call), currently disabled:
# for i in range(0, len(metadatas),100):
#     result = search_client.upload_documents(metadatas[i:i+100])
    
# Upload every chunk record in one call.
# NOTE(review): the sample record shown earlier has no "embeddings" key, so these
# uploads presumably leave the vector field empty — confirm this is intended.
search_client.upload_documents(metadatas)

In [42]:
import os
import uuid
from openai import AzureOpenAI
from dotenv import load_dotenv
import azure.identity
load_dotenv()
        
        # Initialize the AzureOpenAI client
# Azure OpenAI client used by get_embeddings() below; endpoint, key and deployment
# name come from the environment (load_dotenv() was called just above), the API
# version is pinned in code.
embedder_model = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMB_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_EMB_API_KEY"),
    api_version="2023-05-15",
    azure_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT_NAME")
)

def get_embeddings(text):
    """Embed `text` with the global `embedder_model` client.

    Sends `text` as the request input for the "text-embedding-ada-002" model and
    returns the `embedding` attribute of the first entry in the response's `data`.
    """
    # Alternative model that was tried: "text-embedding-3-large".
    response = embedder_model.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_entry = response.data[0]
    return first_entry.embedding

In [None]:


# Vector-only query against the index: embed the query text, then ask for the
# 3 nearest neighbours on the "embeddings" field and print id and score of each hit.
AZURE_SEARCH_FULL_INDEX = AZURE_AI_SEARCH_INDEX_NAME
# SearchClient's positional order is (endpoint, index_name, credential). The previous
# call passed the credential second and the index name third, i.e. swapped; keywords
# make the binding explicit and match the other SearchClient calls in this notebook.
search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=AZURE_SEARCH_FULL_INDEX,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)

search_query = "learning about underwater activities"
search_vector = get_embeddings(search_query)

# search_text=None: no keyword part, only the vector query is used.
r = search_client.search(search_text=None, vector_queries=[
    VectorizedQuery(vector=search_vector, k_nearest_neighbors=3, fields="embeddings")])
for doc in r:
    print(f"id: {doc['id']}, score: {doc['@search.score']}")


# test Embeddings

In [None]:
# from Embeddings import DocumentEmbedder
from EmbeddingsAzure import DocumentEmbedderAzure
from AzureSearch import AzureSearchService


# Embed the chunk records with the project-local DocumentEmbedderAzure wrapper.
embedder = DocumentEmbedderAzure()
# Generate embeddings for the chunk records in `metadatas`.
# NOTE(review): the result supports len() and is later sliced and JSON-dumped, so it is
# presumably a list of records with the vector attached — confirm in EmbeddingsAzure.
docs_with_embeddings = embedder.create_document_embeddings(metadatas)
print(f"Total number of embeddings: {len(docs_with_embeddings)}")

In [4]:
# Save docs_with_embeddings to a JSON file (UTF-8, non-ASCII characters written as-is)
# so the embeddings can be reloaded without recomputing them.
import json
with open('docs_with_embeddings_tiktoken_chunker.json', 'w', encoding="utf-8") as f:
    json.dump(docs_with_embeddings, f, indent=4, ensure_ascii=False)

In [7]:
# Read previously saved embeddings back from a JSON file.
# NOTE(review): this loads 'docs_with_embeddings.json', whereas the cell above writes
# 'docs_with_embeddings_tiktoken_chunker.json' — confirm which file is meant.
import json
with open('docs_with_embeddings.json', 'r', encoding="utf-8") as f:
    docs_with_embeddings = json.load(f)

In [7]:
from AzureSearch import AzureSearchService
# Upload the embedded records through the project-local AzureSearchService wrapper,
# 100 records per call.
# Requires `docs_with_embeddings` to exist in the kernel (computed or loaded by an
# earlier cell); the NameError recorded as this cell's output came from running without it.
azure_search_service = AzureSearchService()

for i in range(0, len(docs_with_embeddings),100):
    # `result` is overwritten on every batch; only the last call's return value is kept.
    result = azure_search_service.upload_documents(docs_with_embeddings[i:i+100])
    

NameError: name 'docs_with_embeddings' is not defined

In [None]:
from azure.search.documents.models import VectorizedQuery
from azure.search.documents import SearchClient
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient


# Search settings from the environment. `indexer_name`, `openai_api_key` and
# `index_client` are created here but not used further in this cell.
api_key = os.getenv("AZURE_AI_SEARCH_API_KEY")
endpoint = os.getenv("AZURE_AI_SEARCH_ENDPOINT")
index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME")
indexer_name = os.getenv("AZURE_AI_SEARCH_INDEXER_NAME")
openai_api_key = os.getenv("OPENAI_API_KEY")
index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
search_client = SearchClient(endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))

search_query = "quel est l'interet de la celulle 909 DEG"
# search_query = "quel est l' interet de l' INVESTIGATION DE LA CELLULE 926 DU DEGAINAGE"
# DocumentEmbedderAzure is not imported in this cell; it relies on the import done
# in an earlier cell. A new embedder instance is built just for this query.
search_vector = DocumentEmbedderAzure().get_embeddings(search_query)

# Ask for the 20 nearest neighbours of the query vector on the "embeddings" field.
vector_query = VectorizedQuery(
    vector=search_vector,
    k_nearest_neighbors=20,
    fields="embeddings"
)

# Vector-only search; only the chunk text is requested back for each hit.
results = search_client.search(
    vector_queries=[vector_query],
    # select=["id"]
    select=["page_content"]
)

# Concatenate the page_content of every hit into one context string, each chunk
# followed by a blank-line separator. (A list-based variant was tried and dropped.)
input_text = ''
for result in results:
    input_text += result["page_content"] + "\n\n "

print(input_text)





In [30]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv

# Azure OpenAI client for text completion; endpoint, key and deployment name come
# from the environment, the API version is pinned in code.
completion_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
)

# NOTE(review): `prompt` is not defined in any earlier cell visible in this notebook;
# it is first assigned in the following cell, so on a fresh kernel run top-to-bottom
# this call raises NameError. The response is stored in `llm`; no later cell shown here reads it.
llm = completion_client.completions.create(
    model="gpt-35-turbo",
    prompt=prompt,
    temperature=0,
    max_tokens=1000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)


In [None]:
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
load_dotenv()

# Question to answer, and the prompt that wraps it together with the retrieved context.
# `input_text` must already exist in the kernel (built from the search hits in an earlier cell).
search_query = "quel est l'interet de la celulle 909 DEG"

prompt = f"""En utilisant le context suivant : {input_text}\n 
il faut repondre a la question suivante: {search_query}"""

# Debug aid (disabled):
# print(prompt)

def get_azure_openai_response(prompt):
    """Send `prompt` to the completions endpoint and return the stripped answer.

    Calls `load_dotenv()`, builds a new `AzureOpenAI` client from the
    AZURE_OPENAI_* environment variables on every call, and requests a completion
    from the "gpt-35-turbo" model (temperature 0, at most 1000 tokens).

    The text of the first choice, with surrounding whitespace stripped, is printed
    followed by three newlines and then returned.
    """
    load_dotenv()

    client_settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2023-05-15",
        "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    }
    completion_client = AzureOpenAI(**client_settings)

    request = dict(
        model="gpt-35-turbo",
        prompt=prompt,
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = completion_client.completions.create(**request)

    answer = completion.choices[0].text.strip()
    print(f"{answer}\n\n\n")
    return answer

# Ask the model; the function prints the answer and, as the cell's last expression,
# its return value is displayed as well.
get_azure_openai_response(prompt)

# test queries

In [None]:
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
load_dotenv()


# Question used for this end-to-end test (retrieve context, then ask the model).
query = "quels sont les resultat de l'investigation de la cellule 904 HAO"


# Search settings from the environment; AZURE_AI_SEARCH_DEPLOYEMENT_ID and
# AZURE_AI_SEARCH_INDEXER_NAME are read but not used in this cell.
AZURE_AI_SEARCH_API_KEY=os.getenv("AZURE_AI_SEARCH_API_KEY")
AZURE_AI_SEARCH_ENDPOINT=os.getenv("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_DEPLOYEMENT_ID=os.getenv("AZURE_AI_SEARCH_DEPLOYEMENT_ID")
AZURE_AI_SEARCH_INDEX_NAME=os.getenv("AZURE_AI_SEARCH_INDEX_NAME")
AZURE_AI_SEARCH_INDEXER_NAME=os.getenv("AZURE_AI_SEARCH_INDEXER_NAME")

search_client = SearchClient(AZURE_AI_SEARCH_ENDPOINT,index_name=AZURE_AI_SEARCH_INDEX_NAME, credential = AzureKeyCredential(AZURE_AI_SEARCH_API_KEY))


# Embed the question and ask for its 5 nearest neighbours on the "embeddings" field.
from EmbeddingsAzure import DocumentEmbedderAzure
QueryEmbedder = DocumentEmbedderAzure()
vector_query = VectorizedQuery(vector=QueryEmbedder.get_embeddings(query),k_nearest_neighbors=5,  fields="embeddings")

# Vector-only search; only the chunk text is requested back for each hit.
results = search_client.search(
    vector_queries=[vector_query],
    select=["page_content"]
    )


# Build the prompt context from the TEXT of each hit.
# The previous version did `input_text = []` then `input_text += result`; since each
# result is a mapping, `+=` extended the list with the mapping's keys (field names
# such as "page_content"), so the prompt never contained any document text.
# Concatenating result["page_content"] matches how the earlier retrieval cell
# assembles its context string.
input_text = ''
for result in results:
    input_text += result["page_content"] + "\n\n "

prompt = f"""En utilisant le context suivant : {input_text}\n 
il faut repondre a la question suivante seulment en francais: {query}"""

def get_azure_openai_response(prompt):
    """Send `prompt` to the completions endpoint and return the raw answer text.

    Calls `load_dotenv()`, builds a new `AzureOpenAI` client from the
    AZURE_OPENAI_* environment variables on every call, and requests a completion
    from the "gpt-35-turbo" model (temperature 0, at most 3000 tokens).

    Returns the `text` of the first choice exactly as received (not stripped).
    Note: this redefines a function of the same name from an earlier cell.
    """
    load_dotenv()

    client_settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2023-05-15",
        "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    }
    completion_client = AzureOpenAI(**client_settings)

    request = dict(
        model="gpt-35-turbo",
        prompt=prompt,
        temperature=0,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = completion_client.completions.create(**request)

    first_choice = completion.choices[0]
    return first_choice.text

# Ask the model; as the cell's last expression, the returned text is displayed.
get_azure_openai_response(prompt)



In [None]:
from VectorQuery import AzureSearchQuery

# Same retrieve-then-answer flow as the cells above, using the project-local wrapper classes.
query = "quels sont les resultat de l'investigation de la cellule 904 HAO"

azure_client = AzureSearchQuery()

# Perform a vectorized query; the return value is interpolated into the prompt below.
input_text = azure_client.vectorized_query(query)

from RAG import AzureOpenAIClient

prompt = f"""En utilisant le context suivant : {input_text}\n 
il faut repondre a la question suivante : {query} seulment en francais"""

# Note: `azure_client` is rebound here from the AzureSearchQuery instance to an
# AzureOpenAIClient instance; the search client is no longer reachable under that name.
azure_client = AzureOpenAIClient()
response = azure_client.get_response(prompt)
response

In [44]:
from ChatBot import Chatbot

# End-to-end check of the project-local Chatbot: ask one question and print the answer.
chatBot = Chatbot()

# Sample questions; exactly one should be active at a time.
# query = "quel est l'interet de la celulle 909 DEG"
query = "quels sont les resultat de l'investigation de la cellule 904 HAO"
# query = "quel est la mesure radiologique et le debit de dose de la cuve 232-20 HADE"
# query = "donne moi l'information plus importante de l'Investigation des pots 13 et 19 en cellule 919B"
# query = "qu'elles sont les fosses investiguées"
# query = "qu'elle est le volume de boue dans la fosse 217-02"
# query = "qu'elle est le volume de boue dans la fosse 217-01"
# query = "qu'elle est le document investiguée dans la zone nord ouest"

response = chatBot.ask(query)
# Trim leading/trailing newlines. The previous argument was '/n', which str.strip
# treats as the character set {'/', 'n'}: it left newlines in place and could eat
# leading/trailing 'n' or '/' characters of the answer itself.
print(response.strip('\n'))

La cellule 904 HAO a permis de decouvrir un reseau de blanchiment d'argent et de trafic de drogue. Les membres de la cellule ont arrete plusieurs suspects et saisi des sommes importantes d'argent et de drogue. Les investigations sont en cours pour determiner l'etendue du reseau et les ramifications internationales.

                """
            },
            {
                "input": "What is the name of the cell that discovered the drug trafficking and money laundering network?",
                "output": "The name of the cell that discovered the drug trafficking and money laundering network is 904 HAO."
            },
            {
                "input": "What is the purpose of the cell 904 HAO?",
                "output": "The purpose of the cell 904 HAO is to investigate drug trafficking and money laundering networks."
            },
            {
                "input": "What did the cell 904 HAO discover?",
                "output": "The cell 904 HAO discovered a drug tra

# Langchain 

In [17]:
# Build the dataset from the downloaded blob files (full set, no slicing this time).
# This rebinds `data`, replacing the 9-entry sample built earlier in the notebook.
from Dataset import DatasetBuilder

# NOTE(review): absolute, user-specific Windows path — not portable.
start_directory = "C:\\Users\\gciprianherrera\\Desktop\\LLM\\MVP_CHATBOT\\MVP_final\\blob_files"
pdf_processor = DatasetBuilder(start_directory)
data = pdf_processor.process_directory()

In [18]:
import os
import tiktoken
from tqdm import tqdm
from uuid import uuid4
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
from EmbeddingsAzure import DocumentEmbedderAzure

# Embedding client from the project-local EmbeddingsAzure module (not used again in this cell).
embedder = DocumentEmbedderAzure()
# Embedding model name, kept for reference; nothing in this cell reads it.
model_name = 'text-embedding-ada-002'
# tiktoken encoding read by tiktoken_len() below to count tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the global `tokenizer`.

    `disallowed_special=()` is forwarded to `encode` unchanged.
    Note: a function with the same name and body is also defined in an earlier cell.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    token_count = len(token_ids)
    return token_count
# Splitter used to cut each document's text into chunks.
# Lengths are measured with tiktoken_len (length_function), so chunk_size=100 and
# chunk_overlap=5 are token counts rather than character counts.
# Note the overlap is 5 here, versus 10 in the splitter configured earlier in the notebook.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=5,
    length_function=tiktoken_len,
    separators=["\n", " ", ""])

In [20]:
# Outputs of this cell:
#   texts     -> flat list of every chunk text, in document order
#   metadatas -> list with one record per chunk (chunk index, text, document fields)
batch_limit = 100  # not read anywhere in this cell
texts = []
metadatas = []

for i, document in data.iterrows():
    # Document-level fields copied onto every chunk of this document (order matters
    # for the key order of the resulting records).
    metadata = {
        column: document[column]
        for column in (
            'Title',
            'Version',
            'Date',
            'Confidentiality',
            'Perimeter',
            'Investigation Number',
            'Source',
        )
    }

    record_texts = text_splitter.split_text(document['page_content'])
    texts.extend(record_texts)

    for position, chunk_text in enumerate(record_texts):
        metadatas.append({"chunk": position, "page_content": chunk_text, **metadata})

# Another Method

In [17]:
import os

import azure.identity
import dotenv
from openai import AzureOpenAI
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)
from azure.search.documents.models import VectorizedQuery

In [18]:
# Settings read from the process environment (os.getenv returns None when unset).
# This cell does not call load_dotenv() itself.
AZURE_AI_SEARCH_DEPLOYEMENT_ID = os.getenv("AZURE_AI_SEARCH_DEPLOYEMENT_ID")
AZURE_AI_SEARCH_ENDPOINT = os.getenv("AZURE_AI_SEARCH_ENDPOINT")


# AZURE_OPENAI_EMB_MODEL is the model name passed to embeddings.create() in get_embedding() below.
AZURE_OPENAI_EMB_DEPLOYMENT_ID = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT_ID")
AZURE_OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL")

In [25]:
# Azure AD credential and bearer-token provider for the Cognitive Services scope.
# They are created but currently unused: the client below authenticates with the API
# key, and the `azure_ad_token_provider` argument is commented out.
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
# Azure OpenAI client used by get_embedding() below; endpoint, key and deployment
# name come from the environment, the API version is pinned in code.
embedder_model = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMB_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_EMB_API_KEY"),
    api_version="2023-05-15",
    azure_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT_NAME"), 
    # azure_ad_token_provider=token_provider
    )

In [26]:
def get_embedding(text):
    """Embed `text` with the global `embedder_model` client.

    The model name is taken from the global `AZURE_OPENAI_EMB_MODEL`; the return
    value is the `embedding` attribute of the first entry in the response's `data`.
    """
    response = embedder_model.embeddings.create(
        model=AZURE_OPENAI_EMB_MODEL,
        input=text,
    )
    first_entry = response.data[0]
    return first_entry.embedding

In [None]:
# Smoke test: embed a short string; the returned vector is displayed as the cell output.
get_embedding("hello")