# Import libraries

In [1]:
import pandas as pd
import chromadb
import re
import json
from langchain.embeddings import VertexAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema.vectorstore import VectorStoreRetriever
from vertexai.language_models import TextGenerationModel
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser

2024-05-14 16:35:05.216046: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Read files

In [2]:
extracted_df = pd.read_csv('chunked_data.csv')
extracted_df = extracted_df.drop(['Unnamed: 0'],axis=1)

In [3]:
extracted_df.head(1)

Unnamed: 0,chunks,len,ref_link
0,ai-driven security operations (soc) | fortinet...,499,{'source': 'https://www.fortinet.com/fortiguar...


In [4]:
# extracted_df.describe([x/10 for x in range(0,10)])

In [5]:
docs = extracted_df['chunks']
metadatas = extracted_df['ref_link']
metadatas_dict=[]
for i in  metadatas:
    i = i.replace("'", "\"")
    dict_obj = json.loads(i)
    metadatas_dict.append(dict_obj)
len(docs), len(metadatas)

(147083, 147083)

In [6]:
docs = docs[:5000]
metadatas_dict = metadatas_dict[:5000]
collection_name = "QnA_5000_char"

# Create embeddings

In [7]:
client = chromadb.PersistentClient(path="Chroma")
try:
    collection = client.create_collection(name=collection_name)
except:
    client.delete_collection(name = collection_name)
    collection = client.create_collection(name=collection_name)
    
embeddings = VertexAIEmbeddings()
new_vector_store = Chroma.from_texts(texts = list(docs),
                            embedding = embeddings,
                            metadatas=metadatas_dict,
                            collection_name=collection_name,
                            ids=[str(x) for x in range(len(docs))],
                            persist_directory='Chroma/', client = client)
print('DONE')

DONE


In [8]:
# question = 'Which is the operating system using which Fortinet delivers centralized investigation and remediation?'
# response = new_vector_store.search(query = question,search_type = "similarity")

# Verify the collection

In [9]:
# client.delete_collection(name = "QnA_embedding_500_char_chunk")

In [10]:
client = chromadb.PersistentClient(path="Chroma")
emb_fn = VertexAIEmbeddings()
new_vector_store = Chroma(embedding_function=emb_fn)
collection = client.get_collection(name='QnA_5000_char', embedding_function=emb_fn)
print(len(collection.get()['ids']))
print()
client.list_collections()

5000



[Collection(name=QnA_5000_char), Collection(name=QnA_4000_char)]

In [11]:
client.list_collections()

[Collection(name=QnA_5000_char), Collection(name=QnA_4000_char)]

# Get embeddings

In [13]:
questions = ["Which is the operating system using which Fortinet delivers centralized investigation?",
"What are the benfits/Features of Fortinet's Security Operations Solution?",
"Which is the competitor of Fortinet which provides similar solution as Fortinet's Security Operations Solution?",
"Which are the products in which FortAI has been integrated?",
"Talk about FortiAI for FortiSIEM",
"What is Fortnite",
"Give FortAI's benefits",
"What is FortiGuard URL Filtering Service?",
"What is not a Use case of Fortiguard URL Filtering Service?",
"Does Fortiguard URL Filtering Service help in blocking malicious downloads?"]

In [14]:
from vertexai.language_models import TextEmbeddingModel
def text_embedding(data) -> list:
    """Text embedding with a Large Language Model."""
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
    embeddings = model.get_embeddings(data)
    for embedding in embeddings:
        vector = embedding.values
    return vector

def retreive_module(question,number_of_documents):
        client = chromadb.PersistentClient(path="Chroma")
        emb_fn = VertexAIEmbeddings()
        collection = client.get_collection(name=collection_name, embedding_function=emb_fn)
        retrieved_docs = collection.query(query_embeddings=text_embedding([question]), n_results=number_of_documents)
        return retrieved_docs

In [19]:
docs_content = retreive_module(questions[0],number_of_documents=5)
docs_content['documents']

[['the common operating system for all fortigate advanced network and security functions spanning on-premises and cloud environments. and because every solution running on the fortigate platform runs on fortios, they are not simply integrated. they are the same product., "this common codebase enables true convergence across all solutions and form factors (appliance, virtual machine, container, and cloud). and fortinets unique asic acceleration, designed for both physical and virtual devices',
  'the fortinet security fabric platform, fortinet supports customers with a platform approach to cybersecurity via the fortinet security fabric, which converges networking and security through one operating system (fortios), one unified agent (forticlient), one management console (fortimanager), and one data lake (fortianalyzer) to integrate and protect the entire digital attack surface. it’s focused on three major enterprise pillars: secure networking, unified sase',
  'network security solution

In [26]:
# Initialize the VertexAI model
def generate_response_from_docs(docs_content,question,chat_history = ""):
    model = VertexAI(model_name="text-bison-32k", max_output_tokens=1024, temperature=0.2)
    query_text = """
You are expert at answering questions regarding  Fortinet.
Your role is to give to-the-point and crisp answer to the user question in a polite and professional manner.

Use the below context and chat history to answer the question, if the questions is irrelevant , say "i dont know, I am not aware it"
"""
    prompt = query_text

    template = f"""
    Task:
    {prompt}

    conversation_history:
    {chat_history}

    Context: 
    {docs_content}

    Question: {question}
    Answer:
    """
    prompt_template = PromptTemplate(
        input_variables=["content"],
        template = template)
    print(template
    # Prepare the input for the model
    input_content = {"content": docs_content}
    # Use the prompt template to generate a response
    response = prompt_template | model | StrOutputParser()
    return response.invoke(input_content)


In [25]:
responses = []
for question in questions:
    docs_content = retreive_module(question,number_of_documents=5)
    ids = docs_content['ids']
    distances = docs_content['distances'][0]
    metadatas = docs_content['metadatas']
    documents = docs_content['documents']
    print(len(documents[0]))
    response = generate_response_from_docs('\n'.join(documents[0]),question)
    print(question)
    responses.append(response)
    print(response)
    print("="*100)
    print()
    break


5


NameError: name 'template' is not defined

In [17]:
import pandas as pd
df = pd.DataFrame()
df['questions']=questions
df['responses']=responses
df.to_csv('predict.csv')

In [18]:
df

Unnamed: 0,questions,responses
0,Which is the operating system using which Fort...,FortiOS is the operating system used by Forti...
1,What are the benfits/Features of Fortinet's Se...,The benefits of Fortinet's Security Operation...
2,Which is the competitor of Fortinet which prov...,I do not have information about Fortinet's co...
3,Which are the products in which FortAI has bee...,The products in which FortAI has been integra...
4,Talk about FortiAI for FortiSIEM,FortiAI is a unique AI assistant that leverag...
5,What is Fortnite,"I'm sorry, but the context provided does not ..."
6,Give FortAI's benefits,"FortiAI offers several benefits, including:\n..."
7,What is FortiGuard URL Filtering Service?,FortiGuard URL Filtering Service provides com...
8,What is not a Use case of Fortiguard URL Filte...,DNS prevention across the fabric is not a use...
9,Does Fortiguard URL Filtering Service help in ...,"Yes, Fortiguard URL Filtering Service helps i..."
