In [1]:
# Notebook-specific setup: apply nest_asyncio before any async work is done here,
# to prevent runtime errors with async operations.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop — confirm against the nest_asyncio documentation.
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Vector database: open a client against the Qdrant server at localhost:6333.

import qdrant_client

# Name of the Qdrant collection that holds this notebook's vectors.
collection_name = "demo1"

client = qdrant_client.QdrantClient(host="localhost", port=6333)

In [3]:
# Document loading: collect the PDF files found under the current directory.

from llama_index.core import SimpleDirectoryReader

input_dir_path = './'

# required_exts restricts the reader to PDFs; recursive=True also searches sub-folders.
loader = SimpleDirectoryReader(input_dir=input_dir_path, required_exts=[".pdf"], recursive=True)
docs = loader.load_data()

In [4]:
# Inspect the first loaded document to sanity-check what the reader extracted.
# NOTE(review): the displayed repr includes metadata such as the absolute local
# `file_path`; clear or trim this cell's output before sharing the notebook.
docs[0]

Document(id_='abdab856-65a8-4686-b6e3-d6393e076e97', embedding=None, metadata={'page_label': '1', 'file_name': 'fraud_scenarios.pdf', 'file_path': 'D:\\Personal Folder\\Vijender Project\\cyber-fraud-detection-chatbot\\fraud_scenarios.pdf', 'file_type': 'application/pdf', 'file_size': 45982, 'creation_date': '2025-02-10', 'last_modified_date': '2025-02-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="Scenario  1:  I  received  an  email  stating  that  I  won  a  lottery.  I  am  being  asked  to  provide  \ndocuments.\n Remediation:  Do  not  respond  to  the  email  or  share  any  documents.  This  is  a  classic  lottery  \nscam\n 

In [5]:
"""Creating index using new documents"""

from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Embedding model used to vectorise both the document chunks and the queries.
# `trust_remote_code=True` was dropped: it lets code shipped in the model repository
# run locally, and this standard checkpoint does not need it to load.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Node parser: decides how documents are cut into chunks before embedding
# (chunk_size=512 with chunk_overlap=128 shared between neighbouring chunks).
node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=128)

# Register both as llama_index defaults via the global Settings object.
Settings.embed_model = embed_model
Settings.node_parser = node_parser

# Build a similarity-search index from documents, storing the vectors in Qdrant.
def create_index(documents):
    """Index `documents` into the Qdrant collection and return the resulting index.

    Relies on the notebook-level `client` and `collection_name` to reach the collection.

    Args:
        documents: the documents handed to `VectorStoreIndex.from_documents`.

    Returns:
        The index object produced by `VectorStoreIndex.from_documents`.
    """
    # NOTE(review): running this again against an already populated collection
    # presumably inserts the same chunks a second time — confirm, and prefer loading
    # the index from the existing collection in that case.
    qdrant_store = QdrantVectorStore(client=client, collection_name=collection_name)
    storage = StorageContext.from_defaults(vector_store=qdrant_store)
    return VectorStoreIndex.from_documents(documents, storage_context=storage)


index = create_index(docs)


In [6]:
"""creating index using existing collection"""
"""
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

# initializing embeding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",
                                  trust_remote_code=True)

# setting up embeding model and node_parser
Settings.embed_model = embed_model


# creating index for similarity search from existing collection

def index_from_vector_store(vector_store):
    
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store                                      
                                           )
    return index


vector_store = QdrantVectorStore(client=client, collection_name=collection_name)    
index = index_from_vector_store(vector_store)
"""

'\nfrom llama_index.core import Settings, VectorStoreIndex\nfrom llama_index.embeddings.huggingface import HuggingFaceEmbedding\nfrom llama_index.vector_stores.qdrant import QdrantVectorStore\n\n# initializing embeding model\nembed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",\n                                  trust_remote_code=True)\n\n# setting up embeding model and node_parser\nSettings.embed_model = embed_model\n\n\n# creating index for similarity search from existing collection\n\ndef index_from_vector_store(vector_store):\n    \n    index = VectorStoreIndex.from_vector_store(vector_store=vector_store                                      \n                                           )\n    return index\n\n\nvector_store = QdrantVectorStore(client=client, collection_name=collection_name)    \nindex = index_from_vector_store(vector_store)\n'

In [7]:
# LLM: answers are generated by the "llama3.2:1b" model served through Ollama.

from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="llama3.2:1b",
    request_timeout=120.0,  # presumably seconds — confirm against the Ollama integration docs
)

# Make it the default LLM through the llama_index Settings object.
Settings.llm = llm

In [8]:
# setting up query engine

# from llama_index.core import get_response_synthesizer
from llama_index.core.postprocessor import SentenceTransformerRerank

# Second stage: rerank the initially retrieved chunks by relevancy, with top_n=3.
rerank = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3)

# First stage: retrieve the 10 most similar chunks, then pass them through the reranker.
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])

In [9]:
# Prompt: wrap the retrieved context and the user query in a fixed answer format.

from llama_index.core import PromptTemplate

# {context_str} and {query_str} are the two placeholders left in the template text.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Based on the context above, analyze the query and provide the response in the following format:
              
              Scenario: [Describe the situation from matching context]
              Remediation: [Provide specific prevention/remediation steps]
              Points of contact: [List relevant contact information/helplines]
              
              If no relevant information is found in the context, respond with "No matching scenario found."
              
              Query: {query_str}
              
              Response:"""

qa_prompt_tmpl = PromptTemplate(template)

# Swap the query engine's text-QA prompt for the template defined above.
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

In [10]:
# Run one sample question through the query engine and keep the result for later cells.
response = query_engine.query(
    "Received job offer requiring payment for registration/training. "
    "What are the points of Contact?"
)

In [11]:
from IPython.display import Markdown, display

# Render the answer text as Markdown instead of showing the raw response object.
display(Markdown(data=str(response)))

Scenario: Received job offer requiring payment for registration/training.

Remediation: Legitimate employers don't ask for money. Research company thoroughly before proceeding.

Points of contact: Report to Ministry of Labour & Employment portal (labour.gov.in)

In [12]:
# Show which retrieved chunks a response was built from.
def print_source_nodes(response):
    """Print the text of every source node attached to `response`.

    Args:
        response: object exposing a `source_nodes` iterable whose items
            each have a `text` attribute.

    Returns:
        None. The chunks are written to standard output, numbered from 1.
    """
    print("\nRetrieved chunks:")
    for position, chunk in enumerate(response.source_nodes, start=1):
        print(f"\nChunk {position}:")
        print(chunk.text)

# List the chunks behind the `response` produced by the query cell.
print_source_nodes(response)


Retrieved chunks:

Chunk 1:
Remediation:  Avoid  unrealistic  investment  schemes.  Verify  with  SEBI  registered  advisors.  Points  of  contact:  SEBI  toll  free  helpline  1800  22  7575   Scenario  5:  Someone  called  pretending  to  be  tech  support  saying  my  computer  is  infected.  Remediation:  Don't  allow  remote  access  to  your  computer.  Legitimate  tech  support  won't  call  
unsolicited.
 Points  of  contact:  File  FIR  at  local  police  station  and  report  to  cert-in.org.in   Scenario  6:  Received  job  offer  requiring  payment  for  registration/training.  Remediation:  Legitimate  employers  don't  ask  for  money.  Research  company  thoroughly  before  
proceeding.
 Points  of  contact:  Report  to  Ministry  of  Labour  &  Employment  portal  (labour.gov.in)   Scenario  7:  Dating  profile  asking  to  transfer  money  for  emergency/travel  expenses.  Remediation:  Never  send  money  to  online  romantic  interests.  These  are  romance  scams. 