In [1]:
# Step 1: Set Up the Environment
# Install required libraries for LangChain, vector stores, and Hugging Face integration
!pip install -U langchain langchain-core langchain-huggingface langchain_community faiss-cpu huggingface_hub

# Import necessary modules
# Standard library
import os
import textwrap
from getpass import getpass
from typing import List

# Third-party
import requests  # Used implicitly in HuggingFaceCustomEmbeddings
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint



In [2]:
# Step 2: Define the Scenario
# This section provides a futuristic cyberpunk case study about a ransomware attack investigation.
# The document serves as a knowledge base for answering questions about cybercrime and forensics.

# Possible Questions the Paragraph Can Answer:
# 1. What type of cyberattack did Detective Y investigate?
    # (Expected Response: Ransomware attack)
# 2. What was the victim's profession?
    # (Expected Response: Robotics engineer)
# 3. Where was the remote server located that ultimately led to the perpetrator's arrest?
    # (Expected Response: Abandoned industrial sector of city X)

scenario_text = """
The neon lights of X shimmered, reflecting off the sleek
cybernetic implants of its citizens. Detective Y, however,
saw little of the city's beauty as he hunched over a
holographic display, a frown etched on his face. He was
facing a digital enigma: a ransomware attack unlike any
he'd encountered before. The victim, a renowned robotics
engineer named Z, reported that all his research data,
years of work on a groundbreaking AI-powered prosthetic
limb, had been encrypted. The perpetrator, a shadowy
entity calling themselves The Serpent, demanded an
exorbitant ransom in untraceable cryptocurrency. Y, a
veteran of the Cyber Crimes Division, knew that time was
of the essence. Z's research was not only invaluable
scientifically but also held the potential to revolutionize
prosthetics for millions. But the initial investigation
yielded little. The Serpent had left no digital footprints,
employing advanced encryption and anonymization techniques
to mask their identity and location. Y, however, was not
one to be easily deterred. He understood the power of
expanding the knowledge base. He requested and received
access to Z's entire digital life – his personal computers,
lab servers, cloud storage, even his smart home devices.
Y's team, equipped with cutting-edge forensic tools, began
their meticulous analysis. They reconstructed deleted
files, analyzed network traffic logs, and even delved into
the firmware of Z's smart appliances, searching for any
hidden data or unusual connections. They expanded their
search beyond Z's immediate digital sphere, examining
online forums, academic databases, and even dark web
marketplaces for any mention of the stolen research or
clues about The Serpent's identity. As the team dug deeper,
they discovered a seemingly unrelated incident: a minor
security breach at a local university's robotics lab a few
weeks prior. The breach, initially dismissed as a student
prank, involved the theft of a small, experimental AI
algorithm. Y's intuition flared. Could this be connected
to The Serpent's attack? Further investigation revealed a
startling connection. The stolen algorithm, while seemingly
insignificant on its own, was a crucial component in Z's
research. The Serpent, it seemed, had planned their attack
meticulously, acquiring the necessary tools before launching
their ransomware scheme. With this expanded knowledge base,
Y's team was able to trace The Serpent's digital trail.
They uncovered a hidden connection to a remote server
located in the abandoned industrial sector of X. A raid
on the location led to the arrest of a disgruntled former
student of Z's, seeking revenge for a perceived academic
slight. The case of The Serpent highlighted the crucial
role of expanding the knowledge base in digital forensics.
By connecting seemingly disparate pieces of information
and exploring every digital avenue, Y and his team were
able to bring a cybercriminal to justice and safeguard
groundbreaking research that held the promise of a better
future.
"""

# Report how long the scenario is, measured in characters
scenario_char_count = len(scenario_text)
print("Total characters in the scenario:", scenario_char_count)

Total characters in the scenario: 3011


In [3]:
# Step 3: Split the Text into Chunks
# Splitting parameters are named so they can be tuned in one place.
CHUNK_SIZE = 400     # Maximum size of each chunk in characters
CHUNK_OVERLAP = 50   # Overlap between chunks to preserve context

# Use RecursiveCharacterTextSplitter to divide the text into manageable chunks for processing
chunks = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ". ", " ", ""],  # Tried in order, coarsest separator first
    length_function=len,  # Use character count to measure chunk size
).split_text(scenario_text)

# Display chunk statistics
print(f"Total number of chunks: {len(chunks)}\n")
# Preview at most the first two chunks. zip() stops at the shorter input, so a text
# that produces fewer than two chunks is previewed as far as it goes instead of
# raising IndexError on chunks[1].
for ordinal, chunk in zip(("first", "second"), chunks):
    print(f"Length of {ordinal} chunk: {len(chunk)}\nContent of {ordinal} chunk: {chunk}\n")

Total number of chunks: 9

Length of first chunk: 392
Content of first chunk: The neon lights of X shimmered, reflecting off the sleek
cybernetic implants of its citizens. Detective Y, however,
saw little of the city's beauty as he hunched over a
holographic display, a frown etched on his face. He was
facing a digital enigma: a ransomware attack unlike any
he'd encountered before. The victim, a renowned robotics
engineer named Z, reported that all his research data,

Length of second chunk: 383
Content of second chunk: years of work on a groundbreaking AI-powered prosthetic
limb, had been encrypted. The perpetrator, a shadowy
entity calling themselves The Serpent, demanded an
exorbitant ransom in untraceable cryptocurrency. Y, a
veteran of the Cyber Crimes Division, knew that time was
of the essence. Z's research was not only invaluable
scientifically but also held the potential to revolutionize



In [4]:
# Step 4: Define Custom Embeddings Class
class HuggingFaceCustomEmbeddings(Embeddings):
    """Embeddings backend that calls a Hugging Face Inference API endpoint over HTTP.

    Each call POSTs the input texts as JSON to ``api_url`` with a bearer token and
    returns the vectors from the JSON response.
    """

    def __init__(self, api_url: str, api_token: str, timeout: float = 120.0):
        """Store the connection settings.

        Args:
            api_url: Full URL of the feature-extraction endpoint to POST to.
            api_token: Hugging Face access token, sent as a bearer token.
            timeout: Seconds to wait for the HTTP response before giving up.
                The request asks the API to wait for the model to load, so the
                default is deliberately generous.
        """
        self.api_url = api_url
        self.api_token = api_token
        self.timeout = timeout

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for multiple text documents.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding vector per input text, in the same order. An empty
            input yields an empty list without contacting the API.

        Raises:
            RuntimeError: If the API answers with a non-200 status, or the
                response is not a list with one entry per input text.
            requests.exceptions.RequestException: On connection errors or timeout.
        """
        texts = list(texts)
        if not texts:
            # Nothing to embed; avoid a pointless network round trip.
            return []

        headers = {"Authorization": f"Bearer {self.api_token}"}
        payload = {"inputs": texts, "options": {"wait_for_model": True}}
        response = requests.post(
            self.api_url, headers=headers, json=payload, timeout=self.timeout
        )

        if response.status_code != 200:
            # Fail loudly: returning [] here would only surface later as an
            # unrelated-looking IndexError in embed_query or the vector store.
            raise RuntimeError(
                f"Request failed with status code {response.status_code}. "
                f"Error: {response.text}"
            )

        vectors = response.json()
        if not isinstance(vectors, list) or len(vectors) != len(texts):
            raise RuntimeError(
                f"Expected {len(texts)} embeddings from the API but received: "
                f"{str(vectors)[:200]}"
            )
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Generate the embedding for a single query text.

        Delegates to ``embed_documents`` with a one-element list and returns
        the single resulting vector; errors propagate from that call.
        """
        return self.embed_documents([text])[0]

In [5]:
# Step 5: Set Up Embeddings and Vector Store
# Hugging Face Inference API URL of the sentence-embedding model
HF_API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/paraphrase-MiniLM-L6-v2"

# Apply for your Hugging Face access token here: https://huggingface.co/docs/hub/en/security-tokens
# A token looks like "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".
# Never paste the token into the notebook (it would be saved and shared with the file):
# it is read from the environment, with a hidden interactive prompt as a fallback.
api_token = (
    os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_TOKEN")
    or getpass("Hugging Face access token (input hidden): ")
).strip()
if not api_token:
    raise ValueError("A Hugging Face access token is required to call the Inference API.")

# Create embedding function instance
embedding_function = HuggingFaceCustomEmbeddings(api_url=HF_API_URL, api_token=api_token)

# Generate embeddings for the text chunks
embeddings = embedding_function.embed_documents(chunks)

# Every chunk needs exactly one vector before the pairs are indexed in the next step
if len(embeddings) != len(chunks):
    raise RuntimeError(
        f"Expected {len(chunks)} embeddings but received {len(embeddings)}; "
        "check the API URL and access token."
    )
print(f"Size of embeddings for first chunk: {len(embeddings[0])}")
print(f"First 3 values of first embedding: {embeddings[0][0:3]}")

Size of embeddings for first chunk: 384
First 3 values of first embedding: [-0.48097720742225647, 0.3812538683414459, -0.2985522449016571]


In [6]:
# Step 6: Set and Test the Vector Store
# Pair every chunk with its precomputed vector and index the pairs in FAISS.
# The embedding function is handed to the store so queries can be embedded at search time.
vectorstore = FAISS.from_embeddings(
    embedding=embedding_function,
    text_embeddings=[
        (chunk_text, vector) for chunk_text, vector in zip(chunks, embeddings)
    ],
)

# Sanity check: fetch the single closest chunk for a sample question
query = "What type of cyberattack did Detective Y investigate?"
results = vectorstore.similarity_search(query=query, k=1)
print(f"Most similar chunk:\n{results[0].page_content}")

Most similar chunk:
The neon lights of X shimmered, reflecting off the sleek
cybernetic implants of its citizens. Detective Y, however,
saw little of the city's beauty as he hunched over a
holographic display, a frown etched on his face. He was
facing a digital enigma: a ransomware attack unlike any
he'd encountered before. The victim, a renowned robotics
engineer named Z, reported that all his research data,


### Explanation:
When you search (vectorstore.similarity_search), the query text is embedded using embedding_function.embed_query, and FAISS finds the closest stored embeddings.

*   `embed_query`: embeds a single text (used to embed the query during search).
*   `k=1`: return only the single most similar chunk.

Retrieval alone (via FAISS) returns raw chunks, which may be fragmented or verbose, potentially including multiple sentences unrelated to the query’s focus. This is because retrieval matches text purely by embedding similarity:
*   it doesn’t “understand” the question or rephrase the answer.

### Solution: Use LLM

*    LLMs excel at natural language comprehension and generation


### Role of LLM:

The LLM interprets the query and retrieved chunks, extracting key details (e.g., "ransomware attack") even if the raw chunks don’t directly phrase it as "type of cyberattack." This showcases the LLM’s ability to comprehend and rephrase naturally, beyond raw similarity matching.

In [7]:
# Step 7: Import Libraries and Suppress Warnings
# Ignore FutureWarning messages attributed to huggingface_hub.utils._deprecation
# before importing the Hugging Face integration, then import what the chain needs.
import warnings

warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.utils._deprecation",
)

import textwrap
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEndpoint

# Step 8: Initialize the Language Model
# Text-generation endpoint for the Mistral instruct model on Hugging Face
llm = HuggingFaceEndpoint(
    task="text-generation",                        # Task type for the model
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",  # Model repository ID
    temperature=0.1,                               # Low temperature for near-deterministic output
    huggingfacehub_api_token=api_token,            # Access token (defined in Step 5)
)

# Step 9: Create the Retrieval-Augmented Generation (RAG) Chain
# The retriever supplies the context chunks; the LLM turns them into an answer.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",                            # Put all retrieved documents into one prompt
    retriever=vectorstore.as_retriever(            # Retriever view over the FAISS store from Step 6
        search_kwargs={"k": 4},                    # Fetch the 4 most similar chunks
        search_type="similarity",                  # Rank by embedding similarity
    ),
    llm=llm,                                       # Model that reads the chunks and writes the answer
)

In [8]:
# Step 10: Execute Query and Display Response
# Send a question through the RAG chain; the answer text sits under the "result" key
query = "What type of cyberattack did Detective Y investigate?"
response = qa_chain.invoke(query)
# Re-flow the answer to at most 160 columns so long answers print cleanly
wrapped_response = textwrap.fill(text=response["result"], width=160)
print(f"\n🔵 RAG's Answer: {wrapped_response}\n")


🔵 RAG's Answer:  Detective Y investigated a ransomware attack.



### Observation
Without the LLM step, you’d just get raw retrieved chunks, which might be fragmented or verbose (e.g., multiple sentences from your story).

This is because Retrieval alone (via FAISS) finds similar text based on embeddings, but it doesn’t “understand” the question or rephrase the answer—LLMs excel at natural language comprehension and generation.

### Role of LLM
The LLM interprets the query (“What type of cyberattack...”) and the retrieved context, extracting the relevant detail (“ransomware attack”) even if the chunks don’t explicitly say “type of cyberattack.”

In [9]:
# Step 11: Inspect the Raw Chain Output
# The LLM and the RetrievalQA chain were already built in Steps 8-9. Rebuilding identical
# copies here only shadowed them and created a second endpoint client, so this cell
# reuses `qa_chain` and keeps its one distinct purpose: showing the unformatted response.
query = "What type of cyberattack did Detective Y investigate?"
response = qa_chain.invoke(query)

# Print the whole response dict (both the 'query' and 'result' keys), not just the answer text
print(f"\n🔵 RAG's Answer: {response}\n")


🔵 RAG's Answer: {'query': 'What type of cyberattack did Detective Y investigate?', 'result': ' Detective Y investigated a ransomware attack.'}



# **TEST QUESTIONS:**

## In-Text Questions:

1. What type of cyberattack did Detective Y investigate? (Expected Response: Ransomware attack)
2. What was the victim's profession? (Expected Response: Robotics engineer)
3. Where was the remote server located that ultimately led to the perpetrator's arrest? (Expected Response: Abandoned industrial sector of city X)


## Out-of-Text Questions:
1. What specific encryption algorithm did The Serpent use to encrypt the research data? (Expected Response: The story doesn't mention the specific algorithm.)
2. What was the name of the university where the minor security breach occurred? (Expected Response: The story doesn't mention the university's name.)
3. Did Detective Y's team collaborate with any external cybersecurity experts or organizations during the investigation? (Expected Response: The story doesn't mention any external collaboration.)