In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain_huggingface import HuggingFaceEmbeddings # Importing Huggingface embeddings from Langchain
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint # Import Huggingface Chat models from Langchain 
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Import Chroma vector store from Langchain
# from langchain_chroma import Chroma # Import Chroma vector store from Langchain
from langchain_core.prompts import ChatPromptTemplate # Import Chat Prompt Template from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations
import getpass # Importing getpass module to transfer API Key

In [2]:
# Enter your API Key from HuggingFace
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your Hugging Face API key: ")

In [3]:
# Directory to pdf files:
DATA_PATH = 'data'
def load_documents():
    """
    Load PDF Docs from specified directory in DATA_PATH
    return:
    Loaded PDF represented as Langchain Document objects
    """

    # Initialize PDF loader with specified directory
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    # Load PDF Docs and return them as a list of Document objects
    return document_loader.load()

documents = load_documents()
# Inspect contents of the first document as well as the metadata
print(documents[0])

page_content='Einsatz von kleinen Sprachmodellen zur
Detektion von böswilligen Prompts
Ciro Vincenzo Cascone
Bachelor-Thesis
zur Erlangung des akademischen Grades Bachelor of Science (B.Sc.)
Studiengang Informatik
Fakultät für Informatik
Hochschule Mannheim
14.10.2024
Betreuer
Prof. Dr. Jörn Fischer, Hochschule Mannheim
Prof. Dr. rer. nat. Kai Eckert, Hochschule Mannheim
' metadata={'source': 'data\\Bachelorarbeit_CiroCascone_2023392_IB8.pdf', 'page': 0, 'page_label': ''}


In [4]:
def split_text_into_chunks(documents: list[Document]):
    """
    Split text content of given list of Documents into smaller chunks
    args:
    document (list[Document]): List of Document objects containing text content
    return:
    list[Document]: List of Document objects representing the split chunks
    """

    # Initialize text splitter with following parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300, # Size of each chunk in characters
        chunk_overlap=100, # Overlap between consecutive chunks
        length_function=len, # Function to compute length of given text
        add_start_index=True # Flag to add start index to each chunk
    )

    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Print example of page content and metadata for a chunk
    document = chunks[0]
    print(f"Example of chunk: \n {document.page_content} \n \n {document.metadata}")

    return chunks

In [5]:
# Path to directory to save Chroma database
CHROMA_PATH = "chroma"

def save_to_chroma(chunks: list[Document]):
    """
    Save a given list of Document objects to the Chroma database.
    args:
    chunks (list[Document]): List of Document objects representing text chunks to save
    returns:
    None
    """

    # Clear out database directory if it already exists
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create new Chroma database from the documents provided
    db = Chroma.from_documents(
        chunks,
        HuggingFaceEmbeddings(),
        persist_directory = CHROMA_PATH
    )

    # Persist the database to disk
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [15]:
def generate_vector_database():
    """
    Function to generate vector database in Chroma from provided documents
    """
    documents = load_documents()
    chunks = split_text_into_chunks(documents)
    save_to_chroma(chunks)

# Load environment variables from .env file
load_dotenv()
# Generate the vector database
generate_vector_database()

Split 72 documents into 698 chunks
Example of chunk: 
 Einsatz von kleinen Sprachmodellen zur
Detektion von böswilligen Prompts
Ciro Vincenzo Cascone
Bachelor-Thesis
zur Erlangung des akademischen Grades Bachelor of Science (B.Sc.)
Studiengang Informatik
Fakultät für Informatik
Hochschule Mannheim
14.10.2024
Betreuer 
 
 {'source': 'data\\Bachelorarbeit_CiroCascone_2023392_IB8.pdf', 'page': 0, 'page_label': '', 'start_index': 0}
Saved 698 chunks to chroma.


  db.persist()


In [6]:
print("Enter prompt: ")
query_text = input()

Enter prompt: 


In [7]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - - 
Answer the question based on the above context: {question}
"""

In [8]:
def query_rag(query_text):
    """
    Query a RAG system using Chroma database and HuggingFace
    args:
    query_text(str): The text to query the RAG system with
    returns:
    formatted_response(str): Formatted response including the generated text
    response_text(str): The generated response text
    """
    # Use the same embedding function as before when creating a Chroma database
    embedding_function = HuggingFaceEmbeddings()

    # Prepare database
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Retrieving the context from the DB using similiarity search
    results = db.similarity_search_with_relevance_scores(query_text, k=3)

    # Combine context from matching documents
    context_text = "\n\n - - \n\n".join([doc.page_content for doc, _score in results])

    # Create prompt template using context and query text
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Initialize HuggingFace chat model deepseek
    llm = HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-4k-instruct",
    task="text-generation",
    max_new_tokens=1024,
    do_sample=False,
    repetition_penalty=1.03,
    )

    model = ChatHuggingFace(llm=llm)

    # Generate response text from given chat model based on the prompt
    response_text = model.predict(prompt)

    # Get sources of the matching documents
    sources = [doc.metadata.get("source", None) for doc, _score in results]

    # Format and return response including generated text and sources
    formatted_response = f"Response: {response_text}\nSources: {sources}"

    return formatted_response, response_text

formatted_response, response_text = query_rag(query_text)
print(response_text)



  from .autonotebook import tqdm as notebook_tqdm
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
  response_text = model.predict(prompt)


Bruder, ein neuronales Netz, kurz auch ein neuronales Netzwerk, ist eine Art künstliche Intelligenz, die Fähigkeiten wie Lernen und Entscheidungsprozesse nachahmen kann. Im Kontext der überarbeiteten Fragestellung ist ein neuronales Netzwerk eine Methode im Bereich der künstlichen Intelligenz, welche dazu genutzt wird, Muster in Daten zu erkennen und Vorhersagen oder Entscheidungen aus diesen Mustern zu treffen.

Neuronale Netze ähneln dem menschlichen Gehirn in ihrer Struktur. Sie bestehen aus mehreren Verarbeitungsschichten - sogenannten Neuronen -, die Informationen durchgehen. Dieser Datenfluss ist einfacher als in lebenden Organismen, weil es sich um eine technische Imitation bezüglich Funktionsweise handelt und keine komplexen neuronalen Prozesse simuliert wird.

Im Steam von überwachtem Lernen, wie erwähnt in der gegebenen Zusammenfassung, werden neuronale Netze trainiert. Dies bedeutet, dass man mit markierten Datensätzen zugunsten einer bestimmten Zielfunktion lernt – beispiel