### Import Statements

In [None]:
import requests
import fitz  # PyMuPDF
import re

### Data Retrieval
Download the PDF file(s) and extract their text.

In [None]:
# URL of the PDF that is passed to download_pdf() further down in this cell.
pdf_url = "https://bugs.python.org/file47781/Tutorial_EDIT.pdf"

def download_pdf(url, filename='temp.pdf', timeout=30):
    """Download the file at ``url`` and store it on disk.

    Args:
        url: Address of the PDF to fetch.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        The path the file was written to (``filename``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently saving an error page as a ".pdf".
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename

def pdf_to_text(filename):
    """Extract the text of every page of a PDF.

    Args:
        filename: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(filename) as doc:
        # join() avoids repeatedly re-copying a growing string.
        return "".join(page.get_text() for page in doc)

def clean_text(text):
    """Collapse every run of consecutive spaces into a single space.

    Only the space character is affected; tabs and newlines are left as-is.
    """
    collapsed = re.sub(r' {2,}', ' ', text)
    return collapsed

def split_text_into_chunks(text, num_chunks):
    """Split ``text`` into at most ``num_chunks`` chunks of whole words.

    Words are distributed as evenly as possible: when the word count is not
    divisible by ``num_chunks`` the leading chunks each receive one extra
    word. Chunks are never empty, so a text with fewer words than
    ``num_chunks`` yields one chunk per word and an empty text yields ``[]``.

    Args:
        text: The text to split; words are separated by any whitespace.
        num_chunks: Maximum number of chunks to produce (must be positive).

    Returns:
        A list of strings, each made of consecutive words joined by a
        single space.

    Raises:
        ValueError: If ``num_chunks`` is not positive.
    """
    if num_chunks <= 0:
        raise ValueError("num_chunks must be a positive integer")

    words = text.split()
    # `extra` leading chunks get `base + 1` words, the rest get `base`.
    base, extra = divmod(len(words), num_chunks)

    chunks = []
    start = 0
    for index in range(num_chunks):
        size = base + (1 if index < extra else 0)
        if size == 0:
            # Fewer words than requested chunks: nothing left to hand out.
            break
        chunks.append(' '.join(words[start:start + size]))
        start += size
    return chunks

# Run the pipeline: download the PDF, extract its text, then pass the text through clean_text().
pdf_file = download_pdf(pdf_url)
pdf_text = pdf_to_text(pdf_file)
clean_pdf_text = clean_text(pdf_text)

# Decide on the number of chunks you want
num_chunks = 5
chunks = split_text_into_chunks(clean_pdf_text, num_chunks)

# Preview the chunks (numbered from 1); process them further as needed.
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}: {chunk[:100]}...")  # Only the first 100 characters of each chunk are printed, for brevity


### Data Cleaning
Using the 7B instruct model

### Embedding Function
A custom embedding function for ChromaDB.

In [None]:
import chromadb as db  #This helps us work with the vectors database
from chromadb.utils import embedding_functions # This helps us fetch our embedding model
from chromadb import Documents, EmbeddingFunction, Embeddings
from mlx_lm import load, generate

class MyEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by a BERT model and its tokenizer.

    NOTE(review): ``load_model`` and ``mx`` are not imported in this cell
    (only ``load``/``generate`` from ``mlx_lm`` are). Presumably they are a
    BERT loader helper and ``mlx.core`` -- confirm and import them before
    instantiating this class.
    """

    def __init__(self):
        """Load the German BERT model and tokenizer from the local weights file."""
        self.model, self.tokenizer = load_model(
            "bert-base-german-cased", "weights/bert_german.npz"
        )

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the model's pooled output as nested lists.

        Args:
            input: The documents handed over by ChromaDB.

        Returns:
            ``pooled_output.tolist()`` for the tokenized batch.
        """
        # Pad/truncate so the whole batch can be stacked into rectangular arrays.
        encodings = self.tokenizer(input, return_tensors="np", padding=True, truncation=True)
        input_ids = mx.array(encodings["input_ids"])
        attention_mask = mx.array(encodings["attention_mask"])
        token_type_ids = mx.array(encodings["token_type_ids"])
        # Use the model stored on the instance: a bare `model` is not defined
        # in this scope and raised NameError.
        _sequence_output, pooled_output = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return pooled_output.tolist()

### Data Loading
Loading the data into ChromaDB

In [None]:
import sqlite3
def __split_string_in_half(self, s):
    """Split ``s`` into two halves on word boundaries.

    The string is split on whitespace and each half is re-joined with single
    spaces. With an odd number of words the first half gets the extra word.

    Returns:
        A ``(first_half, second_half)`` tuple of strings.
    """
    tokens = s.split()

    # Ceiling division: rounds the cut point up when the word count is odd.
    cut = (len(tokens) + 1) // 2

    return ' '.join(tokens[:cut]), ' '.join(tokens[cut:])

from contextlib import closing

# Populate the "Green-AI" collection once; the work is skipped when a
# collection of that name already exists.
# NOTE(review): `self` is not defined at cell level -- this looks like the body
# of a method on an object exposing `client` and `embedding_model`; confirm
# where it is meant to live.
collection_exists = "Green-AI" in [collection.name for collection in self.client.list_collections()]
if not collection_exists:
    # closing() guarantees each connection is released; previously the first
    # handle was overwritten and neither connection was ever closed.
    with closing(sqlite3.connect("crawler/mitteilungen.db")) as con:
        cur = con.cursor()
        # Loaded but currently unused: the ingestion loop for it is disabled below.
        mitteilungen = cur.execute("SELECT * FROM mitteilungen").fetchall()

    with closing(sqlite3.connect("crawler/green-ai-projekte-eng.db")) as con:
        cur = con.cursor()
        projekte = cur.execute("SELECT * FROM projekte").fetchall()

    collection = self.client.create_collection(
        name="Green-AI",
        metadata={"hnsw:space": "cosine"}, # l2 is the default
        embedding_function=self.embedding_model,
    )
    documents = []
    metadata = []
    ids = []
    # for mitteilung in mitteilungen:
    #     documents.append(mitteilung[2])
    #     metadata.append({"title": mitteilung[1]})
    #     ids.append("mitteilung " + str(mitteilung[0]))
    for projekt in projekte:
        # Column 0 is used as the id, column 1 as the title, column 2 as the text.
        document_text = projekt[2].replace("\t", " ")
        document_text = document_text.replace("\n", " ")
        document_text = document_text.replace("  ", " ")
        # Each project text is stored as two documents (first and second half),
        # told apart by the trailing index in the id.
        for i, current_text in enumerate(self.__split_string_in_half(document_text)):
            documents.append(current_text)
            metadata.append({"title": projekt[1]})
            ids.append(f"projekt_{projekt[0]}_{i}")
    collection.add(
        documents=documents,
        metadatas=metadata,
        ids=ids,
    )

### RAG Class
Document retrieval and prompt engineering

In [None]:
import chromadb
from chromadb.utils import embedding_functions # This helps us fetch our embedding model
from mlx_lm import load, generate

class RAG:
    """Retrieval-augmented generation over the "Green-AI" ChromaDB collection.

    A question is translated to English, the best-matching stored document is
    retrieved, an instruct model answers from that document, and the answer is
    translated back into the language detected for the question.
    """

    def __init__(self, translator) -> None:
        """Set up the vector store, the language model and the translator.

        Args:
            translator: Object exposing ``translate_text(text, target_lang=...)``
                whose result has ``text`` and ``detected_source_lang``
                attributes.
        """
        self.embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="intfloat/multilingual-e5-large")
        self.client = chromadb.PersistentClient(path="./chromadb")
        self.__init_database()
        self.collection = self.client.get_or_create_collection(
            name="Green-AI",
            metadata={"hnsw:space": "cosine"},
            embedding_function=self.embedding_model,
        )
        self.model, self.tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.2-4bit-mlx")
        self.translator = translator

    def __split_string_in_half(self, s):
        """Split ``s`` on word boundaries into ``(first_half, second_half)``.

        Words are re-joined with single spaces; with an odd word count the
        first half receives the extra word.
        """
        words = s.split()
        mid_index = (len(words) + 1) // 2
        return ' '.join(words[:mid_index]), ' '.join(words[mid_index:])

    def __init_database(self):
        """Create and fill the "Green-AI" collection if it does not exist yet.

        Previously ``__init__`` called this method although the class did not
        define it, so constructing a ``RAG`` raised ``AttributeError``. Only
        the ``projekte`` table is ingested; each project text is stored as two
        documents (its first and second half).
        """
        import sqlite3
        from contextlib import closing

        existing = [collection.name for collection in self.client.list_collections()]
        if "Green-AI" in existing:
            return

        # closing() releases the connection even if the query fails.
        with closing(sqlite3.connect("crawler/green-ai-projekte-eng.db")) as con:
            projekte = con.execute("SELECT * FROM projekte").fetchall()

        collection = self.client.create_collection(
            name="Green-AI",
            metadata={"hnsw:space": "cosine"},  # l2 is the default
            embedding_function=self.embedding_model,
        )
        documents = []
        metadata = []
        ids = []
        for projekt in projekte:
            # Column 0 is used as the id, column 1 as the title, column 2 as
            # the text. The helper splits on any whitespace and re-joins with
            # single spaces, so tabs and newlines need no separate clean-up.
            for i, current_text in enumerate(self.__split_string_in_half(projekt[2])):
                documents.append(current_text)
                metadata.append({"title": projekt[1]})
                ids.append(f"projekt_{projekt[0]}_{i}")
        collection.add(
            documents=documents,
            metadatas=metadata,
            ids=ids,
        )

    def query(self, prompt):
        """Return the raw ChromaDB result for the single best match to ``prompt``."""
        return self.collection.query(query_texts=prompt, n_results=1)

    def generate_text(self, prompt: str, tasks: dict, task_id: str) -> str:
        """Answer ``prompt`` from the best-matching document.

        Args:
            prompt: The user's question, in any language the translator handles.
            tasks: Mapping of task ids to dicts; the answer is written to
                ``tasks[task_id]["result"]``.
            task_id: Key of the task entry to update.

        Returns:
            The translated answer (the same text stored in ``tasks``).
        """
        print("Started the RAG generation")
        translated_text = self.translator.translate_text(prompt, target_lang="EN-US")
        result = self.collection.query(
            query_texts=translated_text.text,
            n_results=1,
        )
        # First (and only) hit for the first (and only) query text.
        documents = result["documents"][0][0]
        prompt = f"<s>[INST] You are a assistant. Answer the following question based on the attached text. Text: {documents}\n Question: {translated_text.text}[/INST] "
        output = generate(self.model, self.tokenizer, prompt=prompt, verbose=False, max_tokens=1000, repetition_penalty=1.1)
        print(output)
        # NOTE(review): the detected source language is passed back as the
        # target language -- confirm the translator accepts every code it
        # reports (e.g. a bare "EN") as a target.
        translated_output = self.translator.translate_text(output, target_lang=translated_text.detected_source_lang)
        tasks[task_id]["result"] = translated_output.text
        # The signature promises a str; previously the method returned None.
        return translated_output.text

### LLM Generating Answers
Including a Chat Interface