In [51]:
import os

from smolagents import LiteLLMModel, Tool
from smolagents.agents import CodeAgent

In [52]:
import os


# Configuration Class
class Config:
    def __init__(self):
        self.search_query = "agent"
        self.max_results = 5
        self.output_folder = "data"
        self.base_url = "http://export.arxiv.org/api/query?"
        self.log_db_path = "app_logs.db"
        self.log_file_path = "app.log"

    def load_from_env(self):
        self.search_query = os.environ.get("SEARCH_QUERY", self.search_query)
        self.max_results = int(os.environ.get("MAX_RESULTS", self.max_results))
        self.output_folder = os.environ.get("OUTPUT_FOLDER", self.output_folder)
        self.base_url = os.environ.get("BASE_URL", self.base_url)
        self.log_db_path = os.environ.get("LOG_DB_PATH", self.log_db_path)
        self.log_file_path = os.environ.get("LOG_FILE_PATH", self.log_file_path)
        return self


# Initialize Configuration
config = Config().load_from_env()  # Load from env variables first

In [53]:
pdf_directory = config.output_folder
pdf_files = [
    os.path.join(pdf_directory, f)
    for f in os.listdir(pdf_directory)
    if f.endswith(".pdf")
]

print(f"found {len(pdf_files)} pdf files")

found 49 pdf files


In [54]:
from pypdf import PdfReader

docs = []
for file_path in pdf_files:
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    print(text)
    docs.extend(text)

arXiv:nlin/0211015v1  [nlin.CG]  12 Nov 2002
A New ParameterF to Classify Cellular
Automata Rule Table Space and a Phase
Diagram in λ − F Plane
Sunao Sakai and Megumi Kanno
Faculty of Education, Yamagata University, Yamagata, 990-8560, Japan
Abstract
It is shown that for the N-neighbor and K-state cellular automata, the class II,
class III and class IV patterns coexist at least in the range 1
K ≤ λ ≤ 1 − 1
K . The
mechanism which determines the diﬀerence between the patter n classes at a ﬁxed λ
is found, and it is studied quantitatively by introducing a n ew parameter F . Using
the parameter F and λ, the phase diagram of cellular automata is obtained for
5-neighbor and 4-state cellular automata.
PACS: 89.75.-k Complex Systems
1 Introduction
Cellular automata (CA) has been one of the most studied ﬁelds in the r e-
search of complex systems. Various patterns has been generate d by choosing
the rule tables. Wolfram[1] has classiﬁed these patterns into four r ough cat-
egories: class I (ho

Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 95 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 111 0 (offset 0)
Ignoring wrong pointing object 117 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 151 0 (offset 0)
Ignoring wrong pointing object 153 0 (offset 0)
Ignoring wrong pointing object 160 0 (offset 0)
Ignoring wrong pointing object 168 0 (offset 0)
Ignoring wrong pointing object 171 0 (offset 0)
Ign

On complex dynamics from reversible cellular
automata
Juan Carlos Seck-Tuoh-Mora*, Genaro J. Martinez,
Norberto Hernandez-Romero, Joselito Medina-Marin,
Irving Barragan-Vite
AAI-ICBI-UAEH. Carr Pachuca-Tulancingo Km 4.5.
Pachuca 42184 Hidalgo. Mexico
Unconventional Computing Centre, University of the
West of England, BS16 1QY Bristol, United Kingdom
Escuela Superior de Computo, Instituto Politecnico
Nacional, Mexico
September 2020
Abstract
Complexity has been a recurrent research topic in cellular automata
because they represent systems where complex behaviors emerge from
simple local interactions. A signiﬁcant amount of previous research has
been conducted proposing instances of complex cellular automata; how-
ever, most of the proposed methods are based on a careful search or a
meticulous construction of evolution rules.
This paper presents the emergence of complex behaviors based on re-
versible cellular automata. In particular, this paper shows that reversible
cellular automata rep

In [69]:
class Config:
    DB_NAME = "pdf_documents.db"
    TABLE_NAME = "pdf_documents"
    PAGES_TABLE = "pdf_pages"
    FILES_TABLE = "pdf_files"
    OLLAMA_BASE_URL = "http://localhost:11434"
    EMBEDDING_MODEL_NAME = "mxbai-embed-large"
    EMBEDDING_DB_NAME = "embeddings.db"

config = Config()

In [56]:
import sqlite3
from pypdf import PdfReader


class PDFDatabase:
    def __init__(self, db_name=Config.DB_NAME):
        self.conn = sqlite3.connect(db_name)
        self.cursor = self.conn.cursor()
        self.create_tables()

    def create_tables(self):
        self.cursor.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {Config.TABLE_NAME} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_name TEXT UNIQUE,
                text TEXT
            )
        """
        )

        self.cursor.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {Config.PAGES_TABLE} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                document_id INTEGER,
                page_number INTEGER,
                text TEXT,
                FOREIGN KEY(document_id) REFERENCES {Config.TABLE_NAME}(id)
            )
        """
        )

        self.cursor.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {Config.FILES_TABLE} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_name TEXT UNIQUE,
                file BLOB
            )
        """
        )
        self.conn.commit()

    def pdf_exists(self, file_name):
        self.cursor.execute(
            f"""
            SELECT 1 FROM {Config.TABLE_NAME} WHERE file_name = ?
        """,
            (file_name,),
        )
        return self.cursor.fetchone() is not None

    def insert_pdf(self, file_path):
        if self.pdf_exists(file_path):
            print(f"Skipping {file_path}, already processed.")
            return

        reader = PdfReader(file_path)
        text = "".join(
            [page.extract_text() + "\n" for page in reader.pages if page.extract_text()]
        )

        with open(file_path, "rb") as file:
            file_data = file.read()

        self.cursor.execute(
            f"""
            INSERT INTO {Config.FILES_TABLE} (file_name, file)
            VALUES (?, ?)
        """,
            (file_path, file_data),
        )

        self.cursor.execute(
            f"""
            INSERT INTO {Config.TABLE_NAME} (file_name, text)
            VALUES (?, ?)
        """,
            (file_path, text),
        )
        document_id = self.cursor.lastrowid

        for page_number, page in enumerate(reader.pages):
            page_text = page.extract_text()
            if page_text:
                self.cursor.execute(
                    f"""
                    INSERT INTO {Config.PAGES_TABLE} (document_id, page_number, text)
                    VALUES (?, ?, ?)
                """,
                    (document_id, page_number, page_text),
                )

        self.conn.commit()

    def close(self):
        self.conn.close()

In [57]:
# Insert PDFs into the database
db = PDFDatabase()
for file_path in pdf_files:
    db.insert_pdf(file_path)
db.close()

print("PDF files, content, and pages inserted into the database successfully.")

Skipping data\0211015v1.pdf, already processed.
Skipping data\0307176v1.pdf, already processed.
Skipping data\0401123v1.pdf, already processed.
Skipping data\0404003v2.pdf, already processed.
Skipping data\0410563v2.pdf, already processed.
Skipping data\0502061v1.pdf, already processed.
Skipping data\0504058v1.pdf, already processed.
Skipping data\0505033v1.pdf, already processed.
Skipping data\0609068v1.pdf, already processed.
Skipping data\0702046v1.pdf, already processed.
Skipping data\0711.1349v1.pdf, already processed.
Skipping data\0811.1513v1.pdf, already processed.
Skipping data\0906.3213v1.pdf, already processed.
Skipping data\1001.5471v2.pdf, already processed.
Skipping data\1003.1983v1.pdf, already processed.
Skipping data\1004.1830v1.pdf, already processed.
Skipping data\1009.4509v1.pdf, already processed.
Skipping data\1012.1220v1.pdf, already processed.
Skipping data\1102.2315v1.pdf, already processed.
Skipping data\1103.2119v1.pdf, already processed.
Skipping data\1105.5

In [58]:
import requests
import json


def generate_embeddings(text, model_name=config.EMBEDDING_MODEL_NAME):
    """Generate embeddings for the given text using the specified model."""
    try:
        # Send a POST request to generate embeddings
        url = f"{config.OLLAMA_BASE_URL}/api/embed"
        data = {"input": text, "model": model_name}
        response = requests.post(url, json=data)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the JSON response
            embeddings = response.json()
            # print("Embeddings:")
            # pretty_json = json.dumps(embeddings, indent=4)
            # print(pretty_json)
            return embeddings["embeddings"]
        else:
            print(f"Failed to generate embeddings. Status code: {response.status_code}")
            print("Response:", response.text)
            return None

    except requests.ConnectionError:
        print(
            "Failed to connect to the Ollama server. Make sure it is running locally and the URL is correct."
        )
        return None
    except json.JSONDecodeError:
        print("Failed to parse JSON response from Ollama server.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


# Example usage
text = "Hello, world!"
res = generate_embeddings(text)
print(res)

[[0.017394623, 0.044793025, 0.0005329998, -0.009000013, -0.045380898, 0.008544399, 0.072358154, 0.042427715, 0.0476788, 0.002133881, 0.0070724227, 0.004481849, -0.005318067, 0.007226094, -0.05024973, -0.0025904828, -0.009076138, -0.038527913, -0.043206822, 0.0018977913, -0.019916972, 0.02116807, -0.07044214, -0.020155963, 0.012409212, 0.019971674, 0.046048257, -0.004015819, 0.04798826, 0.047921292, -0.016926793, 0.009403972, -0.014632336, -0.056704026, -0.026213864, -0.01050382, 0.022746101, -0.01734599, -0.029320415, -0.036378894, 0.029179433, 0.012595907, 0.053485885, -0.029422527, -0.07360141, 0.019871961, 0.0023999002, 0.013447176, 0.05738303, -0.041129053, 0.009918008, 0.022733316, -0.012682854, -0.017570833, -0.012408084, 0.0031584452, -0.048598524, 0.03101221, -0.020229246, 0.03173585, 0.012432155, 0.038107116, 0.043709937, -0.0685571, -0.0024307813, 0.03702609, -0.021566775, -0.02743357, 0.037042916, -0.0015403832, -0.04436249, 0.027944624, 0.0059562526, -0.00311677, -0.0481047

In [59]:
import numpy as np

# Function to serialize float32 list to binary format compatible with sqlite-vec
def serialize_f32(vec):
    return np.array(vec, dtype=np.float32).tobytes()

def reciprocal_rank_fusion(fts_results, vec_results, k=60):
    rank_dict = {}

    # Process FTS results
    for rank, (id,) in enumerate(fts_results):
        if id not in rank_dict:
            rank_dict[id] = 0
        rank_dict[id] += 1 / (k + rank + 1)

    # Process vector results
    for rank, (rowid, distance) in enumerate(vec_results):
        if rowid not in rank_dict:
            rank_dict[rowid] = 0
        rank_dict[rowid] += 1 / (k + rank + 1)

    # Sort by RRF score
    sorted_results = sorted(rank_dict.items(), key=lambda x: x[1], reverse=True)
    return sorted_results

def or_words(input_string):
    # Split the input string into words
    words = input_string.split()

    # Join the words with ' OR ' in between
    result = ' OR '.join(words)

    return result

def lookup_row(id):
    row_lookup = cur.execute('''
    SELECT content FROM pdf_lookup WHERE id = ?
    ''', (id,)).fetchall()
    content = ''
    for row in row_lookup:
        content= row[0]
        break
    return content


In [70]:
import sqlite_vec

# Create an in memory sqlite db
db = sqlite3.connect(config.EMBEDDING_DB_NAME)
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

sqlite_version, vec_version = db.execute(
    "select sqlite_version(), vec_version()"
).fetchone()
print(f"sqlite_version={sqlite_version}, vec_version={vec_version}")


sqlite_version=3.45.1, vec_version=v0.1.6


In [61]:
data = generate_embeddings('The quick brown fox')
dims = len(data[0])
print ('Dims in Vector Embeddings:', dims)

Dims in Vector Embeddings: 1024


In [62]:
cur = db.cursor()
cur.execute('CREATE VIRTUAL TABLE IF NOT EXISTS pdf_fts USING fts5(id UNINDEXED, content, tokenize="porter unicode61");')

# sqlite-vec always adds an ID field
cur.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS pdf_vec USING vec0(embedding float[''' + str(dims) + '])''')

# Create a content lookup table with an index on the ID
cur.execute('CREATE TABLE IF NOT EXISTS pdf_lookup (id INTEGER PRIMARY KEY AUTOINCREMENT, content TEXT);')

<sqlite3.Cursor at 0x24ede91d2c0>

In [63]:
fts_data = [
    (1, 'The quick brown fox jumps over the lazy dog.'),
    (2, 'Artificial intelligence is transforming the world.'),
    (3, 'Climate change is a pressing global issue.'),
    (4, 'The stock market fluctuates based on various factors.'),
    (5, 'Remote work has become more prevalent during the pandemic.'),
    (6, 'Electric vehicles are becoming more popular.'),
    (7, 'Quantum computing has the potential to revolutionize technology.'),
    (8, 'Healthcare innovation is critical for societal well-being.'),
    (9, 'Space exploration expands our understanding of the universe.'),
    (10, 'Cybersecurity threats are evolving and becoming more sophisticated.')
]


In [66]:
cur.execute("DELETE FROM pdf_fts")
cur.execute("DELETE FROM pdf_lookup")
cur.execute("DELETE FROM pdf_vec")


cur.executemany('''
INSERT INTO pdf_fts (id, content) VALUES (?, ?)
''', fts_data);


cur.executemany('''
  INSERT INTO pdf_lookup (id, content) VALUES (?, ?)
''', fts_data)


# Generate embeddings for the content and insert into mango_vec
for row in fts_data:
    id, content = row
    embedding = generate_embeddings(content)
    cur.execute('''
    INSERT INTO pdf_vec (rowid, embedding) VALUES (?, ?)
    ''', (id, serialize_f32(list(embedding))))


# Commit changes
db.commit()

In [67]:
def search(fts_search_query: str = "Electric", top_k: int = 2):
    fts_results = cur.execute('''
    SELECT id FROM pdf_fts WHERE pdf_fts MATCH ? ORDER BY rank limit 5
    ''', (or_words(fts_search_query),)).fetchall()

    # Vector search query
    query_embedding = generate_embeddings(fts_search_query)
    vec_results = cur.execute('''
    SELECT rowid, distance FROM pdf_vec WHERE embedding MATCH ? and K = ?
    ORDER BY distance
    ''', [serialize_f32(list(query_embedding)), top_k]).fetchall()

    # Combine results using RRF
    combined_results = reciprocal_rank_fusion(fts_results, vec_results)

    # Print combined results
    for id, score in combined_results:
        print(f'ID: {id}, Content: {lookup_row(id)}, RRF Score: {score}')    


In [68]:
print("---- technology ----")
search("technology")
print("---- Electric ----")
search("Electric")  
print("---- Medical ----")
search("medical")


---- technology ----
ID: 7, Content: Quantum computing has the potential to revolutionize technology., RRF Score: 0.03278688524590164
ID: 2, Content: Artificial intelligence is transforming the world., RRF Score: 0.016129032258064516
---- Electric ----
ID: 6, Content: Electric vehicles are becoming more popular., RRF Score: 0.03278688524590164
ID: 2, Content: Artificial intelligence is transforming the world., RRF Score: 0.016129032258064516
---- Medical ----
ID: 8, Content: Healthcare innovation is critical for societal well-being., RRF Score: 0.01639344262295082
ID: 3, Content: Climate change is a pressing global issue., RRF Score: 0.016129032258064516


In [None]:
class RetrieverTool(Tool):
    name = "retriever"
    description = (
        "Uses semantic search to retrieve the parts of documentation that could be most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, vector_store, **kwargs):
        super().__init__(**kwargs)
        self.vector_store = vector_store

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"
        docs = self.vector_store.similarity_search(query, k=3)
        return "\nRetrieved documents:\n" + "".join(
            [f"\n\n===== Document {str(i)} =====\n" + doc.page_content for i, doc in enumerate(docs)]
        )


retriever_tool = RetrieverTool(vector_store)
