In [30]:
import duckdb

In [31]:
con = duckdb.connect("database_for_testing_duck.duckdb")

In [32]:
con.execute("""
    CREATE TABLE IF NOT EXISTS bank AS 
    SELECT * FROM read_csv('bank_information.csv')
""")
con.execute("SHOW ALL TABLES").fetchdf()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,database_for_testing_duck,main,bank,"[index, age, job, marital, education, default,...","[BIGINT, BIGINT, VARCHAR, VARCHAR, VARCHAR, VA...",False


In [33]:
con.execute("SELECT * FROM bank WHERE duration < 100 LIMIT 5").fetchdf()

Unnamed: 0,index,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,9,25,services,single,high.school,no,yes,no,telephone,may,...,1,999,0,nonexistent,,,,4.857,,False
1,10,41,blue-collar,married,unknown,unknown,no,no,telephone,may,...,1,999,0,nonexistent,,,,4.857,,False
2,20,30,unemployed,married,high.school,no,no,no,telephone,may,...,1,999,0,nonexistent,,,,4.857,,False
3,25,35,technician,married,university.degree,no,no,yes,telephone,may,...,1,999,0,nonexistent,,,,4.857,,False
4,26,59,technician,married,unknown,no,yes,no,telephone,may,...,1,999,0,nonexistent,,,,4.857,,False


In [34]:
rel = con.table("bank")
rel.columns

['index',
 'age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'y']

In [35]:
rel.filter("duration < 100").project("job,education,loan").order("job").limit(3).df()

Unnamed: 0,job,education,loan
0,blue-collar,unknown,no
1,blue-collar,unknown,no
2,blue-collar,basic.9y,no


In [36]:
res = duckdb.query("""SELECT 
                            job,
                            COUNT(*) AS total_clients_contacted,
                            AVG(duration) AS avg_campaign_duration,
                        FROM 
                            'bank_information.csv'
                        WHERE 
                            age > 30
                        GROUP BY 
                            job
                        ORDER BY 
                            total_clients_contacted DESC;""")
res.df()

Unnamed: 0,job,total_clients_contacted,avg_campaign_duration
0,admin.,26,245.807692
1,blue-collar,23,320.695652
2,technician,13,385.153846
3,management,9,283.444444
4,services,8,323.75
5,unknown,4,239.25
6,housemaid,3,273.333333
7,entrepreneur,3,601.666667
8,retired,2,258.0
9,unemployed,2,345.0


In [37]:
# con.close()

In [38]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import StorageContext

from IPython.display import Markdown, display

In [39]:
import os
from llama_index.llms.openai import OpenAI
import config

config.apiToken()

llm = OpenAI(model="gpt-4o",api_key=os.environ["OPENAI_API_KEY"])

In [40]:
from llama_index.embeddings.openai import OpenAIEmbedding
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

In [41]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [42]:
documents = SimpleDirectoryReader("Data").load_data()

In [54]:
vector_store = DuckDBVectorStore(database_name = "datacamp.duckdb",table_name = "blog",persist_dir="./", embed_dim=1536)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

CatalogException: Catalog Error: Table with name "blog" already exists!

In [53]:
# con = duckdb.connect("datacamp.duckdb")

con.execute("SHOW ALL TABLES").fetchdf()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,database_for_testing_duck,main,bank,"[index, age, job, marital, education, default,...","[BIGINT, BIGINT, VARCHAR, VARCHAR, VARCHAR, VA...",False


In [49]:
query_engine = index.as_query_engine()
response = query_engine.query("Who wrote 'DuckDB Tutorial: Building AI Projects'?")
display(Markdown(f"<b>{response}</b>"))

<b>The 'DuckDB Tutorial: Building AI Projects' was written by Abid Ali Awan.</b>

In [50]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine

memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

chat_engine = CondensePlusContextChatEngine.from_defaults(
    index.as_retriever(),
    memory=memory,
    llm=llm
)

response = chat_engine.chat(
    "How do I implement DuckDB with LLM? Provide step by step instructions"
)

display(Markdown(response.response))

To implement DuckDB with a Large Language Model (LLM), you can follow these steps to build a Retrieval-Augmented Generation (RAG) application using DuckDB as a vector database and retriever. Here's a step-by-step guide based on the tutorial:

1. **Install Necessary Packages**:
   - First, ensure you have Python installed on your system. Then, install the required packages using pip:
     ```bash
     %pip install duckdb
     %pip install llama-index
     %pip install llama-index-vector-stores-duckdb
     ```

2. **Set Up Your Environment**:
   - Import the necessary Python packages. This typically includes DuckDB and any other libraries you need for your specific application, such as LlamaIndex for handling the index and retrieval processes.

3. **Create and Retrieve the Index**:
   - Use DuckDB to create a vector database. This involves setting up your database schema and loading your data into DuckDB.
   - You can execute SQL queries using DuckDB's query function to manipulate and retrieve data. For example, you might want to filter data based on certain criteria or aggregate information.

4. **Build the RAG Application**:
   - Integrate DuckDB with LlamaIndex to build your RAG application. This involves using DuckDB as the backend for storing and retrieving vectors.
   - Implement the logic for your application, which might include querying the database to retrieve relevant information based on input from the LLM.

5. **Execute Queries and Analyze Data**:
   - Use SQL queries to interact with your data. For example, you can execute queries to find specific information, such as job titles of clients over a certain age, count the number of clients contacted, or calculate average campaign durations.
   - Convert the results into a format suitable for further analysis or use within your application.

6. **Close the Connection**:
   - After completing your operations, close the connection to the DuckDB database to release any resources and prevent potential memory leaks.

This setup allows you to leverage DuckDB's high-performance capabilities for analytical queries and integrate it with LLMs for enhanced data retrieval and processing in AI applications.

In [51]:
response = chat_engine.chat(
    "Could you please provide more details about the integration part with the LLM and how I create a memory buffer?"
)
display(Markdown(response.response))

To integrate DuckDB with a Large Language Model (LLM) and create a memory buffer, you can follow these steps based on the context provided:

1. **Set Up the LLM and Embedding Model**:
   - Use the GPT-4o model from OpenAI for your language model. You need to provide a model name and an API key to create the LLM client.
   - For embeddings, use the OpenAI text-embedding-3-small model. This will help in converting your data into embeddings that can be stored in the vector store.

   ```python
   import os
   from llama_index.llms.openai import OpenAI
   from llama_index.embeddings.openai import OpenAIEmbedding

   llm = OpenAI(model="gpt-4o", api_key=os.environ["OPENAI_API_KEY"])
   embed_model = OpenAIEmbedding(model="text-embedding-3-small")
   ```

2. **Set Global Settings for LlamaIndex**:
   - Configure the LlamaIndex settings to use the LLM and embedding model globally. This ensures that all functions within LlamaIndex can access these models by default.

   ```python
   from llama_index.core import Settings

   Settings.llm = llm
   Settings.embed_model = embed_model
   ```

3. **Load Data and Create Vector Store**:
   - Load your PDF files or other data into the system using a directory reader. Convert this data into embeddings and store them in a vector store using DuckDB.

   ```python
   from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
   from llama_index.vector_stores.duckdb import DuckDBVectorStore

   documents = SimpleDirectoryReader("Data").load_data()
   ```

4. **Integrate with DuckDB**:
   - Use DuckDB as a vector database to store and retrieve embeddings. This involves creating a vector store called “blog” using an existing DuckDB database.

5. **Create a Memory Buffer**:
   - Implement a memory buffer to retain context between interactions. This can be done by storing previous interactions or queries in the vector store and retrieving them as needed to maintain context.

6. **Build the Chat Engine**:
   - Develop a chat engine that uses the vector store to provide responses. The chat engine should be able to remember previous conversations and use the memory buffer to provide contextually relevant answers.

   ```python
   response = chat_engine.chat("Could you please provide more details about the Post Fine-Tuning Steps?")
   ```

By following these steps, you can effectively integrate DuckDB with an LLM and create a memory buffer to enhance the interaction capabilities of your application. This setup allows for efficient data retrieval and contextual understanding in AI-driven projects.

In [52]:
from sqlalchemy import create_engine

engine = create_engine("duckdb:///datacamp.duckdb")
with engine.connect() as connection:
    cursor = connection.exec_driver_sql("SELECT * FROM bank LIMIT 3")
    print(cursor.fetchall())

ProgrammingError: (duckdb.duckdb.CatalogException) Catalog Error: Table with name bank does not exist!
Did you mean "duckdb_indexes"?
LINE 1: SELECT * FROM bank LIMIT 3
                      ^
[SQL: SELECT * FROM bank LIMIT 3]
(Background on this error at: https://sqlalche.me/e/20/f405)