Skip to content

Commit

Permalink
ML/RAG: Fix testing
Browse files: browse the repository at this point in the history
  • Loading branch information
amotl committed Nov 8, 2023
1 parent f760e4c commit 832c3bc
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 10 deletions.
Expand Up @@ -54,7 +54,7 @@
"id": "774c3e61",
"metadata": {},
"source": [
"Install required Python packages"
"Install required Python packages, and import Python modules."
]
},
{
Expand All @@ -80,7 +80,11 @@
"\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter"
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"# TODO: Bring this into the `crate-python` driver.\n",
"from cratedb_toolkit.sqlalchemy.patch import patch_inspector\n",
"patch_inspector()"

This comment has been minimized.

Copy link
@amotl

amotl Nov 9, 2023

Author Member

Address this; see #137.

]
},
{
Expand Down Expand Up @@ -302,17 +306,19 @@
"pages_embeddings = embeddings.embed_documents(pages_text)\n",
"\n",
"# The next step creates a dataframe that contains the text of the documents and their embeddings. \n",
"# The embeddings will be stored in CrateDB using the FLOAT_VECTOR type.\n",
"df = pd.DataFrame(list(zip(pages_text, pages_embeddings)),columns =['text', 'embedding'])\n",
"df = pd.DataFrame(list(zip(pages_text, pages_embeddings)), columns=['text', 'embedding'])\n",
"\n",
"create_table = sa.text(\"CREATE TABLE IF NOT EXISTS text_data (text TEXT, embedding FLOAT_VECTOR(1536))\")\n",
"# The embeddings will be stored in CrateDB using the FLOAT_VECTOR type.\n",
"engine = sa.create_engine(CONNECTION_STRING, echo=False)\n",
"with engine.connect() as connection:\n",
"\n",
"with engine.connect() as con:\n",
" con.execute(create_table)\n",
" # Create database table.\n",
" connection.execute(sa.text(\"CREATE TABLE IF NOT EXISTS text_data (text TEXT, embedding FLOAT_VECTOR(1536));\"))\n",
"\n",
" # Write text and embeddings to CrateDB database.\n",
" df.to_sql(name=\"text_data\", con=connection, if_exists=\"append\", index=False)\n",
" connection.execute(sa.text(\"REFRESH TABLE text_data;\"))\n",
"\n",
"# The text and embeddings are written to CrateDB database:\n",
"df.to_sql(name='text_data', con=engine, if_exists='append', index=False)\n",
"df.head(5)"
]
},
Expand Down
4 changes: 4 additions & 0 deletions topic/machine-learning/llm-langchain/pyproject.toml
Expand Up @@ -31,6 +31,10 @@ nb_diff_ignore = [
"/metadata/language_info",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",

# cratedb-vectorstore-rag-openai.ipynb
"/cells/13/outputs",
"/cells/17/outputs",
]

[tool.coverage.run]
Expand Down
3 changes: 3 additions & 0 deletions topic/machine-learning/llm-langchain/requirements.txt
@@ -1,6 +1,8 @@
# Real.
crash
crate[sqlalchemy]
cratedb-toolkit

# langchain[cratedb,openai]
# pueblo[env,nlp]==0.0.3
pydantic>=1,<3
Expand All @@ -10,5 +12,6 @@ requests-cache<2
unstructured<0.11

# Development.
cratedb-toolkit @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main
langchain[cratedb,openai] @ git+https://github.com/crate-workbench/langchain.git@testing#subdirectory=libs/langchain
pueblo[nlp] @ git+https://github.com/pyveci/pueblo.git@main
3 changes: 2 additions & 1 deletion topic/machine-learning/llm-langchain/test.py
Expand Up @@ -17,11 +17,12 @@ def cratedb() -> DatabaseAdapter:


@pytest.fixture(scope="function", autouse=True)
def db_init(cratedb):
def reset_database(cratedb):
"""
Initialize database.
"""
cratedb.run_sql("DROP TABLE IF EXISTS mlb_teams_2012;")
cratedb.run_sql("DROP TABLE IF EXISTS text_data;")
time.sleep(0.01)


Expand Down

0 comments on commit 832c3bc

Please sign in to comment.