Merge branch 'georgia-tech-db:master' into master

Chitti-Ankith committed Oct 26, 2023
2 parents 4695199 + c3b45b6 commit 1856ff1

Showing 244 changed files with 19,439 additions and 5,823 deletions.
49 changes: 43 additions & 6 deletions .circleci/config.yml
@@ -56,7 +56,18 @@ workflows:
ignore:
- master
- staging

################################
#### LONG INTEGRATION TESTS: PR
################################
################################
- Linux:
name: Long Integration Test (Cache) | v3.10 | Linux
mode: LONG INTEGRATION CACHE
filters:
branches:
ignore:
- master
- staging
################################
#### SHORT THIRDPARTY TESTS: PR
################################
@@ -201,6 +212,12 @@ jobs:
- restore_cache:
keys:
- v1-model_cache-{{ checksum "setup.py" }}

# First try restoring the testmondata from the PR branch, then staging.
- restore_cache:
keys:
- v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}-

- run:
name: Install EvaDB package from GitHub repo with all dependencies
@@ -213,16 +230,16 @@ jobs:
pip install --upgrade pip
if [ $RAY = "ENABLED" ]; then
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ray,qdrant]"
pip install ".[dev,ray,qdrant,pinecone,chromadb]"
else
pip install ".[dev]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
python -c "import yaml;f = open('evadb/evadb.yml', 'r+');config_obj = yaml.load(f, Loader=yaml.FullLoader);config_obj['experimental']['ray'] = True;f.seek(0);f.write(yaml.dump(config_obj));f.truncate();"
else
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ludwig,qdrant]"
pip install ".[dev,ludwig,qdrant,pinecone,chromadb]"
else
pip install ".[dev]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
fi
@@ -237,6 +254,7 @@
if [[ $PY_VERSION = "3.10" ]] || [[ $PY_VERSION = "3.11" ]]; then
export SETUPTOOLS_USE_DISTUTILS=stdlib
fi
set +e # To make sure the later cache step is not skipped.
bash script/test/test.sh -m "<< parameters.mode >>"
# Enable cache save conditionally (to avoid empty cache in Notebooks)
@@ -251,6 +269,25 @@
- /home/circleci/.cache/torch/
- /home/circleci/.cache/gpt4all/

# Collect the testmondata only for long integration tests
- when:
condition:
or:
- equal: [ LONG INTEGRATION CACHE, << parameters.mode >> ]
- and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
steps:
- save_cache:
key: v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-{{ epoch }}
paths:
- .testmondata
- .testmondata-shm
- .testmondata-wal


- save_cache:
key: v1-pip-wheel_cache-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
paths:
@@ -449,7 +486,7 @@ jobs:
source test_evadb/bin/activate
pip install --upgrade pip
pip debug --verbose
pip install ".[dev,ludwig,qdrant,forecasting]"
pip install ".[dev,ludwig,qdrant,forecasting,pinecone,chromadb]"
source test_evadb/bin/activate
bash script/test/test.sh -m "<< parameters.mode >>"
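The cache steps above center on pytest-testmon, which records test-to-code dependencies in `.testmondata`, a SQLite database (the `-shm` and `-wal` siblings are SQLite write-ahead-log files). Restoring the cache lets a PR run re-execute only the tests affected by its changes. A minimal sketch of the invocation that `script/test/test.sh` presumably wraps; the test path is an assumption:

```python
# Minimal sketch, assuming the pytest-testmon plugin is installed;
# equivalent to running `pytest --testmon` on the CLI. Only tests whose
# recorded code dependencies have changed since .testmondata was last
# written are re-executed.
import pytest

exit_code = pytest.main(["--testmon", "test/"])  # "test/" is an assumption
```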
4 changes: 3 additions & 1 deletion .github/workflows/release.yml
@@ -15,7 +15,9 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: master
fetch-depth: 0
- name: Switch to master
run: git checkout master
- name: Install env.
run: |
python -m venv test_evadb
83 changes: 83 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,89 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### [Deprecated]
### [Removed]

## [0.3.7] - 2023-09-30

* PR #1239: release 0.3.7
* PR #1238: Revert "add stable diffusion integration"
* PR #1111: add stable diffusion integration
* PR #1228: fix: improve messaging and fix bugs
* PR #1230: Force fresh .testmondata cache
* PR #1229: Fix staging CI setup
* PR #1225: fix: merging dataframes with None value
* PR #1223: fix: native db bugs
* PR #1209: feat: add testmon in pytest
* PR #1201: Improve Documentation For Model Training
* PR #1210: chore
* PR #1212: fix: 1199 enhance similarity search test to make index is used
* PR #1211: fix: evadb_client fails on launch
* PR #1203: fix: chatgpt exact cache
* PR #1208: fix: if create table fails, we were not cleaning the entry in catalog
* PR #1204: feat: print error msg if creating built-in functions failed
* PR #1180: Adding SET statement for configuration management
* PR #1198: Chroma vector store
* PR #1162: Adding support for Sklearn linear regression in EvaDB
* PR #1196: bug: fix links in model training notebook
* PR #1167: Model training Colab Notebook
* PR #1185: Add support for function calls without input arguments: `FUNC
* PR #1192: Skip pinecone test if pinecone is not installed
* PR #1191: fix: unclosed file error
* PR #1190: fix: release change log
* PR #1153: feat: integrate with pgvector
* PR #1135: Pinecone vector store
* PR #1165: bug: drop bug fix
* PR #1168: LLM app fixes
* PR #1182: Bump Version to v0.3.7+dev
* PR #1181: releass: bump a version further to skip cached wheel

## [0.3.6] - 2023-09-21

* PR #1181: releass: bump a version further to skip cached wheel
* PR #1179: fix: release change log
* PR #1177: fix: release action
* PR #1176: fix: release
* PR #1174: fix: trigger build
* PR #1173: fix: release on success only and credential to pass protection rule
* PR #1172: Release v0.3.5
* PR #1169: fix: hotfix for the failing staging build
* PR #1159: Load query exception for invalid file format
* PR #1164: Fix model train with Ludwig on Colab
* PR #1155: feat: add support for enabling ORDER BY on non projected columns
* PR #1158: Adding Algolia search to Eva-DB Docs
* PR #1146: `CREATE OR REPLACE FUNCTION`
* PR #1154: feat: add github actions to sync and release wheel
* PR #1123: Updates evadb apps
* PR #1157: chore
* PR #977: chore
* PR #1137: feat: add support for if not exists in create db
* PR #1132: docs: add home sale forecast usecase into documentation
* PR #1136: feat: support order by using the projection columns
* PR #1030: Add model inference documentation
* PR #1134: ci: staging build fix
* PR #1124: fix: third-party test
* PR #1118: Add a model forecasting notebook in tutorials
* PR #1125: feat: create table in integration table from evadb select query
* PR #1122: fix: flaky ci unit tests
* PR #1113: fix: update docs and notebooks
* PR #1114: feat: Improve db integration
* PR #1108: Set the right output column type for forecast functions
* PR #1107: Added null handling and tests for case insensitive string matching
* PR #1087: Support `SELECT expr;` which does not require `FROM table`
* PR #1090: Making `Ludwig` and `HuggingFace` case insensitive
* PR #1027: Adding support for MariaDb as backend for EvaDB
* PR #1101: Fix forecasting integration test
* PR #1094: Fixes date and frequency issues in forecasting
* PR #1096: Rename l_plan and p_plan
* PR #1091: fix: evadb is now consistent with lowercase
* PR #1092: feat: Drop database
* PR #1082: feat: create index if exists
* PR #1088: fix: df merging issues when multiple predicates
* PR #1086: Update parameters documentation for forecast
* PR #1084: Fix column name related issue for Forecast functions
* PR #1073: fix: create index from single document
* PR #1080: `pandas.DataFrame.fillna` is deprecated
* PR #1060: Bump v0.3.5+dev
* PR #1062: Update UDF to function in model-forecasting.rst

## [0.3.4] - 2023-09-06

* PR #1057: fix: staging build fix
36 changes: 17 additions & 19 deletions README.md
@@ -4,7 +4,7 @@
</a>
</p>

<p align="center"><i><b>Bring AI inside your database system and build AI-powered apps</b></i></p>
<p align="center"><i><b>Database system for AI-powered apps</b></i></p>

<p align="center">
<a href="https://github.com/georgia-tech-db/evadb/fork" target="blank">
@@ -79,7 +79,7 @@ EvaDB enables software developers to build AI apps in a few lines of code. Its p

👋 Hey! If you're excited about our vision of bringing AI inside database systems, show some ❤️ by:
<ul>
<li> 🐙 giving a ⭐ on our <a href="https://github.com/georgia-tech-db/evadb">EvaDB repo on Github</a>
<li> ⭐ starring our <a href="https://github.com/georgia-tech-db/evadb">GitHub 🐙 Repo</a>
<li> 📟 joining our <a href="https://evadb.ai/community">Slack Community</a>
<li> 🐦 following us on <a href="https://twitter.com/evadb_ai">Twitter</a>
<li> 📝 following us on <a href="https://medium.com/evadb-blog">Medium</a>
@@ -116,7 +116,7 @@ Our target audience is software developers who may not necessarily have a backgr

<details>
<ul>
<li>Connect EvaDB to your database system with the `CREATE DATABASE` statement.</li>
<li>Connect EvaDB to your SQL and vector database systems with the <a href="https://evadb.readthedocs.io/en/stable/source/reference/databases/postgres.html">`CREATE DATABASE`</a> and <a href="https://evadb.readthedocs.io/en/stable/source/reference/evaql/create.html#create-index">`CREATE INDEX`</a> statements.</li>
<li>Write SQL queries with AI functions to get inference results:</li>
<ul>
<li>Pick a pre-trained AI model from Hugging Face, Open AI, Ultralytics, PyTorch, and built-in AI frameworks for generative AI, NLP, and vision applications;</li>
@@ -130,23 +130,26 @@ Follow the [getting started](https://evadb.readthedocs.io/en/stable/source/overv

## Illustrative Queries

* Run the MNIST Image Classification model to obtain digit labels for each frame in the video.
* Get insights about Github stargazers using GPT4.

```sql
SELECT MnistImageClassifier(data).label FROM mnist_video;
SELECT name, country, email, programming_languages, social_media, GPT4(prompt,topics_of_interest)
FROM gpt4all_StargazerInsights;

--- Prompt to GPT-4
You are given 10 rows of input, each row is separated by two new line characters.
Categorize the topics listed in each row into one or more of the following 3 technical areas - Machine Learning, Databases, and Web development. If the topics listed are not related to any of these 3 areas, output a single N/A. Do not miss any input row. Do not add any additional text or numbers to your output.
The output rows must be separated by two new line characters. Each input row must generate exactly one output row. For example, the input row [Recommendation systems, Deep neural networks, Postgres] must generate only the output row [Machine Learning, Databases].
The input row [entrepreneurship, startups, venture capital] must generate the output row N/A.
```

* Build a vector index on the feature embeddings returned by the SIFT Feature Extractor on a collection of Reddit images.
* Build a vector index on the feature embeddings returned by the SIFT Feature Extractor on a collection of Reddit images. Return the top-5 similar images for a given image.

```sql
CREATE INDEX reddit_sift_image_index
ON reddit_dataset (SiftFeatureExtractor(data))
USING FAISS
```

* Retrieve the top-5 most similar images for the given image using the index.

```sql
SELECT name FROM reddit_dataset ORDER BY
Similarity(
SiftFeatureExtractor(Open('reddit-images/g1074_d4mxztt.jpg')),
@@ -171,25 +174,20 @@ Here are some illustrative AI apps built using EvaDB (each notebook can be opene

<details>

* Store the text returned by a Speech Recognition model on the audio component of a video in a table.
* Get a transcript from a video stored in a table using a Speech Recognition model. Then, ask questions on the extracted transcript using ChatGPT.

```sql
CREATE TABLE text_summary AS
SELECT SpeechRecognizer(audio) FROM ukraine_video;
```

* Run ChatGPT on the `text` column in a table.

```sql
SELECT ChatGPT('Is this video summary related to Ukraine russia war', text)
FROM text_summary;
```

* Train an ML model using the <a href="https://ludwig.ai/latest/">Ludwig AI</a> engine to predict a column in a table.
* Train a classic ML model for prediction using the <a href="https://ludwig.ai/latest/">Ludwig AI</a> engine.

```sql
CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM
( SELECT * FROM HomeRentals )
(SELECT * FROM HomeRentals)
TYPE Ludwig
PREDICT 'rental_price'
TIME_LIMIT 120;
@@ -202,7 +200,7 @@
<details>
EvaDB's AI-centric query optimizer takes a query as input and generates a query plan. The query engine takes the query plan and hits the relevant backends to efficiently process the query:
1. SQL Database Systems (Structured Data)
2. AI Frameworks (Transform Unstructured Data to Structured Data, Unstructured data includes PDFs, images, podcasts, etc. stored on cloud buckets or local filesystem)
2. AI Frameworks (Transform Unstructured Data to Structured Data; Unstructured data includes PDFs, text, images, etc. stored locally or on the cloud)
3. Vector Database Systems (Feature Embeddings)

<p align="center">
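The README's EvaQL statements can also be driven from EvaDB's Python cursor API described in its getting-started guide. A minimal sketch, assuming the `text_summary` table from the example above has been created and OpenAI credentials are configured for `ChatGPT`:

```python
# Minimal sketch using EvaDB's Python cursor API; assumes the
# text_summary table from the README example exists and ChatGPT's
# OpenAI credentials are configured.
import evadb

cursor = evadb.connect().cursor()
df = cursor.query(
    "SELECT ChatGPT('Is this video summary related to Ukraine russia war', text) "
    "FROM text_summary;"
).df()
print(df)
```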
28 changes: 12 additions & 16 deletions apps/privategpt/privateGPT.py
@@ -28,30 +28,26 @@ def query(question):
SELECT data
FROM embedding_table
ORDER BY Similarity(embedding('{question}'), features)
ASC LIMIT 3;
LIMIT 5;
"""
).df()

# Merge all context information.
context = "; \n".join(context_docs["embedding_table.data"])
context = "\n".join(context_docs["embedding_table.data"])

# run llm
messages = [
{"role": "user", "content": f"Here is some context:{context}"},
{
"role": "user",
"content": f"Answer this question based on context: {question}",
},
]
llm = GPT4All("ggml-gpt4all-j-v1.3-groovy")
llm.model.set_thread_count(16)
llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
llm.set_thread_count(16)

answer = llm.chat_completion(messages, verbose=False, streaming=False)
message = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
{context}
Question : {question}"""

print("\n> Answer:")
print(answer["choices"][0]["message"]["content"])
print("\n>> Context: ")
print(context)
answer = llm.generate(message)

print("\n> Answer:", answer)


print(
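Because the diff viewer above drops the +/- markers, the old and new lines of `query()` are interleaved. A reconstruction of the function after this change, assembled from the added lines (the `cursor` argument is pulled in here only to keep the sketch self-contained):

```python
# Reconstructed sketch of the updated query() in apps/privategpt/privateGPT.py.
from gpt4all import GPT4All

def query(question, cursor):
    # Fetch the 5 most similar documents (previously ASC LIMIT 3).
    context_docs = cursor.query(
        f"""
        SELECT data
        FROM embedding_table
        ORDER BY Similarity(embedding('{question}'), features)
        LIMIT 5;
        """
    ).df()
    context = "\n".join(context_docs["embedding_table.data"])

    # New gpt4all API: a single prompt string passed to generate(),
    # replacing the old chat_completion() message list.
    llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
    llm.set_thread_count(16)
    message = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
{context}
Question : {question}"""
    answer = llm.generate(message)

    print("\n> Answer:", answer)
```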
13 changes: 13 additions & 0 deletions apps/rest/README.md
@@ -0,0 +1,13 @@
We can now access EvaDB through REST APIs. The two APIs are as follows:

1) Query API
* Through this API, the user can query EvaDB by passing an EvaQL statement in the request.
E.g., http://127.0.0.1:5000/query is the endpoint for submitting queries.

2) Upload API
* Users can upload files of type {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif', 'mp3'} to the EvaDB server.
* This is done through a POST request containing the file in the request body.
* The API also allows the user to upload a document through the "upload" button.
* Uploaded files are stored on the server and can then be accessed through the Query API with a "LOAD" query.
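
A hypothetical client for these two endpoints might look like the sketch below; the `/upload` path and the request field names are assumptions, since the README only names the query endpoint.

```python
# Hypothetical sketch: the /upload path and the "query"/"file" field
# names are assumptions, not confirmed by the README above.
import requests

BASE = "http://127.0.0.1:5000"

# Query API: submit an EvaQL statement.
resp = requests.post(f"{BASE}/query", json={"query": "SHOW FUNCTIONS;"})
print(resp.status_code, resp.text)

# Upload API: POST a file as multipart/form-data.
with open("notes.txt", "rb") as f:
    resp = requests.post(f"{BASE}/upload", files={"file": f})
print(resp.status_code, resp.text)
```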

