Commit
GitHub Data Source Integration (#1233)
- [x] GitHub Data Source Integration
- [x] Batching support for the native storage engine. Batching in the storage engine does not work with LIMIT, so that change is reverted.
- [x] Full NamedUser table support
- [x] Enable CircleCI local PR cache for testmondata
- [x] Native storage engine `read` refactor
- [x] Test cases
- [x] GitHub data source documentation
xzdandy committed Oct 2, 2023
1 parent e8a181c commit 495ce7d
Showing 17 changed files with 464 additions and 59 deletions.
20 changes: 13 additions & 7 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -213,10 +213,11 @@ jobs:
keys:
- v1-model_cache-{{ checksum "setup.py" }}

# Always restore testmondata from staging, python3.10, ray disabled.
# First try restoring the testmondata from the PR, then staging.
- restore_cache:
keys:
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}
- v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}-

- run:
name: Install EvaDB package from GitHub repo with all dependencies
@@ -271,16 +272,21 @@ jobs:
# Collect the testmondata only for long integration tests
- when:
condition:
and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
or:
- equal: [ LONG INTEGRATION CACHE, << parameters.mode >> ]
- and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
steps:
- save_cache:
key: v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-{{ epoch }}
paths:
- .testmondata
- .testmondata-shm
- .testmondata-wal


- save_cache:
key: v1-pip-wheel_cache-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
1 change: 1 addition & 0 deletions docs/_toc.yml
@@ -67,6 +67,7 @@ parts:
- file: source/reference/databases/sqlite
- file: source/reference/databases/mysql
- file: source/reference/databases/mariadb
- file: source/reference/databases/github

- file: source/reference/ai/index
title: AI Engines
57 changes: 57 additions & 0 deletions docs/source/reference/databases/github.rst
@@ -0,0 +1,57 @@
GitHub
======

The connection to GitHub is based on the `PyGithub <https://github.com/PyGithub/PyGithub>`_ library.

Dependency
----------

* PyGithub


Parameters
----------

Required:

* ``owner`` is the owner of the GitHub repository. For example, georgia-tech-db is the owner of the EvaDB repository.
* ``repo`` is the name of the GitHub repository. For example, evadb is the name of the repository.

Optional:

* ``github_token`` is not required for public repositories. However, the rate limit is lower without a valid ``github_token``. Check the `Rate limits page <https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits>`_ to learn how to check your rate limit status, and the `Managing your personal access tokens page <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_ to learn how to create a personal access token.

Create Connection
-----------------

.. code-block:: text

   CREATE DATABASE github_data WITH ENGINE = 'github', PARAMETERS = {
       "owner": "georgia-tech-db",
       "repo": "evadb"
   };

Supported Tables
----------------

* ``stargazers``: Lists the people that have starred the repository. Check ``evadb/third_party/databases/github/table_column_info.py`` for all the available columns in the table.

.. code-block:: sql

   SELECT * FROM github_data.stargazers;

Here is the query output:

.. code-block::

   +---------------------------------------------------+-----+---------------------------------------------+
   | stargazers.avatar_url                             | ... | stargazers.url                              |
   |---------------------------------------------------|-----|---------------------------------------------|
   | https://avatars.githubusercontent.com/u/105357... | ... | https://api.github.com/users/jaehobang      |
   | https://avatars.githubusercontent.com/u/436141... | ... | https://api.github.com/users/VineethAljapur |
   | ...                                               | ... | ...                                         |
   +---------------------------------------------------+-----+---------------------------------------------+

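Standard SQL predicates can be applied on the exposed columns as well. For example, filtering on the ``url`` column shown above (a sketch; whether a given predicate is pushed down to GitHub or evaluated locally depends on the handler):

.. code-block:: sql

   SELECT stargazers.url FROM github_data.stargazers
   WHERE stargazers.url LIKE '%jaehobang%';
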
.. note::

   Looking for another table from GitHub? You can add a table mapping in ``evadb/third_party/databases/github/github_handler.py``, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
5 changes: 4 additions & 1 deletion evadb/binder/statement_binder_context.py
@@ -89,7 +89,10 @@ def add_table_alias(self, alias: str, database_name: str, table_name: str):
db_catalog_entry.engine, **db_catalog_entry.params
) as handler:
# Assemble columns.
column_df = handler.get_columns(table_name).data
response = handler.get_columns(table_name)
if response.error is not None:
raise BinderError(response.error)
column_df = response.data
table_obj = create_table_catalog_entry_for_data_source(
table_name, database_name, column_df
)
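The binder change above stops assuming `get_columns` succeeded and instead checks the response's `error` field before using `data`. A minimal sketch of that response-object pattern (the `HandlerResponse` and `get_columns` names here are illustrative, not EvaDB's exact classes):

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class HandlerResponse:
    data: Optional[List[str]] = None  # column names, for this sketch
    error: Optional[str] = None


def get_columns(table_name: str) -> HandlerResponse:
    # Illustrative stand-in for a real data-source handler call.
    if table_name == "stargazers":
        return HandlerResponse(data=["avatar_url", "url"])
    return HandlerResponse(error=f"unknown table: {table_name}")


response = get_columns("stargazers")
if response.error is not None:  # surface handler failures instead of crashing later
    raise RuntimeError(response.error)
columns = response.data
```

The same check-then-raise shape is what the binder now does with `BinderError`, so a bad table name fails with a clear message at bind time rather than an obscure `None` dereference later.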
2 changes: 1 addition & 1 deletion evadb/catalog/catalog_manager.py
@@ -192,7 +192,7 @@ def check_native_table_exists(self, table_name: str, database_name: str):
resp = handler.get_tables()

if resp.error is not None:
return False
raise Exception(resp.error)

# Check table existence.
table_df = resp.data
59 changes: 27 additions & 32 deletions evadb/storage/native_storage_engine.py
@@ -158,46 +158,41 @@ def write(self, table: TableCatalogEntry, rows: Batch):
logger.exception(err_msg)
raise Exception(err_msg)

def read(self, table: TableCatalogEntry) -> Iterator[Batch]:
def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
) -> Iterator[Batch]:
try:
db_catalog_entry = self._get_database_catalog_entry(table.database_name)
with get_database_handler(
db_catalog_entry.engine, **db_catalog_entry.params
) as handler:
uri = handler.get_sqlalchmey_uri()

# Create a metadata object
engine = create_engine(uri)
metadata = MetaData()

Session = sessionmaker(bind=engine)
session = Session()
# Retrieve the SQLAlchemy table object for the existing table
table_to_read = Table(table.name, metadata, autoload_with=engine)
result = session.execute(table_to_read.select()).fetchall()
data_batch = []

# Ensure that the order of columns in the select is same as in table.columns
# Also verify if the column names are consistent
if result:
cols = result[0]._fields
index_dict = {
element.lower(): index for index, element in enumerate(cols)
}
try:
ordered_columns = sorted(
table.columns, key=lambda x: index_dict[x.name.lower()]
handler_response = handler.select(table.name)
# we prefer the generator/iterator when available
result = []
if handler_response.data_generator:
result = handler_response.data_generator
elif handler_response.data:
result = handler_response.data

if handler.is_sqlalchmey_compatible():
# For sql data source, we can deserialize sql rows into numpy array
cols = result[0]._fields
index_dict = {
element.lower(): index for index, element in enumerate(cols)
}
try:
ordered_columns = sorted(
table.columns, key=lambda x: index_dict[x.name.lower()]
)
except KeyError as e:
raise Exception(f"Column mismatch with error {e}")
result = (
_deserialize_sql_row(row, ordered_columns) for row in result
)
except KeyError as e:
raise Exception(f"Column mismatch with error {e}")

for row in result:
data_batch.append(_deserialize_sql_row(row, ordered_columns))
for data_batch in result:
yield Batch(pd.DataFrame([data_batch]))

if data_batch:
yield Batch(pd.DataFrame(data_batch))

session.close()
except Exception as e:
err_msg = f"Failed to read the table {table.name} in data source {table.database_name} with exception {str(e)}"
logger.exception(err_msg)
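The refactored `read` above delegates to `handler.select` and prefers a streaming `data_generator` over a fully materialized `data` list. A minimal sketch of that selection logic, with an illustrative `Response` class standing in for EvaDB's handler response:

```python
from typing import Dict, Iterable, Iterator, List, Optional


class Response:
    """Illustrative stand-in for a data-source handler response."""

    def __init__(
        self,
        data: Optional[List[Dict]] = None,
        data_generator: Optional[Iterable[Dict]] = None,
    ):
        self.data = data
        self.data_generator = data_generator


def read_rows(response: Response) -> Iterator[Dict]:
    # Prefer the generator so large tables stream row by row
    # instead of being materialized in memory first.
    result: Iterable[Dict] = []
    if response.data_generator is not None:
        result = response.data_generator
    elif response.data is not None:
        result = response.data
    for row in result:
        yield row


streamed = list(read_rows(Response(data_generator=(r for r in [{"id": 1}, {"id": 2}]))))
materialized = list(read_rows(Response(data=[{"id": 3}])))
```

Generator-first reading is what makes the GitHub handler practical: a paginated API result can be yielded page by page without holding the whole table in memory.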
21 changes: 5 additions & 16 deletions evadb/storage/sqlite_storage_engine.py
@@ -29,7 +29,7 @@
from evadb.models.storage.batch import Batch
from evadb.parser.table_ref import TableInfo
from evadb.storage.abstract_storage_engine import AbstractStorageEngine
from evadb.utils.generic_utils import PickleSerializer, get_size
from evadb.utils.generic_utils import PickleSerializer
from evadb.utils.logging_manager import logger

# Leveraging Dynamic schema in SQLAlchemy
@@ -189,23 +189,12 @@ def read(
try:
table_to_read = self._try_loading_table_via_reflection(table.name)
result = self._sql_session.execute(table_to_read.select()).fetchall()
data_batch = []
row_size = None
for row in result:
# For table read, we provide row_id so that user can also retrieve
# row_id from the table.
data_batch.append(
self._deserialize_sql_row(row._asdict(), table.columns)
yield Batch(
pd.DataFrame(
[self._deserialize_sql_row(row._asdict(), table.columns)]
)
)
if row_size is None:
row_size = 0
row_size = get_size(data_batch)
if len(data_batch) * row_size >= batch_mem_size:
yield Batch(pd.DataFrame(data_batch))
data_batch = []
if data_batch:
yield Batch(pd.DataFrame(data_batch))

except Exception as e:
err_msg = f"Failed to read the table {table.name} with exception {str(e)}"
logger.exception(err_msg)
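The simplified SQLite `read` above drops the `batch_mem_size` accumulation logic and instead yields one single-row `Batch` per source row. Stripped of EvaDB's classes, the new control flow looks roughly like this sketch:

```python
from typing import Dict, Iterator, List

import pandas as pd


def read(rows: List[Dict]) -> Iterator[pd.DataFrame]:
    # One single-row DataFrame per source row; no size-based accumulation.
    for row in rows:
        yield pd.DataFrame([row])


batches = list(read([{"_row_id": 0, "name": "a"}, {"_row_id": 1, "name": "b"}]))
```

This trades fewer, larger batches for simpler code and earlier first-row delivery; downstream operators see the same rows either way.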
15 changes: 15 additions & 0 deletions evadb/third_party/databases/github/__init__.py
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""github integration"""