From 132438f29aa6eac2c362e0a035740f68a6dea807 Mon Sep 17 00:00:00 2001 From: Jineet Desai Date: Tue, 24 Oct 2023 23:52:29 -0400 Subject: [PATCH] Adding colab notebook for XGBoost Regression and Classification --- tutorials/19-XGBoost-prediction.ipynb | 2829 +++++++++++++++++++++++++ 1 file changed, 2829 insertions(+) create mode 100644 tutorials/19-XGBoost-prediction.ipynb diff --git a/tutorials/19-XGBoost-prediction.ipynb b/tutorials/19-XGBoost-prediction.ipynb new file mode 100644 index 0000000000..e82b8440a1 --- /dev/null +++ b/tutorials/19-XGBoost-prediction.ipynb @@ -0,0 +1,2829 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Training Prediction Models Directly Within PostgreSQL Using XGBoost EvaDB\n", + "In this tutorial, we'll harness EvaDB's model training capabilities to predict home rental prices, showcasing how EvaDB seamlessly integrates AI into your PostgreSQL database.\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " Run on Google Colab\n", + " \n", + " View source on GitHub\n", + " \n", + " Download notebook\n", + "


" + ], + "metadata": { + "id": "4o38TEFPWmZZ" + } + }, + { + "cell_type": "code", + "source": [ + "!apt -qq install postgresql\n", + "!service postgresql start" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0LgHQ_6J7sPs", + "outputId": "476e56e2-f79f-4baf-b9fb-bbec8c6c6a8a" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "postgresql is already the newest version (14+238).\n", + "0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.\n", + " * Starting PostgreSQL 14 database server\n", + " ...done.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup" + ], + "metadata": { + "id": "nsyZe8PmZYZ7" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install and Launch the PostgreSQL Server\n", + "\n", + "To kick things off, we'll start by setting up the PostgreSQL database backend. If you already have a PostgreSQL server up and running, you can skip this step and proceed directly to [installing EvaDB](#install-evadb)." 
+ ], + "metadata": { + "id": "i1sYj0bRZf6m" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Create User and Database" + ], + "metadata": { + "id": "FORYt3tWZpM1" + } + }, + { + "cell_type": "code", + "source": [ + "!sudo -u postgres psql -c \"CREATE USER eva WITH SUPERUSER PASSWORD 'password'\"\n", + "!sudo -u postgres psql -c \"CREATE DATABASE evadb\"" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ngJzr2B77zF2", + "outputId": "572c6239-1c5e-4f9d-b1bf-cf6e557d5c3b" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ERROR: role \"eva\" already exists\n", + "ERROR: database \"evadb\" already exists\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Prettify Output" + ], + "metadata": { + "id": "uQh_rSIGZvNv" + } + }, + { + "cell_type": "code", + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from IPython.core.display import display, HTML\n", + "def pretty_print(df):\n", + " return display(HTML( df.to_html().replace(\"\\\\n\",\"
\")))" + ], + "metadata": { + "id": "T8sDqf9870dh" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Installing EvaDB and XGBoost dependencies\n", + "\n", + "We install EvaDB along with the necessary PostgreSQL and XGBoost dependencies." + ], + "metadata": { + "id": "cgkNFXEnZ1jI" + } + }, + { + "cell_type": "code", + "source": [ + "%pip install --quiet \"evadb[postgres,xgboost]\"\n", + "\n", + "import evadb\n", + "cursor = evadb.connect().cursor()" + ], + "metadata": { + "id": "Kf2xgkvr76bZ" + }, + "execution_count": 39, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Load data into PostgreSQL" + ], + "metadata": { + "id": "gtzlHzn0aC3F" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Setting up a Data Source in EvaDB\n", + "To establish a direct connection between EvaDB and underlying database systems such as PostgreSQL, we will create a data source. This process entails supplying EvaDB with the connection credentials for the active PostgreSQL server." 
+ ], + "metadata": { + "id": "eHUbzlvXaZIC" + } + }, + { + "cell_type": "code", + "source": [ + "params = {\n", + " \"user\": \"eva\",\n", + " \"password\": \"password\",\n", + " \"host\": \"localhost\",\n", + " \"port\": \"5432\",\n", + " \"database\": \"evadb\",\n", + "}\n", + "query = f\"CREATE DATABASE postgres_data WITH ENGINE = 'postgres', PARAMETERS = {params};\"\n", + "cursor.query(query).df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 640 + }, + "id": "sE_Q8-mM795g", + "outputId": "8271de29-2cd6-4d33-ea5f-eaea9c12e28b" + }, + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "10-25-2023 03:09:30 ERROR [plan_executor:plan_executor.py:execute_plan:0179] postgres_data already exists.\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.10/dist-packages/evadb/executor/plan_executor.py\", line 175, in execute_plan\n", + " yield from output\n", + " File \"/usr/local/lib/python3.10/dist-packages/evadb/executor/create_database_executor.py\", line 42, in exec\n", + " raise ExecutorError(f\"{self.node.database_name} already exists.\")\n", + "evadb.executor.executor_utils.ExecutorError: postgres_data already exists.\n", + "ERROR:evadb.utils.logging_manager:postgres_data already exists.\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.10/dist-packages/evadb/executor/plan_executor.py\", line 175, in execute_plan\n", + " yield from output\n", + " File \"/usr/local/lib/python3.10/dist-packages/evadb/executor/create_database_executor.py\", line 42, in exec\n", + " raise ExecutorError(f\"{self.node.database_name} already exists.\")\n", + "evadb.executor.executor_utils.ExecutorError: postgres_data already exists.\n" + ] + }, + { + "output_type": "error", + "ename": "ExecutorError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 
+ "\u001b[0;31mExecutorError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/executor/plan_executor.py\u001b[0m in \u001b[0;36mexecute_plan\u001b[0;34m(self, do_not_raise_exceptions, do_not_print_exceptions)\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/executor/create_database_executor.py\u001b[0m in \u001b[0;36mexec\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutorError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{self.node.database_name} already exists.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mExecutorError\u001b[0m: postgres_data already exists.", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mExecutorError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m }\n\u001b[1;32m 8\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"CREATE DATABASE postgres_data WITH ENGINE = 'postgres', PARAMETERS = 
{params};\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mcursor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/interfaces/relational/relation.py\u001b[0m in \u001b[0;36mdf\u001b[0;34m(self, drop_alias)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;36m5\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 122\u001b[0m \"\"\"\n\u001b[0;32m--> 123\u001b[0;31m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop_alias\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdrop_alias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 124\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframes\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"relation execute failed\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/interfaces/relational/relation.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self, drop_alias)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;34m>>\u001b[0m\u001b[0;34m>\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcursor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"SELECT * FROM 
MyTable;\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m \"\"\"\n\u001b[0;32m--> 141\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexecute_statement\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evadb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_query_node\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;31m# TODO: this is a dirty implementation. Ideally this should be done in the final projection.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdrop_alias\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/server/command_handler.py\u001b[0m in \u001b[0;36mexecute_statement\u001b[0;34m(evadb, stmt, do_not_raise_exceptions, do_not_print_exceptions, **kwargs)\u001b[0m\n\u001b[1;32m 51\u001b[0m )\n\u001b[1;32m 52\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m \u001b[0mbatch_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mBatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_list\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/evadb/executor/plan_executor.py\u001b[0m in \u001b[0;36mexecute_plan\u001b[0;34m(self, do_not_raise_exceptions, do_not_print_exceptions)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdo_not_print_exceptions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutorError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mExecutorError\u001b[0m: postgres_data already exists." + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Home Property Sales Data from CSV into PostgreSQL\n", + "\n", + "In this step, we will import the [House Property Sales](https://www.kaggle.com/datasets/htagholdings/property-sales?resource=download) dataset into our PostgreSQL database. If you already have the data stored in PostgreSQL and are ready to proceed with the prediction model training, feel free to skip this section and head directly to the [model training process](#train-the-prediction-model)." 
+ ], + "metadata": { + "id": "o0fkC5Z2ahK9" + } + }, + { + "cell_type": "code", + "source": [ + "!mkdir -p content\n", + "!wget -nc -O /content/home_rentals.csv https://www.dropbox.com/scl/fi/gy2682i66a8l2tqsowm5x/home_rentals.csv?rlkey=e080k02rv5205h4ullfjdr8lw&raw=1" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iVJW8pPd8ERb", + "outputId": "1cd15039-0d1a-4124-f4b7-ee450d7530c6" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "File ‘/content/home_rentals.csv’ already there; not retrieving.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " USE postgres_data {\n", + " CREATE TABLE IF NOT EXISTS home_rentals (\n", + " number_of_rooms INT,\n", + " number_of_bathrooms INT,\n", + " sqft INT,\n", + " location VARCHAR(128),\n", + " days_on_market INT,\n", + " initial_price INT,\n", + " neighborhood VARCHAR(128),\n", + " rental_price FLOAT\n", + " )\n", + " }\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "Gcjhxs8a8I1V", + "outputId": "e16b0410-5f9c-4b90-e5df-ab58b88813a1" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " status\n", + "0 success" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " USE postgres_data {\n", + " COPY home_rentals(number_of_rooms, number_of_bathrooms, sqft, location, days_on_market, initial_price, neighborhood, rental_price)\n", + " FROM '/content/home_rentals.csv'\n", + " DELIMITER ',' CSV HEADER\n", + " }\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "SK6b1xdP9kIR", + "outputId": "b8d0705f-9ebe-43b5-f068-729dd869ce16" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " status\n", + "0 success" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Preview the Data" + ], + "metadata": { + "id": "Mju7NS_sarR1" + } + }, + { + "cell_type": "markdown", + "source": [ + "Within the home_rentals table, there are 8 columns at our disposal. Our objective is to utilize the remaining 7 columns to make predictions for the rental_price." + ], + "metadata": { + "id": "jA1sc4eQaxzA" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"SELECT * FROM postgres_data.home_rentals LIMIT 3;\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + }, + "id": "TRzw1K-b8qSO", + "outputId": "f72fa1a1-cc19-4eae-bcbf-f11fdcb457a6" + }, + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " rental_price number_of_bathrooms sqft initial_price number_of_rooms \\\n", + "0 2167.0 1 674 2167 1 \n", + "1 1883.0 1 554 1883 1 \n", + "2 2431.0 1 529 2431 0 \n", + "\n", + " days_on_market location neighborhood \n", + "0 1 good downtown \n", + "1 19 poor westbrae \n", + "2 3 great south_side " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_pricenumber_of_bathroomssqftinitial_pricenumber_of_roomsdays_on_marketlocationneighborhood
02167.01674216711gooddowntown
11883.015541883119poorwestbrae
22431.01529243103greatsouth_side
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Training Model\n", + "\n", + "Next, we employ EvaDB to facilitate the training of an ML model, which will enable us to predict `home rental prices`." + ], + "metadata": { + "id": "aQsTNqyha5Yi" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Train the prediction Model\n", + "For this purpose, we harness the capabilities of the [xgboost](https://xgboost.readthedocs.io/en/stable/) engine to train our prediction model. We employ the `Flaml` feature to automatically determine the optimal hyperparameters. Keep in mind that `TIME_LIMIT` specifies the time budget allocated for the training process. `METRIC` specifies the training error or accuracy you want to optimize on while training. `TASK` specifies whether you aim to perform classification or regression. In this example we shall use regression to predict home rental price." + ], + "metadata": { + "id": "haHlsnpva-Xt" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " CREATE OR REPLACE FUNCTION PredictHouseRent FROM\n", + " ( SELECT * FROM postgres_data.home_rentals )\n", + " TYPE Xgboost\n", + " PREDICT 'rental_price'\n", + " METRIC 'rmse'\n", + " TASK 'regression'\n", + " TIME_LIMIT 180;\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QjXDii0Q9oDi", + "outputId": "c82d1fa3-4179-4783-c6f8-5d32433f6e28" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[flaml.automl.logger: 10-25 03:09:57] {1679} INFO - task = regression\n", + "[flaml.automl.logger: 10-25 03:09:57] {1690} INFO - Evaluation method: cv\n", + "[flaml.automl.logger: 10-25 03:09:57] {1788} INFO - Minimizing error metric: rmse\n", + "[flaml.automl.logger: 10-25 03:09:57] {1900} INFO - List of ML learners in AutoML Run: ['xgboost']\n", + "[flaml.automl.logger: 
10-25 03:09:57] {2218} INFO - iteration 0, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:57] {2344} INFO - Estimated sufficient time budget=1949s. Estimated necessary time budget=2s.\n", + "[flaml.automl.logger: 10-25 03:09:57] {2391} INFO - at 0.3s,\testimator xgboost's best error=873.9536,\tbest estimator xgboost's best error=873.9536\n", + "[flaml.automl.logger: 10-25 03:09:57] {2218} INFO - iteration 1, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:57] {2391} INFO - at 0.5s,\testimator xgboost's best error=873.9536,\tbest estimator xgboost's best error=873.9536\n", + "[flaml.automl.logger: 10-25 03:09:57] {2218} INFO - iteration 2, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:57] {2391} INFO - at 0.7s,\testimator xgboost's best error=442.5653,\tbest estimator xgboost's best error=442.5653\n", + "[flaml.automl.logger: 10-25 03:09:57] {2218} INFO - iteration 3, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:57] {2391} INFO - at 0.9s,\testimator xgboost's best error=169.0038,\tbest estimator xgboost's best error=169.0038\n", + "[flaml.automl.logger: 10-25 03:09:57] {2218} INFO - iteration 4, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:58] {2391} INFO - at 1.1s,\testimator xgboost's best error=169.0038,\tbest estimator xgboost's best error=169.0038\n", + "[flaml.automl.logger: 10-25 03:09:58] {2218} INFO - iteration 5, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:58] {2391} INFO - at 1.3s,\testimator xgboost's best error=169.0038,\tbest estimator xgboost's best error=169.0038\n", + "[flaml.automl.logger: 10-25 03:09:58] {2218} INFO - iteration 6, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:58] {2391} INFO - at 1.5s,\testimator xgboost's best error=90.2588,\tbest estimator xgboost's best error=90.2588\n", + "[flaml.automl.logger: 10-25 03:09:58] {2218} INFO - iteration 7, current learner xgboost\n", + "[flaml.automl.logger: 10-25 
03:09:58] {2391} INFO - at 1.7s,\testimator xgboost's best error=90.2588,\tbest estimator xgboost's best error=90.2588\n", + "[flaml.automl.logger: 10-25 03:09:58] {2218} INFO - iteration 8, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:58] {2391} INFO - at 2.0s,\testimator xgboost's best error=90.2588,\tbest estimator xgboost's best error=90.2588\n", + "[flaml.automl.logger: 10-25 03:09:58] {2218} INFO - iteration 9, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:59] {2391} INFO - at 2.2s,\testimator xgboost's best error=37.7698,\tbest estimator xgboost's best error=37.7698\n", + "[flaml.automl.logger: 10-25 03:09:59] {2218} INFO - iteration 10, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:59] {2391} INFO - at 2.5s,\testimator xgboost's best error=37.7698,\tbest estimator xgboost's best error=37.7698\n", + "[flaml.automl.logger: 10-25 03:09:59] {2218} INFO - iteration 11, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:59] {2391} INFO - at 2.7s,\testimator xgboost's best error=37.7698,\tbest estimator xgboost's best error=37.7698\n", + "[flaml.automl.logger: 10-25 03:09:59] {2218} INFO - iteration 12, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:09:59] {2391} INFO - at 3.0s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:09:59] {2218} INFO - iteration 13, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:00] {2391} INFO - at 3.2s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:10:00] {2218} INFO - iteration 14, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:00] {2391} INFO - at 3.5s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:10:00] {2218} INFO - iteration 15, current learner xgboost\n", + "[flaml.automl.logger: 
10-25 03:10:00] {2391} INFO - at 3.7s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:10:00] {2218} INFO - iteration 16, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:00] {2391} INFO - at 4.0s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:10:00] {2218} INFO - iteration 17, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:01] {2391} INFO - at 4.2s,\testimator xgboost's best error=22.4973,\tbest estimator xgboost's best error=22.4973\n", + "[flaml.automl.logger: 10-25 03:10:01] {2218} INFO - iteration 18, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:01] {2391} INFO - at 4.5s,\testimator xgboost's best error=14.6917,\tbest estimator xgboost's best error=14.6917\n", + "[flaml.automl.logger: 10-25 03:10:01] {2218} INFO - iteration 19, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:01] {2391} INFO - at 4.7s,\testimator xgboost's best error=14.6917,\tbest estimator xgboost's best error=14.6917\n", + "[flaml.automl.logger: 10-25 03:10:01] {2218} INFO - iteration 20, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:02] {2391} INFO - at 5.9s,\testimator xgboost's best error=11.9246,\tbest estimator xgboost's best error=11.9246\n", + "[flaml.automl.logger: 10-25 03:10:02] {2218} INFO - iteration 21, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:03] {2391} INFO - at 6.8s,\testimator xgboost's best error=11.6165,\tbest estimator xgboost's best error=11.6165\n", + "[flaml.automl.logger: 10-25 03:10:03] {2218} INFO - iteration 22, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:04] {2391} INFO - at 8.0s,\testimator xgboost's best error=11.6165,\tbest estimator xgboost's best error=11.6165\n", + "[flaml.automl.logger: 10-25 03:10:04] {2218} INFO - iteration 23, current learner xgboost\n", + 
"[flaml.automl.logger: 10-25 03:10:05] {2391} INFO - at 8.6s,\testimator xgboost's best error=11.6165,\tbest estimator xgboost's best error=11.6165\n", + "[flaml.automl.logger: 10-25 03:10:05] {2218} INFO - iteration 24, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:06] {2391} INFO - at 9.8s,\testimator xgboost's best error=11.6165,\tbest estimator xgboost's best error=11.6165\n", + "[flaml.automl.logger: 10-25 03:10:06] {2218} INFO - iteration 25, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:07] {2391} INFO - at 10.1s,\testimator xgboost's best error=11.6165,\tbest estimator xgboost's best error=11.6165\n", + "[flaml.automl.logger: 10-25 03:10:07] {2218} INFO - iteration 26, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:09] {2391} INFO - at 12.6s,\testimator xgboost's best error=9.9683,\tbest estimator xgboost's best error=9.9683\n", + "[flaml.automl.logger: 10-25 03:10:09] {2218} INFO - iteration 27, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:13] {2391} INFO - at 16.5s,\testimator xgboost's best error=8.1435,\tbest estimator xgboost's best error=8.1435\n", + "[flaml.automl.logger: 10-25 03:10:13] {2218} INFO - iteration 28, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:16] {2391} INFO - at 19.5s,\testimator xgboost's best error=8.1435,\tbest estimator xgboost's best error=8.1435\n", + "[flaml.automl.logger: 10-25 03:10:16] {2218} INFO - iteration 29, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:17] {2391} INFO - at 20.1s,\testimator xgboost's best error=8.1435,\tbest estimator xgboost's best error=8.1435\n", + "[flaml.automl.logger: 10-25 03:10:17] {2218} INFO - iteration 30, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:29] {2391} INFO - at 32.6s,\testimator xgboost's best error=7.4456,\tbest estimator xgboost's best error=7.4456\n", + "[flaml.automl.logger: 10-25 03:10:29] {2218} INFO - iteration 31, current learner 
xgboost\n", + "[flaml.automl.logger: 10-25 03:10:36] {2391} INFO - at 39.1s,\testimator xgboost's best error=7.4456,\tbest estimator xgboost's best error=7.4456\n", + "[flaml.automl.logger: 10-25 03:10:36] {2218} INFO - iteration 32, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:43] {2391} INFO - at 46.8s,\testimator xgboost's best error=7.4456,\tbest estimator xgboost's best error=7.4456\n", + "[flaml.automl.logger: 10-25 03:10:43] {2218} INFO - iteration 33, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:10:55] {2391} INFO - at 58.4s,\testimator xgboost's best error=7.4456,\tbest estimator xgboost's best error=7.4456\n", + "[flaml.automl.logger: 10-25 03:10:55] {2218} INFO - iteration 34, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:11:03] {2391} INFO - at 66.3s,\testimator xgboost's best error=5.6694,\tbest estimator xgboost's best error=5.6694\n", + "[flaml.automl.logger: 10-25 03:11:03] {2218} INFO - iteration 35, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:11:47] {2391} INFO - at 110.2s,\testimator xgboost's best error=5.1787,\tbest estimator xgboost's best error=5.1787\n", + "[flaml.automl.logger: 10-25 03:11:47] {2218} INFO - iteration 36, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:11:55] {2391} INFO - at 118.2s,\testimator xgboost's best error=5.1787,\tbest estimator xgboost's best error=5.1787\n", + "[flaml.automl.logger: 10-25 03:11:55] {2218} INFO - iteration 37, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:00] {2391} INFO - at 183.9s,\testimator xgboost's best error=4.5479,\tbest estimator xgboost's best error=4.5479\n", + "[flaml.automl.logger: 10-25 03:13:15] {2627} INFO - retrain xgboost for 14.3s\n", + "[flaml.automl.logger: 10-25 03:13:15] {2630} INFO - retrained model: XGBRegressor(base_score=None, booster=None, callbacks=[],\n", + " colsample_bylevel=0.9275689307086866, colsample_bynode=None,\n", + " colsample_bytree=0.9639360747179713, 
device=None,\n", + " early_stopping_rounds=None, enable_categorical=False,\n", + " eval_metric=None, feature_types=None, gamma=None,\n", + " grow_policy='lossguide', importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.15910240171256276,\n", + " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=0, max_leaves=460,\n", + " min_child_weight=2.1211909508723865, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=1264,\n", + " n_jobs=-1, num_parallel_tree=None, random_state=None, ...)\n", + "[flaml.automl.logger: 10-25 03:13:15] {1930} INFO - fit succeeded\n", + "[flaml.automl.logger: 10-25 03:13:15] {1931} INFO - Time taken to find the best model: 183.9464476108551\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0\n", + "0 Function PredictHouseRent overwritten." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0Function PredictHouseRent overwritten.
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Example training query using the `R2` metric" + ], + "metadata": { + "id": "HvBfd4wdeJTj" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " CREATE OR REPLACE FUNCTION PredictHouseRent FROM\n", + " ( SELECT * FROM postgres_data.home_rentals )\n", + " TYPE Xgboost\n", + " PREDICT 'rental_price'\n", + " METRIC 'r2'\n", + " TASK 'regression'\n", + " TIME_LIMIT 120;\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "hX2E1263eDgh", + "outputId": "05c06a93-38ae-4b56-c55e-8f3c16b34dbd" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[flaml.automl.logger: 10-25 03:13:21] {1679} INFO - task = regression\n", + "[flaml.automl.logger: 10-25 03:13:21] {1690} INFO - Evaluation method: cv\n", + "[flaml.automl.logger: 10-25 03:13:21] {1788} INFO - Minimizing error metric: 1-r2\n", + "[flaml.automl.logger: 10-25 03:13:21] {1900} INFO - List of ML learners in AutoML Run: ['xgboost']\n", + "[flaml.automl.logger: 10-25 03:13:21] {2218} INFO - iteration 0, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:21] {2344} INFO - Estimated sufficient time budget=1994s. 
Estimated necessary time budget=2s.\n", + "[flaml.automl.logger: 10-25 03:13:21] {2391} INFO - at 0.3s,\testimator xgboost's best error=0.4579,\tbest estimator xgboost's best error=0.4579\n", + "[flaml.automl.logger: 10-25 03:13:21] {2218} INFO - iteration 1, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:22] {2391} INFO - at 0.5s,\testimator xgboost's best error=0.4579,\tbest estimator xgboost's best error=0.4579\n", + "[flaml.automl.logger: 10-25 03:13:22] {2218} INFO - iteration 2, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:22] {2391} INFO - at 0.7s,\testimator xgboost's best error=0.1174,\tbest estimator xgboost's best error=0.1174\n", + "[flaml.automl.logger: 10-25 03:13:22] {2218} INFO - iteration 3, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:22] {2391} INFO - at 0.9s,\testimator xgboost's best error=0.0171,\tbest estimator xgboost's best error=0.0171\n", + "[flaml.automl.logger: 10-25 03:13:22] {2218} INFO - iteration 4, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:22] {2391} INFO - at 1.1s,\testimator xgboost's best error=0.0171,\tbest estimator xgboost's best error=0.0171\n", + "[flaml.automl.logger: 10-25 03:13:22] {2218} INFO - iteration 5, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:22] {2391} INFO - at 1.3s,\testimator xgboost's best error=0.0171,\tbest estimator xgboost's best error=0.0171\n", + "[flaml.automl.logger: 10-25 03:13:22] {2218} INFO - iteration 6, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 1.6s,\testimator xgboost's best error=0.0049,\tbest estimator xgboost's best error=0.0049\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 7, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 1.8s,\testimator xgboost's best error=0.0049,\tbest estimator xgboost's best error=0.0049\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 8, 
current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 1.9s,\testimator xgboost's best error=0.0049,\tbest estimator xgboost's best error=0.0049\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 9, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 2.1s,\testimator xgboost's best error=0.0009,\tbest estimator xgboost's best error=0.0009\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 10, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 2.3s,\testimator xgboost's best error=0.0009,\tbest estimator xgboost's best error=0.0009\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 11, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:23] {2391} INFO - at 2.4s,\testimator xgboost's best error=0.0009,\tbest estimator xgboost's best error=0.0009\n", + "[flaml.automl.logger: 10-25 03:13:23] {2218} INFO - iteration 12, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:24] {2391} INFO - at 2.7s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:24] {2218} INFO - iteration 13, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:24] {2391} INFO - at 2.8s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:24] {2218} INFO - iteration 14, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:24] {2391} INFO - at 3.2s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:24] {2218} INFO - iteration 15, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:24] {2391} INFO - at 3.3s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:24] {2218} INFO - iteration 16, current 
learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:25] {2391} INFO - at 3.6s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:25] {2218} INFO - iteration 17, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:25] {2391} INFO - at 3.8s,\testimator xgboost's best error=0.0003,\tbest estimator xgboost's best error=0.0003\n", + "[flaml.automl.logger: 10-25 03:13:25] {2218} INFO - iteration 18, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:25] {2391} INFO - at 4.2s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:25] {2218} INFO - iteration 19, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:25] {2391} INFO - at 4.4s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:25] {2218} INFO - iteration 20, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:27] {2391} INFO - at 5.6s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:27] {2218} INFO - iteration 21, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:27] {2391} INFO - at 6.5s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:27] {2218} INFO - iteration 22, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:29] {2391} INFO - at 7.7s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:29] {2218} INFO - iteration 23, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:29] {2391} INFO - at 8.4s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:29] {2218} INFO - iteration 24, current learner 
xgboost\n", + "[flaml.automl.logger: 10-25 03:13:31] {2391} INFO - at 9.6s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:31] {2218} INFO - iteration 25, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:31] {2391} INFO - at 9.9s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:31] {2218} INFO - iteration 26, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:34] {2391} INFO - at 12.9s,\testimator xgboost's best error=0.0001,\tbest estimator xgboost's best error=0.0001\n", + "[flaml.automl.logger: 10-25 03:13:34] {2218} INFO - iteration 27, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:38] {2391} INFO - at 16.8s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:13:38] {2218} INFO - iteration 28, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:40] {2391} INFO - at 19.4s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:13:40] {2218} INFO - iteration 29, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:41] {2391} INFO - at 20.1s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:13:41] {2218} INFO - iteration 30, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:13:56] {2391} INFO - at 34.6s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:13:56] {2218} INFO - iteration 31, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:14:02] {2391} INFO - at 41.0s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:14:02] {2218} INFO - iteration 32, current learner 
xgboost\n", + "[flaml.automl.logger: 10-25 03:14:10] {2391} INFO - at 48.7s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:14:10] {2218} INFO - iteration 33, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:14:21] {2391} INFO - at 60.4s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:14:21] {2218} INFO - iteration 34, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:14:29] {2391} INFO - at 68.1s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:14:29] {2218} INFO - iteration 35, current learner xgboost\n", + "[flaml.automl.logger: 10-25 03:15:13] {2391} INFO - at 112.3s,\testimator xgboost's best error=0.0000,\tbest estimator xgboost's best error=0.0000\n", + "[flaml.automl.logger: 10-25 03:15:23] {2627} INFO - retrain xgboost for 9.3s\n", + "[flaml.automl.logger: 10-25 03:15:23] {2630} INFO - retrained model: XGBRegressor(base_score=None, booster=None, callbacks=[],\n", + " colsample_bylevel=0.9025517915891937, colsample_bynode=None,\n", + " colsample_bytree=0.8696723552623628, device=None,\n", + " early_stopping_rounds=None, enable_categorical=False,\n", + " eval_metric=None, feature_types=None, gamma=None,\n", + " grow_policy='lossguide', importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.1823033077636653,\n", + " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=0, max_leaves=142,\n", + " min_child_weight=14.890670283201343, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=2015,\n", + " n_jobs=-1, num_parallel_tree=None, random_state=None, ...)\n", + "[flaml.automl.logger: 10-25 03:15:23] {1930} INFO - fit succeeded\n", + "[flaml.automl.logger: 10-25 03:15:23] {1931} INFO - Time taken to find the best model: 
112.30713248252869\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0\n", + "0 Function PredictHouseRent overwritten." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0Function PredictHouseRent overwritten.
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Utilizing the Prediction Model\n", + "Following the model training, we proceed to employ the `PredictHouseRent`` model to make predictions for home rental prices." + ], + "metadata": { + "id": "clg31P6vbpvc" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"SELECT PredictHouseRent(*) FROM postgres_data.home_rentals LIMIT 10;\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + }, + "id": "9ZovWvhSBhAV", + "outputId": "1adb172a-6b27-4f3f-cabe-2622c858d9a3" + }, + "execution_count": 47, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " rental_price\n", + "0 2152.748779\n", + "1 1940.956787\n", + "2 2438.577881\n", + "3 5532.644043\n", + "4 2270.967529\n", + "5 4169.572754\n", + "6 2207.939941\n", + "7 2101.080566\n", + "8 3873.550049\n", + "9 2027.248047" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_price
02152.748779
11940.956787
22438.577881
35532.644043
42270.967529
54169.572754
62207.939941
72101.080566
83873.550049
92027.248047
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 47 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We have the option to utilize a `LATERAL JOIN` to compare the actual rental prices in the `home_rentals` dataset with the predicted rental prices generated by the trained model, `PredictHouseRent`." + ], + "metadata": { + "id": "aLr0TmExbvdC" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " SELECT rental_price, predicted_rental_price FROM postgres_data.home_rentals\n", + " JOIN LATERAL PredictHouseRent(*) AS Predicted(predicted_rental_price) LIMIT 10;\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + }, + "id": "lKx3_ExZBoK6", + "outputId": "0e874f2e-3156-4cd5-b12f-8b7416b8eece" + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " rental_price predicted_rental_price\n", + "0 2167.000 2166.888184\n", + "1 1883.000 1882.995605\n", + "2 2431.000 2430.979492\n", + "3 5510.000 5510.027832\n", + "4 2272.000 2272.018066\n", + "5 4123.812 4123.840820\n", + "6 2224.000 2223.957275\n", + "7 2104.000 2103.984131\n", + "8 3861.000 3860.960449\n", + "9 2041.000 2041.064087" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_pricepredicted_rental_price
02167.0002166.888184
11883.0001882.995605
22431.0002430.979492
35510.0005510.027832
42272.0002272.018066
54123.8124123.840820
62224.0002223.957275
72104.0002103.984131
83861.0003860.960449
92041.0002041.064087
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Training Classification Models using EVADB\n", + "\n", + "Next, we employ EvaDB to facilitate the training of Classification ML models, which will enable us to predict `leave_or_not` i.e. a variable depicting whether an employee will leave the current company or not based on several parameters." + ], + "metadata": { + "id": "ZDVPDGqjfrch" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Employee Data from CSV into PostgreSQL\n", + "\n", + "In this step, we will import the [Employee Data](https://www.kaggle.com/datasets/tawfikelmetwally/employee-dataset) dataset into our PostgreSQL database. If you already have the data stored in PostgreSQL and are ready to proceed with the prediction model training, feel free to skip this section and head directly to the [model training process](#train-the-prediction-model)." + ], + "metadata": { + "id": "ODq5QPC3gp5U" + } + }, + { + "cell_type": "code", + "source": [ + "!mkdir -p content\n", + "!wget -nc -O /content/Employee.csv https://drive.google.com/file/d/1R4ij5Ww6bOGwLJrbBStzcaPRhAJ-fn72/view?usp=share_link" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MlJ4adTuiDUF", + "outputId": "f5f7c292-954b-4a0c-8b1f-44d3ea1f7102" + }, + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "File ‘/content/Employee.csv’ already there; not retrieving.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " USE postgres_data {\n", + " CREATE TABLE IF NOT EXISTS Employee (\n", + " education VARCHAR(128),\n", + " joining_year INTEGER,\n", + " city VARCHAR(128),\n", + " payment_tier INTEGER,\n", + " age INTEGER,\n", + " gender VARCHAR(128),\n", + " ever_benched VARCHAR(128),\n", + " experience_in_current_domain INTEGER,\n", + " leave_or_not INTEGER\n", + " )\n", + " }\n", + 
"\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "g_KW1uc2iNdv", + "outputId": "e8daf7bc-bab6-4a76-d0ca-7f65d22ec38d" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " status\n", + "0 success" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 55 + } + ] + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " USE postgres_data {\n", + " COPY Employee(education, joining_year, city, payment_tier, age, gender, ever_benched, experience_in_current_domain, leave_or_not)\n", + " FROM '/content/Employee.csv'\n", + " DELIMITER ',' CSV HEADER\n", + " }\n", + "\"\"\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "xcnQHGN31b7z", + "outputId": "8d94312d-1871-49e1-94a2-9ee042367621" + }, + "execution_count": 60, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " status\n", + "0 success" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 60 + } + ] + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"SELECT * FROM postgres_data.employee LIMIT 3;\").df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + }, + "id": "Ugj7wEsa43-K", + "outputId": "fb20a3b6-fdb3-4b0e-8a30-0ff0c6b3f0a8" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " leave_or_not joining_year payment_tier age \\\n", + "0 0 2017 3 34 \n", + "1 1 2013 1 28 \n", + "2 0 2014 3 38 \n", + "\n", + " experience_in_current_domain gender city ever_benched education \n", + "0 0 Male Bangalore No Bachelors \n", + "1 3 Female Pune No Bachelors \n", + "2 2 Female New Delhi No Bachelors " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
leave_or_notjoining_yearpayment_tierageexperience_in_current_domaingendercityever_benchededucation
0020173340MaleBangaloreNoBachelors
1120131283FemalePuneNoBachelors
2020143382FemaleNew DelhiNoBachelors
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Train the prediction Model\n", + "Train the XGBoost AutoML model for classification using the `accuracy` metric" + ], + "metadata": { + "id": "1AtF1AtT4OC0" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"\"\"\n", + " CREATE FUNCTION IF NOT EXISTS PredictEmployee FROM\n", + " ( SELECT payment_tier, age, gender, experience_in_current_domain, leave_or_not FROM postgres_data.employee )\n", + " TYPE XGBoost\n", + " PREDICT 'leave_or_not'\n", + " TIME_LIMIT 180\n", + " METRIC 'f1'\n", + " TASK 'classification';\n", + "\"\"\").df()" + ], + "metadata": { + "id": "NUbo47cG33cp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Utilizing the Prediction Model\n", + "Following the model training, we proceed to employ the `PredictEmployee` model to make predictions for whether the employee will leave or not." + ], + "metadata": { + "id": "Fc2eLIEB61Pj" + } + }, + { + "cell_type": "code", + "source": [ + "cursor.query(\"SELECT PredictEmployee(payment_tier, age, gender, experience_in_current_domain, leave_or_not) FROM postgres_data.employee LIMIT 10;\").df()" + ], + "metadata": { + "id": "MzSDNiSt6vGb" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file