Here we will load data of physical measurements of abalones (sea snails). <a href="https://archive.ics.uci.edu/dataset/1/abalone" target="_blank" rel="noopener">Follow this link</a> to get details about this dataset.

To execute queries and upload data to Exasol database we will be using the <a href="https://github.com/exasol/pyexasol" target="_blank" rel="noopener">`pyexasol`</a> module.

Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb).

In [1]:
# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.
from dataclasses import dataclass

@dataclass
class SandboxConfig:
    EXTERNAL_HOST_NAME = "192.168.124.93"
    HOST_PORT = "8888"

    @property
    def EXTERNAL_HOST(self):
        return f"""{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}"""

    USER = "sys"
    PASSWORD = "exasol"
    BUCKETFS_PORT = "6666"
    BUCKETFS_USER = "w"
    BUCKETFS_PASSWORD = "write"
    BUCKETFS_USE_HTTPS = False
    BUCKETFS_SERVICE = "bfsdefault"
    BUCKETFS_BUCKET = "default"

    @property
    def EXTERNAL_BUCKETFS_HOST(self):
        return f"""{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}"""

    @property
    def BUCKETFS_URL_PREFIX(self):
        return "https://" if self.BUCKETFS_USE_HTTPS else "http://"

    @property
    def BUCKETFS_PATH(self):
        # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container
        return f"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}"

    SCRIPT_LANGUAGE_NAME = "PYTHON3_60"
    UDF_FLAVOR = "python3-ds-EXASOL-6.0.0"
    UDF_RELEASE= "20190116"
    UDF_CLIENT = "exaudfclient" # or for newer versions of the flavor exaudfclient_py3
    SCHEMA = "IDA"

    @property
    def SCRIPT_LANGUAGES(self):
        return f"""{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/
            {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/
            {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}""";

    @property
    def connection_params(self):
        return {"dns": self.EXTERNAL_HOST, "user": self.USER, "password": self.PASSWORD, "compression": True}

    @property
    def params(self):
        return {
            "script_languages": self.SCRIPT_LANGUAGES,
            "script_language_name": self.SCRIPT_LANGUAGE_NAME,
            "schema": self.SCHEMA,
            "BUCKETFS_PORT": self.BUCKETFS_PORT,
            "BUCKETFS_USER": self.BUCKETFS_USER,
            "BUCKETFS_PASSWORD": self.BUCKETFS_PASSWORD,
            "BUCKETFS_USE_HTTPS": self.BUCKETFS_USE_HTTPS,
            "BUCKETFS_BUCKET": self.BUCKETFS_BUCKET,
            "BUCKETFS_PATH": self.BUCKETFS_PATH
        }

conf = SandboxConfig()

First we will load the data into pandas DataFrame.
We will name the column as per their description (see Variable Table in the dataset description).

In [2]:
from urllib.request import urlopen
import tempfile
from zipfile import ZipFile
from contextlib import ExitStack
import pandas as pd
from stopwatch import Stopwatch

stopwatch = Stopwatch()

DATA_URL = "https://archive.ics.uci.edu/static/public/1/abalone.zip"
DATA_FILE = "abalone.data"

resp = urlopen(DATA_URL)
with ExitStack() as stack:
    f = stack.enter_context(tempfile.TemporaryFile())
    f.write(resp.read())
    print(f"Downloading the data took: {stopwatch}")

    f.seek(0)
    z = stack.enter_context(ZipFile(f))
    f = stack.enter_context(z.open(DATA_FILE, "r"))
    df = pd.read_csv(f)

column_def = [
    ('Sex', 'CHAR(1)'),	                 # M, F, and I (infant)
    ('Length', 'DECIMAL(4,3)'),          # longest shell measurement (mm)
    ('Diameter', 'DECIMAL(4,3)'),	     # perpendicular to length (mm)
    ('Height', 'DECIMAL(4,3)'),          # with meat in shell (mm)
    ('Whole_weight', 'DECIMAL(5,4)'),    # whole abalone (grams)
    ('Shucked_weight', 'DECIMAL(5,4)'),  # weight of meat (grams)
    ('Viscera_weight', 'DECIMAL(5,4)'),  # gut weight (after bleeding) (grams)
    ('Shell_weight', 'DECIMAL(4,3)'),    # after being dried (grams)
    ('Rings', 'INT')                     # +1.5 gives the age in years
]
df.columns = [name for name, _ in column_def]

print(df.head())

Downloading the data took: 777.70ms
  Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  Viscera_weight  \
0   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
1   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
2   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
3   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   
4   I   0.425     0.300   0.095        0.3515          0.1410          0.0775   

   Shell_weight  Rings  
0         0.070      7  
1         0.210      9  
2         0.155     10  
3         0.055      7  
4         0.120      8  


Let's split data randomly into train and test sets. We will then create two tables - ABALONE_TRAIN and ABALONE_TEST - and load the datasets into these tables.

In [4]:
from sklearn.model_selection import train_test_split
import pyexasol

# Split the data into train and test sets
df_train, df_test = train_test_split(df, test_size=0.2)

column_desc = [' '.join(c) for c in column_def]

stopwatch = Stopwatch()

# Create Exasol connection
with pyexasol.connect(dsn=conf.EXTERNAL_HOST, user=conf.USER, password=conf.PASSWORD, compression=True) as conn:

    # Create tables
    sql = f'CREATE OR REPLACE TABLE {{schema!i}}.ABALONE_TRAIN({", ".join(column_desc)})'
    conn.execute(query=sql, query_params=conf.params)
    sql = 'CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN'
    conn.execute(query=sql, query_params=conf.params)

    # Import data into Exasol
    conn.import_from_pandas(df_train, (conf.SCHEMA, "ABALONE_TRAIN"))
    print(f"Imported {conn.last_statement().rowcount()} rows into TRAIN.")
    conn.import_from_pandas(df_test, (conf.SCHEMA, "ABALONE_TEST"))
    print(f"Imported {conn.last_statement().rowcount()} rows into TEST.")

print(f"Importing the data took: {stopwatch}")

Imported 3340 rows into TRAIN.
Imported 836 rows into TEST.
Importing the data took: 434.25ms
