Let's load data simulating registration of high energy gamma particles in an atmospheric Cherenkov telescope. <a href="https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope" target="_blank" rel="noopener">Follow this link</a> to get details about this dataset.

To execute queries and upload data to Exasol database we will be using the <a href="https://github.com/exasol/pyexasol" target="_blank" rel="noopener">`pyexasol`</a> module.

Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb).

In [2]:
# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.
EXASOL_EXTERNAL_HOST_NAME = "192.168.124.93"
EXASOL_HOST_PORT = "8888"
EXASOL_EXTERNAL_HOST = f"""{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_HOST_PORT}"""
EXASOL_USER = "sys"
EXASOL_PASSWORD = "exasol"
EXASOL_BUCKETFS_PORT = "6666"
EXASOL_EXTERNAL_BUCKETFS_HOST = f"""{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_BUCKETFS_PORT}"""
EXASOL_BUCKETFS_USER = "w"
EXASOL_BUCKETFS_PASSWORD = "write"
EXASOL_BUCKETFS_USE_HTTPS = False
EXASOL_BUCKETFS_URL_PREFIX = "https://" if EXASOL_BUCKETFS_USE_HTTPS else "http://"
EXASOL_BUCKETFS_SERVICE = "bfsdefault"
EXASOL_BUCKETFS_BUCKET = "default"
EXASOL_BUCKETFS_PATH = f"/buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container
EXASOL_SCRIPT_LANGUAGE_NAME = "PYTHON3_60"
EXASOL_UDF_FLAVOR = "python3-ds-EXASOL-6.0.0"
EXASOL_UDF_RELEASE= "20190116"
EXASOL_UDF_CLIENT = "exaudfclient" # or for newer versions of the flavor exaudfclient_py3
EXASOL_SCRIPT_LANGUAGES = f"{EXASOL_SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}?lang=python#buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}/exaudf/{EXASOL_UDF_CLIENT}";
EXASOL_SCHEMA = "IDA"

connection_params = {"dns": EXASOL_EXTERNAL_HOST, "user": EXASOL_USER, "password": EXASOL_PASSWORD, "compression": True}

params = {
    "script_languages": EXASOL_SCRIPT_LANGUAGES,
    "script_language_name": EXASOL_SCRIPT_LANGUAGE_NAME,
    "schema": EXASOL_SCHEMA,
    "EXASOL_BUCKETFS_PORT": EXASOL_BUCKETFS_PORT,
    "EXASOL_BUCKETFS_USER": EXASOL_BUCKETFS_USER,
    "EXASOL_BUCKETFS_PASSWORD": EXASOL_BUCKETFS_PASSWORD,
    "EXASOL_BUCKETFS_USE_HTTPS": EXASOL_BUCKETFS_USE_HTTPS,
    "EXASOL_BUCKETFS_BUCKET": EXASOL_BUCKETFS_BUCKET,
    "EXASOL_BUCKETFS_PATH": EXASOL_BUCKETFS_PATH
}

First we will load the data into pandas DataFrame.
We will name the column as per their description (see Additional Variable Information section).

In [3]:
from urllib.request import urlopen
import tempfile
from zipfile import ZipFile
import pandas as pd
from stopwatch import Stopwatch

stopwatch = Stopwatch()

DATA_URL = "https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip"
DATA_FILE = "magic04.data"

resp = urlopen(DATA_URL)
with tempfile.TemporaryFile() as f:
    f.write(resp.read())
    print(f"Downloading the data took: {stopwatch}")

    f.seek(0)
    with ZipFile(f) as z:
        with z.open(DATA_FILE, "r") as f:
            df = pd.read_csv(f)

column_names = [
    'fLength',   # major axis of ellipse [mm]
    'fWidth',    # minor axis of ellipse [mm] 
    'fSize',     # 10-log of sum of content of all pixels [in #phot]
    'fConc',     # ratio of sum of two highest pixels over fSize  [ratio]
    'fConc1',    # ratio of highest pixel over fSize  [ratio]
    'fAsym',     # distance from highest pixel to center, projected onto major axis [mm]
    'fM3Long',   # 3rd root of third moment along major axis  [mm] 
    'fM3Trans',  # 3rd root of third moment along minor axis  [mm]
    'fAlpha',    # angle of major axis with vector to origin [deg]
    'fDist',     # distance from origin to center of ellipse [mm]
    'class'      # g,h - gamma (signal), hadron (background)
]
df.columns = column_names

print(df.head())

Downloading the data took: 1.58s
    fLength    fWidth   fSize   fConc  fConc1     fAsym  fM3Long  fM3Trans  \
0   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
1  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
2   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
3   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   
4   51.6240   21.1502  2.9085  0.2420  0.1340   50.8761  43.1887    9.8145   

    fAlpha    fDist class  
0   6.3609  205.261     g  
1  76.9600  256.788     g  
2  10.4490  116.737     g  
3   4.6480  356.462     g  
4   3.6130  238.098     g  


Let's split data randomly into train and test sets. We will then create two tables - TELESCOPE_TRAIN and TELESCOPE_TEST - and load the datasets into these tables.

In [4]:
from sklearn.model_selection import train_test_split
import pyexasol

# Split the data into train and test sets
df_train, df_test = train_test_split(df, test_size=0.2)

column_desc = [f'{c} {("DECIMAL(18,4)" if c.startswith("f") else "CHAR(1)")}' for c in column_names]

stopwatch = Stopwatch()

# Create Exasol connection
with pyexasol.connect(dsn=EXASOL_EXTERNAL_HOST, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True) as conn:

    # Create tables
    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.TELESCOPE_TRAIN(' + ', '.join(column_desc) + ')', query_params=params)
    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.TELESCOPE_TEST LIKE {schema!i}.TELESCOPE_TRAIN', query_params=params)

    # Import data into Exasol
    conn.import_from_pandas(df_train, (EXASOL_SCHEMA, "TELESCOPE_TRAIN"))
    print(f"Imported {conn.last_statement().rowcount()} rows into TRAIN.")
    conn.import_from_pandas(df_test, (EXASOL_SCHEMA, "TELESCOPE_TEST"))
    print(f"Imported {conn.last_statement().rowcount()} rows into TEST.")

print(f"Importing the data took: {stopwatch}")

Imported 15215 rows into TRAIN.
Imported 3804 rows into TEST.
Importing the data took: 518.58ms
