# Duck iris prediction

In [1]:
%load_ext magic_duckdb

In [2]:
import duckdb

# Open a connection to your DuckDB database file.
# The handle must be called `con`: the %%dql cells select it with `-co con`.
con = duckdb.connect("data/my-data.duckdb")

In [3]:
%%dql -co con
-- Copy the raw measurements into a working table
CREATE OR REPLACE TABLE prepared_data AS
SELECT *
FROM duck_iris;

-- Give every distinct species a zero-based integer label, numbered in species order
CREATE OR REPLACE TABLE target_mapping AS
WITH distinct_species AS (
    SELECT DISTINCT species
    FROM prepared_data
)
SELECT
    species,
    ROW_NUMBER() OVER (ORDER BY species) - 1 AS target
FROM distinct_species;

-- Model input: integer label y plus the five measurements packed into list X
CREATE OR REPLACE TABLE ml_data AS
SELECT
    labels.target AS y,
    [
        obs.beak_width,
        obs.beak_length,
        obs.wing_length,
        obs.tail_length,
        obs.tarsus_length
    ] AS X
FROM prepared_data AS obs
INNER JOIN target_mapping AS labels
    ON obs.species = labels.species;

Unnamed: 0,Count
0,48


# SQL Sampling

In [4]:
%%dql -co con
-- Attach a per-row random key; the train split orders rows by it within each class.
-- IF NOT EXISTS keeps the ALTER from failing when this cell is run again.
ALTER TABLE ml_data ADD COLUMN IF NOT EXISTS random_value DOUBLE;
-- No WHERE clause on purpose: every row gets a fresh random() draw, so re-running
-- this cell changes which rows a later ORDER BY random_value selects.
UPDATE ml_data SET random_value = random();

Unnamed: 0,Count
0,48


In [5]:
%%dql -co con
-- Stratified training split: within each class y, keep the rows with the
-- smallest random_value until CEIL(65%) of that class has been taken.
CREATE OR REPLACE TABLE train_data AS
SELECT
    y,
    X
FROM ml_data
QUALIFY
    ROW_NUMBER() OVER (PARTITION BY y ORDER BY random_value)
        <= CEIL(0.65 * COUNT(*) OVER (PARTITION BY y));

Unnamed: 0,Count
0,33


In [6]:
%%dql -co con
-- Preview the working copy of the raw measurements
SELECT *
FROM prepared_data;

Unnamed: 0,species,beak_width,beak_length,wing_length,tail_length,tarsus_length
0,Egyptian goose,22.2,53.0,403.0,140.0,86.5
1,Egyptian goose,21.7,45.2,392.0,148.0,75.3
2,Egyptian goose,19.1,45.2,360.0,113.0,70.0
3,Egyptian goose,20.5,48.0,350.0,112.0,73.5
4,Egyptian goose,20.3,62.7,344.0,138.0,75.4
5,Egyptian goose,20.7,55.5,287.0,122.0,74.1
6,Laysan duck,13.8,43.0,204.0,98.0,34.4
7,Laysan duck,14.6,46.0,213.0,72.0,35.6
8,Laysan duck,14.4,44.0,198.0,70.0,34.5
9,Laysan duck,12.7,41.1,184.0,94.0,33.6


In [7]:
%%dql -co con
-- Hold-out split: every ml_data row that was not placed in train_data.
-- EXCEPT ALL (bag semantics) removes exactly one ml_data row per train_data row.
-- Plain EXCEPT would collapse duplicate (y, X) rows and drop every test row that
-- happens to share its values with a training row, so train + test could end up
-- smaller than ml_data.
CREATE OR REPLACE TABLE test_data AS
SELECT
    y,
    X
FROM ml_data
EXCEPT ALL
SELECT
    y,
    X
FROM train_data;

Unnamed: 0,Count
0,15


In [8]:
# Pull the training split out of DuckDB as a dict of per-column NumPy arrays,
# then convert each column to a plain Python list for scikit-learn.
df = con.execute("""SELECT * FROM train_data ORDER BY y""").fetchnumpy()
X, y = df['X'].tolist(), df['y'].tolist()

In [9]:
# Fit a decision tree on the training split fetched from DuckDB
from sklearn import tree

classifier = tree.DecisionTreeClassifier()
classifier.fit(X, y)


def predict_duck(X):
    """Classify one row of measurements with the fitted tree.

    X is a single feature list (one row of the `X` column); it is wrapped in a
    one-element batch because `predict` expects a 2-D input. The predicted
    label is returned as a plain Python int.
    """
    return int(classifier.predict([X])[0])


# Expose the model to SQL as predict_duck(DOUBLE[]) -> INTEGER
con.create_function("predict_duck", predict_duck, ['DOUBLE[]'], 'INTEGER')

  con.create_function("predict_duck", predict_duck, ['DOUBLE[]'], 'INTEGER')


<duckdb.duckdb.DuckDBPyConnection at 0x1186af970>

In [10]:
%%dql -co con
-- Preview the hold-out rows, grouped by class label
SELECT
    y,
    X
FROM test_data
ORDER BY y;

Unnamed: 0,y,X
0,0,"[21.7, 45.2, 392.0, 148.0, 75.3]"
1,0,"[20.5, 48.0, 350.0, 112.0, 73.5]"
2,1,"[29.1, 56.0, 263.0, 88.0, 58.0]"
3,2,"[11.4, 26.6, 185.0, 78.0, 31.5]"
4,2,"[11.0, 25.9, 187.0, 80.0, 30.6]"
5,3,"[17.6, 33.1, 268.0, 75.0, 41.0]"
6,4,"[13.8, 43.0, 204.0, 98.0, 34.4]"
7,4,"[11.9, 37.7, 185.0, 89.0, 33.6]"
8,5,"[20.8, 57.1, 305.0, 124.0, 47.8]"
9,5,"[20.6, 60.3, 329.0, 141.0, 48.1]"


In [11]:
%%dql -co con
-- Score every hold-out row with the registered predict_duck UDF,
-- keeping the true label next to the predicted one
CREATE OR REPLACE TABLE predictions AS
SELECT
    y,
    predict_duck(X) AS y_pred
FROM test_data;

Unnamed: 0,Count
0,15


In [12]:
%%dql -co con
-- evaluate model on test sample
-- COUNT(*) FILTER keeps correct_predictions an integer count (0 rather than NULL
-- when predictions is empty); SUM(CASE ...) produced a value rendered as 11.0.
-- Accuracy is computed in DOUBLE, and NULLIF makes it NULL instead of dividing
-- by zero when there are no predictions.
SELECT
    COUNT(*) AS total_predictions,
    COUNT(*) FILTER (WHERE y = y_pred) AS correct_predictions,
    CAST(COUNT(*) FILTER (WHERE y = y_pred) AS DOUBLE)
        / NULLIF(COUNT(*), 0) AS accuracy
FROM predictions;

Unnamed: 0,total_predictions,correct_predictions,accuracy
0,15,11.0,0.733333


In [13]:
# Close the DuckDB connection; %%dql cells using `-co con` cannot run after this.
con.close()