# Learning database queries via intelligent semiotic machines

Publication information: Martins, D. M. L., Vossen, G., & de Lima Neto, F. B. (2017, November). Learning database queries via intelligent semiotic machines. In 2017 IEEE Latin American Conference on Computational Intelligence (LA-CCI) (pp. 1-6). IEEE.

URL: https://ieeexplore.ieee.org/document/8285698

Publication's BibTeX:

### Configuring notebook

In [1]:
cd ..

C:\Users\d_mart04\Documents\CodeRepositoryGit\enablingnontechsdb\experiments


In [2]:
cd ..

C:\Users\d_mart04\Documents\CodeRepositoryGit\enablingnontechsdb


In [3]:
import sys, os
sys.path.append('..\\sada')
sys.path.append('..\\decision')
sys.path.append('..\\qbe')

In [4]:
from datamanagement.dataaccessobject import DataAccessObject, Dataset
from decision.somselector import SomSelector
from sada.decisionsada import DecisionSADA
from qbe.fitfunction import PersonalizedFitnessFunction
from qbe.deapgpqbe import DEAPGeneticProgrammingQBE
from qbe import util
import pandas as pd, pandasql as pdsql
import random

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

### Loading car dataset

In [6]:
DAO = DataAccessObject()
dataset = DAO.get_car_dataset()
dataset.preprocessed_data = pd.read_pickle(os.path.join('datasets', '1993CarsPrep.pkl'))

  self.data_matrix = preprocessed_data.as_matrix()


### Configuring SADA

In [7]:
sada = DecisionSADA(dataset, selector=SomSelector(som_size=(10,10), num_iterations=1024))

In [8]:
concept = dataset.original_data.query('type=="Sporty" and Origin==0')
print(concept.index)

Int64Index([14, 19, 28, 34, 35, 60, 72, 75], dtype='int64')


In [9]:
X = dataset.preprocessed_data
y = [int(y in concept.index) for y in range(1, dataset.original_data.shape[0]+1)]

### Select an example index to start the learning process

In [10]:
example_index = 33
tuple_input = dataset.data_matrix[example_index].tolist()
print(dataset.original_data.iloc[example_index])

make                        Ford Mustang
manufacturer                        Ford
type                              Sporty
price                               7950
mpg                                 25.5
num_of_cylinders                       4
horsepower                           105
fuel_tank_capacity                  15.4
RPM                                 4600
Wheelbase                            101
Rear.seat.room                        24
Weight                              2850
automatic_gearbox                      0
passenger_capacity                     4
length                               180
width                                 68
luggage_capacity                      12
AirBags                       DriverOnly
DriveTrain                          Rear
Origin                                 0
imagepath             34ford_mustang.jpg
Name: 34, dtype: object


### Selected examples

In [11]:
selected_indexes = sada.select(query=tuple_input, num_of_selected_candidates=len(concept))
print(selected_indexes)

[18, 33, 13, 27, 56, 74, 57, 71]


In [12]:
predicted = [int(i in selected_indexes) for i in range(dataset.original_data.shape[0])]

In [13]:
precision_recall_fscore_support(y, predicted, average='binary')

(0.75, 0.75, 0.75, None)

### Query learning phase

In [14]:
fitnessfunc = PersonalizedFitnessFunction(X_train=X.values, y_train=y)
query_learner = DEAPGeneticProgrammingQBE(X, fitnessfunc)

In [24]:
best_predicate = query_learner.simple_search(population_size=128, crossover_rate=0.7, mutation_rate=0.3, num_generations=64, max_gen_without_gain=16)

gen	nevals	min  
0  	128   	10.05
1  	106   	10.05
2  	96    	6.05 
3  	93    	6.05 
4  	102   	6.05 
5  	105   	6.05 
6  	101   	6.05 
7  	105   	6.05 
8  	99    	6.05 
9  	95    	6.05 
10 	112   	6.05 
11 	109   	6.05 
12 	98    	6.05 
13 	101   	6.05 
14 	102   	6.05 
15 	102   	6.05 
16 	109   	6.05 
17 	101   	6.05 
18 	95    	6.05 
19 	95    	6.05 
20 	102   	6.05 
21 	102   	6.05 
22 	96    	6.05 
23 	100   	6.05 
24 	96    	6.05 
25 	101   	6.05 
26 	95    	6.05 
27 	97    	6.05 
28 	109   	6.05 
29 	90    	6.05 
30 	103   	6.05 
31 	94    	6.05 
32 	94    	6.05 
33 	95    	6.05 
34 	102   	6.05 
35 	106   	6.05 
36 	105   	6.05 
37 	97    	6.05 
38 	98    	6.05 
39 	100   	6.05 
40 	106   	6.05 
41 	93    	6.05 
42 	110   	6.05 
43 	93    	6.05 
44 	102   	6.05 
45 	106   	6.05 
46 	99    	6.05 
47 	98    	6.05 
48 	101   	6.05 
49 	102   	6.05 
50 	101   	6.05 
51 	104   	6.05 
52 	100   	6.05 
53 	100   	6.05 
54 	107   	6.05 
55 	95    	6.05 
56 	98    	6.05 
57 	103   	6.0

In [25]:
best_predicate

'(0.9886939145139939 <= horsepower)'

In [26]:
X.query(best_predicate)

Unnamed: 0,price,mpg,num_of_cylinders,horsepower,fuel_tank_capacity,rpm,weight,automatic_gearbox,passenger_capacity,width,luggage_capacity,origin,manufacturer0,manufacturer1,manufacturer2,manufacturer3,manufacturer4,type0,type1,type2
19,0.613893,0.4375,1.0,1.0,0.740741,0.769231,0.823386,0.0,0.25,0.948718,0.781985,0.0,0,0,1,0,1,1,0,0
28,0.416801,0.4375,0.75,1.0,0.733333,0.923077,0.926918,0.0,0.5,0.923077,0.5,0.0,0,0,1,1,1,1,0,0


In [27]:
accuracy = len(set(X.query(best_predicate).index) & set(concept.index))/len(concept.index)
accuracy

0.0625

### Show retrieved data

In [19]:
queries = ["type == 'Sporty' and Origin == 0", 
    "type != 'Sporty' and Origin == 1",
    "automatic_gearbox == 1 and horsepower >= 150",
    "luggage_capacity >= 18 and passenger_capacity > 5",
    "price <= 7000 and mpg >= 26 and automatic_gearbox == 0",
    "manufacturer == 'Ford' or manufacturer == 'Chevrolet'"]

In [20]:
results = []
for q in queries:
    concept = dataset.original_data.query(q)
    y_test = [int(y in concept.index) for y in range(1, dataset.original_data.shape[0]+1)]
    sada = DecisionSADA(dataset, selector=SomSelector(som_size=(10,10), num_iterations=dataset.original_data.shape[0]*10))
    for i in range(10):
        example_index = random.choice(concept.index)
        tuple_input = dataset.data_matrix[example_index-1].tolist()
        selected_indexes = sada.select(query=tuple_input,  num_of_selected_candidates=len(concept))
        predicted = [int(i in selected_indexes) for i in range(dataset.original_data.shape[0])]
        res = [q, len(concept.index)] + list(precision_recall_fscore_support(y_test, predicted, average='binary'))[:3]
        results.append(res)

In [21]:
df = pd.DataFrame(results, columns=['query', 'concept size', 'precision', 'recall', 'f1score'])

In [22]:
df.groupby(by=['query', 'concept size']).mean().sort_values(by='f1score', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1score
query,concept size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
luggage_capacity >= 18 and passenger_capacity > 5,9,0.5,0.5,0.5
price <= 7000 and mpg >= 26 and automatic_gearbox == 0,25,0.492,0.492,0.492
type == 'Sporty' and Origin == 0,8,0.4625,0.4625,0.4625
type != 'Sporty' and Origin == 1,39,0.425641,0.425641,0.425641
automatic_gearbox == 1 and horsepower >= 150,23,0.404348,0.404348,0.404348
manufacturer == 'Ford' or manufacturer == 'Chevrolet',16,0.2125,0.2125,0.2125
