# Learning database queries via intelligent semiotic machines

Publication information: Martins, D. M. L., Vossen, G., & de Lima Neto, F. B. (2017, November). Learning database queries via intelligent semiotic machines. In 2017 IEEE Latin American Conference on Computational Intelligence (LA-CCI) (pp. 1-6). IEEE.

URL: https://ieeexplore.ieee.org/document/8285698

Publication's BibTeX:

```
@INPROCEEDINGS{8285698,
    author={D. M. L. {Martins} and G. {Vossen} and F. B. {de Lima Neto}},
    booktitle={2017 IEEE Latin American Conference on Computational Intelligence (LA-CCI)},
    title={Learning database queries via intelligent semiotic machines},
    year={2017},
    volume={},
    number={},
    pages={1-6},
    keywords={Big Data;database management systems;information retrieval;learning (artificial intelligence);query languages;query processing;relational databases;SQL;SQL queries;database-specific knowledge;query language;database schema;intelligent semiotic machines;Big Data era;data-driven approaches;query criteria;hard constraints;information overload;Semiotics;Computational Intelligence techniques;tailored queries;data exploration;database search;data retrieval;database queries learning;information finding;Databases;Semiotics;Automobiles;Self-organizing feature maps;Neurons;Training;Electronic mail},
    doi={10.1109/LA-CCI.2017.8285698},
    ISSN={null},
    month={Nov},
}
```

### Configuring notebook

In [None]:
# Move the notebook's working directory to its parent directory.
# The explicit %cd magic is used because the bare `cd` form depends on
# IPython's automagic setting and on the cell holding a single line.
%cd ..

In [None]:
# Move the notebook's working directory to its parent directory.
# The explicit %cd magic is used because the bare `cd` form depends on
# IPython's automagic setting and on the cell holding a single line.
%cd ..

In [None]:
import sys, os
# Make the sada, decision and qbe directories, located one level above the
# working directory, searchable by the import system.
# The entries are built with os.path.join so the separator matches the host
# OS instead of being hard-coded to the Windows backslash.
sys.path.extend(
    os.path.join(os.pardir, package_dir)
    for package_dir in ('sada', 'decision', 'qbe')
)

In [None]:
from datamanagement.dataaccessobject import DataAccessObject, Dataset
from decision.somselector import SomSelector
from sada.decisionsada import DecisionSADA
from qbe.fitfunction import PersonalizedFitnessFunction
from qbe.deapgpqbe import DEAPGeneticProgrammingQBE
from qbe import util
import pandas as pd, pandasql as pdsql
import random

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

### Loading car dataset

In [None]:
# Obtain the car dataset from the project's data-access object, then
# replace its preprocessed table with the one stored in
# datasets/1993CarsPrep.pkl (path is relative to the working directory).
DAO = DataAccessObject()
dataset = DAO.get_car_dataset()
prep_path = os.path.join('datasets', '1993CarsPrep.pkl')
# NOTE(review): read_pickle unpickles arbitrary objects; only load files
# that come from a trusted source.
dataset.preprocessed_data = pd.read_pickle(prep_path)

### Configuring SADA

In [None]:
# Build the example selector first (10x10 SOM, 1024 iterations), then hand
# it to the DecisionSADA instance that works on the loaded dataset.
som_selector = SomSelector(som_size=(10, 10), num_iterations=1024)
sada = DecisionSADA(dataset, selector=som_selector)

In [None]:
# Target concept: the rows of the original data matched by the reference
# query. Its index is printed so the matching row labels can be inspected.
target_query = 'type=="Sporty" and Origin==0'
concept = dataset.original_data.query(target_query)
print(concept.index)

In [None]:
# X is the preprocessed feature table; y is the ground-truth membership
# vector with one 0/1 entry per label in 1..n_rows (1 when the label belongs
# to the concept's index).
# NOTE(review): assumes original_data is labelled 1..n_rows - confirm.
X = dataset.preprocessed_data
n_rows = dataset.original_data.shape[0]
y = [int(label in concept.index) for label in range(1, n_rows + 1)]

### Select an example index to start the learning process

In [None]:
# Pick the row at position 33 as the seed example: its data_matrix row is
# converted to a plain list for the selector, and the matching row of the
# original data is printed for inspection.
example_index = 33
example_row = dataset.data_matrix[example_index]
tuple_input = example_row.tolist()
print(dataset.original_data.iloc[example_index])

### Selected examples

In [None]:
# Ask SADA for as many candidates as the concept has rows, using the seed
# example as the query, and print what it selected.
concept_size = len(concept)
selected_indexes = sada.select(
    query=tuple_input,
    num_of_selected_candidates=concept_size,
)
print(selected_indexes)

In [None]:
# 0/1 vector with one entry per row position 0..n_rows-1: 1 when the
# position is among the indexes returned by the selection step.
n_rows = dataset.original_data.shape[0]
predicted = [int(position in selected_indexes) for position in range(n_rows)]

### Evaluating the quality of the SOM-based selection of Positive/Negative examples

In [None]:
# Score the selection against the ground truth, treating membership in the
# concept as the positive class of a binary problem. The resulting tuple is
# displayed as the cell output.
precision_recall_fscore_support(y_true=y, y_pred=predicted, average='binary')

### Query learning phase

In [None]:
# The fitness function is fitted on the labels produced by the selection
# step (`predicted`), not on the ground-truth vector `y`; the learner
# receives the feature table X together with that fitness function.
training_matrix = X.values
fitnessfunc = PersonalizedFitnessFunction(
    X_train=training_matrix,
    y_train=predicted,
)
query_learner = DEAPGeneticProgrammingQBE(X, fitnessfunc)

In [None]:
# Run the learner's search with the hyper-parameters below. The result is
# later passed to X.query, so it is used as a pandas query expression.
best_predicate = query_learner.simple_search(
    population_size=256,
    crossover_rate=0.9,
    mutation_rate=0.3,
    num_generations=50,
    max_gen_without_gain=16,
)

In [None]:
# Display the predicate returned by query_learner.simple_search.
best_predicate

In [None]:
# Apply the learned predicate to the preprocessed data and display the rows
# it retrieves.
X.query(best_predicate)

In [None]:
# Fraction of the concept's rows that the learned predicate retrieves:
# |retrieved labels ∩ concept labels| / |concept|.
# NOTE(review): this is a recall-style measure; rows retrieved outside the
# concept do not lower it.
retrieved_labels = set(X.query(best_predicate).index)
concept_labels = set(concept.index)
accuracy = len(retrieved_labels & concept_labels) / len(concept.index)
accuracy

### Show retrieved data

In [None]:
# Benchmark queries, written as pandas query expressions over the original
# car data. Each one defines a target concept for the evaluation loop.
queries = [
    "type == 'Sporty' and Origin == 0",
    "type != 'Sporty' and Origin == 1",
    "automatic_gearbox == 1 and horsepower >= 150",
    "luggage_capacity >= 18 and passenger_capacity > 5",
    "price <= 7000 and mpg >= 26 and automatic_gearbox == 0",
    "manufacturer == 'Ford' or manufacturer == 'Chevrolet'",
]

In [None]:
# Evaluate the selection step on every benchmark query.
# For each query a fresh DecisionSADA is built, and 10 seed examples are
# drawn at random from the query's concept. Every draw contributes one row
# to `results`: [query, concept size, precision, recall, f1].
results = []
for q in queries:
    n_rows = dataset.original_data.shape[0]
    concept = dataset.original_data.query(q)
    concept_size = len(concept.index)
    # Ground truth over labels 1..n_rows: 1 when the label is in the concept.
    y_test = [int(label in concept.index) for label in range(1, n_rows + 1)]
    # The SOM is trained for ten iterations per row of the original data.
    selector = SomSelector(som_size=(10, 10), num_iterations=n_rows * 10)
    sada = DecisionSADA(dataset, selector=selector)
    for i in range(10):
        # The seed is a label taken from the concept's index.
        example_index = random.choice(concept.index)
        # NOTE(review): "- 1" assumes labels are 1-based while data_matrix
        # rows are 0-based - confirm.
        tuple_input = dataset.data_matrix[example_index - 1].tolist()
        selected_indexes = sada.select(
            query=tuple_input,
            num_of_selected_candidates=len(concept),
        )
        predicted = [int(pos in selected_indexes) for pos in range(n_rows)]
        scores = precision_recall_fscore_support(y_test, predicted, average='binary')
        # Keep precision, recall and f-score; the fourth element is dropped.
        res = [q, concept_size, *scores[:3]]
        results.append(res)

In [None]:
# Collect the per-draw result rows into a table with one named column per
# field of each row.
result_columns = ['query', 'concept size', 'precision', 'recall', 'f1score']
df = pd.DataFrame(results, columns=result_columns)

In [None]:
# Average the scores of all draws per (query, concept size) pair and display
# them ordered from the highest to the lowest mean f1score.
mean_scores = df.groupby(by=['query', 'concept size']).mean()
mean_scores.sort_values(by='f1score', ascending=False)