In [1]:
import util, random
import numpy as np, pandas as pd, pandasql as pdsql
import sklearn.datasets as ds
from greedyqbe import GreedySearchQBE
from fitfunction import FitnessFunction
from astarqbe import AstarQBE

In [2]:
def get_view(predicate):
    """Run `predicate` as a WHERE clause over `dataframe` and report on the result.

    Relies on notebook-level names that must already exist when this is called:
    `pysql` (SQL runner), `dataframe` (full table) and `desired_output`
    (the example rows being reverse engineered).

    Args:
        predicate: SQL boolean expression (string) appended after ``WHERE``.

    Returns:
        Whatever `pysql` returns for the assembled query, i.e. the data view
        selected by the predicate.
    """
    separator = '---------------------------------'
    sql = "SELECT * FROM dataframe WHERE " + predicate
    print('Query: ', sql)
    print(separator)
    view = pysql(sql)
    # Return value is ignored here; the helper is called for its side effects.
    # NOTE(review): presumably it prints precision/recall-style metrics — confirm in util.
    util.get_information_retrieval_metrics(dataframe, desired_output, view)
    print(separator)
    return view

def get_desired_indexes(dataframe, desired_output):
    """Return the positions of the rows of `dataframe` that occur in `desired_output`.

    A row counts as present when every one of its values equals the
    corresponding value of some row of `desired_output`.

    Args:
        dataframe: table to scan (anything exposing ``.values`` as a 2-D array).
        desired_output: table holding the wanted rows, with the same columns.

    Returns:
        List of 0-based row positions in `dataframe`, in ascending order.
        Each position appears at most once, even when `desired_output`
        contains the same row several times. Identical rows of `dataframe`
        are all reported, each under its own position.
    """
    wanted_rows = desired_output.values
    indexes = []
    for index, row in enumerate(dataframe.values):
        # any() stops at the first matching wanted row, so a position is never
        # recorded more than once (the data may contain identical rows).
        if any((row == out).all() for out in wanted_rows):
            indexes.append(index)
    return indexes

#### Load the Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)

In [3]:
# Load Iris and assemble a single table: the four measurements plus the class label.
iris = ds.load_iris()
# Columns get short identifier-style names because they are referenced by name
# inside the SQL predicates built later in the notebook.
dataframe = pd.DataFrame(
    np.column_stack((iris['data'], iris['target'])),
    columns=["sepal_len", "sepal_wid", "petal_len", "petal_wid", "target"],
)

#### Configure pandasql to query pandas DataFrames

In [4]:
def pysql(q):
    """Execute the SQL string `q` with pandasql and return its result.

    The notebook's global namespace is passed as the environment, so queries
    can refer to global DataFrames (e.g. `dataframe`) by variable name.
    """
    return pdsql.sqldf(q, globals())

#### Create the view for the query reverse engineering problem

In [5]:
# Alternative: derive the desired output from a known query instead of sampling rows.
#desired_output = pysql("SELECT * FROM dataframe WHERE sepal_len > 6.5 AND petal_wid > 1.3 AND petal_wid < 1.9")
#desired_indexes = get_desired_indexes(dataframe, desired_output)

# Fixed seed so that Restart & Run All reproduces the same desired set;
# change the value to experiment with a different sample.
DESIRED_SET_SEED = 42
desired_rng = random.Random(DESIRED_SET_SEED)

# Ask for at least one row (an empty example set leaves nothing to reverse
# engineer) and never for more rows than the table holds.
size_desired_set = desired_rng.randint(1, min(20, dataframe.shape[0]))

# sample() draws without replacement, so no row position is picked twice.
desired_indexes = desired_rng.sample(range(dataframe.shape[0]), size_desired_set)

# The indexes are row positions, hence positional lookup with iloc.
desired_output = dataframe.iloc[desired_indexes]
desired_output

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,target
67,5.8,2.7,4.1,1.0,1.0
48,5.3,3.7,1.5,0.2,0.0
109,7.2,3.6,6.1,2.5,2.0
127,6.1,3.0,4.9,1.8,2.0
39,5.1,3.4,1.5,0.2,0.0
53,5.5,2.3,4.0,1.3,1.0
115,6.4,3.2,5.3,2.3,2.0
85,6.0,3.4,4.5,1.6,1.0
59,5.2,2.7,3.9,1.4,1.0
51,6.4,3.2,4.5,1.5,1.0


#### Generate sets to facilitate computation

In [6]:
# Convert the full table and the desired rows with util.convert_nparray_to_set,
# then obtain the undesired rows by subtracting the desired ones from the total.
# NOTE(review): presumably each row becomes a hashable tuple inside a Python set — confirm in util.
total_set = util.convert_nparray_to_set(dataframe)
desired_set = util.convert_nparray_to_set(desired_output)
undesired_set = total_set - desired_set

#### Fitness Function

In [7]:
# Built from the desired/undesired row sets and the SQL runner; this one object
# is reused by the greedy, genetic-programming and A* searches below.
# NOTE(review): presumably it runs candidate predicates through pysql and scores
# the returned rows against the two sets — confirm in fitfunction.
fitness_function = FitnessFunction(desired_set, undesired_set, pysql)

#### Greedy Search

In [None]:
# Greedy searcher: receives the shared fitness function first, then the full table.
greedy_search = GreedySearchQBE(fitness_function, dataframe)

In [None]:
%%time
# Run the greedy search; the result is later passed to get_view as a WHERE predicate.
# NOTE(review): threshold presumably is the minimum fitness gain required to keep
# iterating before max_iterations is reached — confirm in greedyqbe.
best_greedy = greedy_search.search_best_predicate(max_iterations=100, threshold=0.001)

------------------------------
Iteration 	 Best fitness
1 		 77.2727
2 		 66.6667
3 		 60.6061
4 		 55.3030
5 		 50.7576
6 		 46.9697
7 		 43.9394


In [None]:
# Execute the greedy predicate as a query, print its report and show the selected rows.
get_view(best_greedy)

#### Decision Tree

In [None]:
from treeqbe import DecisionTreeQBE

In [None]:
# Unlike the other searchers, the decision-tree approach is given the desired
# row indexes directly rather than the fitness function.
decision_tree = DecisionTreeQBE(dataframe, desired_indexes)

In [None]:
%%time
# Called without tuning arguments; the result is later passed to get_view as a WHERE predicate.
best_tree = decision_tree.search_best_predicate()

In [None]:
# Execute the decision-tree predicate as a query, print its report and show the selected rows.
get_view(best_tree)

#### Genetic Programming

In [None]:
from geneticqbe import GeneticProgrammingQBE

In [None]:
# Note the argument order: table first, fitness function second — the reverse
# of GreedySearchQBE(fitness_function, dataframe).
genetic_programming = GeneticProgrammingQBE(dataframe, fitness_function)

In [None]:
%%time
# Run the genetic-programming search with explicit hyperparameters; the result
# is later passed to get_view as a WHERE predicate.
# NOTE(review): max_gen_without_gain presumably stops the run early after that
# many generations without improvement — confirm in geneticqbe.
best_genetic = genetic_programming.search_best_predicate(population_size=300, crossover_rate=0.9, mutation_rate=0.3, 
                                                         num_generations=100, max_gen_without_gain=20)

In [None]:
# Execute the genetic-programming predicate as a query, print its report and show the selected rows.
get_view(best_genetic)

#### A* Search

In [None]:
# A* is handed the bound `calculate` method rather than the FitnessFunction
# object itself (the greedy and genetic searchers receive the object).
astar = AstarQBE(fitness_function.calculate)

In [None]:
%%time
# Here the table is supplied at search time instead of at construction; the
# result is later passed to get_view as a WHERE predicate.
best_astar = astar.search_best_predicate(dataframe, max_iterations=100)

In [None]:
# Execute the A* predicate as a query, print its report and show the selected rows.
get_view(best_astar)