In [1]:
cd ..

D:\Google Drive\Doutorado\CodeRepository\phd-query-synthesis-git-lab\PhDCoding


In [2]:
import sys, os
sys.path.append('..\\PhDCoding\\qbe')

In [3]:
import random
import numpy as np, pandas as pd, pandasql as pdsql, sklearn.datasets as ds
import util
from qbe.greedyqbe import GreedySearchQBE
from qbe.fitfunction import FitnessFunction
from qbe.geneticqbe import GeneticProgrammingQBE
from qbe.astarqbe import AstarQBE
from qbe.treeqbe import DecisionTreeQBE

In [4]:
def get_results(predicate, dataframe, desired_set, undesired_set):
    """Evaluate a candidate WHERE-clause predicate and summarise its quality.

    The predicate is appended to ``SELECT * FROM dataframe WHERE`` and run
    through pandasql against the frame passed in ``dataframe``. The rows it
    returns are converted to a set and scored against the desired and
    undesired tuple sets with the ``util`` helpers.

    Parameters
    ----------
    predicate : str
        SQL boolean expression to place after ``WHERE``.
    dataframe : pandas.DataFrame
        Table the predicate is evaluated on; visible to the SQL text under
        the table name ``dataframe``.
    desired_set, undesired_set
        Tuple sets handed to ``util.get_recall`` and ``util.get_specificity``.

    Returns
    -------
    dict
        Keys ``'Recall'``, ``'Specificity'``, ``'Length'`` and ``'Query'``.
        ``'Length'`` is one plus the number of whitespace-delimited ``AND``
        and ``OR`` tokens in the predicate.
    """
    # Bind the table name to the argument explicitly. The notebook-level
    # ``pysql`` helper resolves ``dataframe`` from globals(), which silently
    # ignores the frame supplied by the caller.
    best_data_view = pdsql.sqldf("SELECT * FROM dataframe WHERE " + predicate,
                                 {'dataframe': dataframe})
    actual_set = util.convert_nparray_to_set(best_data_view)
    recall = util.get_recall(actual_set, desired_set)
    specificity = util.get_specificity(actual_set, desired_set, undesired_set)

    # Tokenise once; only upper-case, whitespace-separated connectives count.
    tokens = predicate.split()
    length = 1 + tokens.count("AND") + tokens.count("OR")
    return {'Recall': recall, 'Specificity': specificity, 'Length': length, 'Query': predicate}

In [5]:
def write_file_results(entry, filename, output_dir=None):
    """Append one result record to a semicolon-separated text file.

    Parameters
    ----------
    entry : dict
        Must provide the keys ``'Recall'``, ``'Specificity'``, ``'Length'``
        and ``'Query'``; they are written in that order.
    filename : str
        Name of the file to append to.
    output_dir : str, optional
        Directory that holds the file. When omitted, the historical
        hard-coded Windows download folder is used so existing two-argument
        calls keep writing to the same place.

    Raises
    ------
    KeyError
        If ``entry`` lacks one of the four keys (raised before the file is
        opened, so no empty file is created).
    """
    if output_dir is None:
        # Historical default, kept byte-for-byte for backward compatibility.
        path = 'C:\\Users\\d_mart04.WIWI\\Downloads\\' + filename
    else:
        path = os.path.join(output_dir, filename)

    # Build the whole line first so a malformed entry never touches the file.
    line = '{0};{1};{2};{3}\n'.format(entry['Recall'], entry['Specificity'],
                                      entry['Length'], entry['Query'])
    with open(path, 'a') as rfile:
        rfile.write(line)

In [6]:
# Iris data set (https://archive.ics.uci.edu/ml/datasets/iris), as bundled
# with scikit-learn: the feature matrix plus the class label as a fifth column.
iris = ds.load_iris()

dataframe = pd.DataFrame(
    np.c_[iris['data'], iris['target']],
    columns=["sepal_len", "sepal_wid", "petal_len", "petal_wid", "target"],
)


# pandasql looks table names up in the notebook namespace, so SQL text can
# refer to the frame above simply as ``dataframe``.
def pysql(q):
    """Run the SQL string ``q`` with pandasql against the notebook globals."""
    return pdsql.sqldf(q, globals())

### Experiment 1: Discovering unknown queries

In [6]:
def unknown_queries_discovery(dataframe, tuples_count):
    """Run the three QBE searches on a random example set and log each result.

    A sample size is drawn uniformly from ``[1, tuples_count]`` and that many
    row labels are drawn from ``[0, number of rows - 1]`` (repeats possible).
    The selected rows form the desired set; every other distinct tuple of
    ``dataframe`` forms the undesired set. The greedy, decision-tree and
    genetic-programming searches are run in that order, and each best
    predicate is scored with ``get_results`` and appended to
    ``<algorithm>_unkq_selectivity_<tuples_count>.txt``.
    """
    suffix = '_unkq_selectivity_{0}.txt'.format(tuples_count)

    # Draw the example rows.
    sample_size = random.randint(1, tuples_count)
    last_row = dataframe.shape[0] - 1
    indices = []
    for _ in range(sample_size):
        indices.append(random.randint(0, last_row))
    examples = dataframe.loc[indices]

    # Split the table into wanted and unwanted tuples.
    all_tuples = util.convert_nparray_to_set(dataframe)
    desired_set = util.convert_nparray_to_set(examples)
    undesired_set = all_tuples - desired_set
    fitness_function = FitnessFunction(desired_set, undesired_set, pysql)

    def record(prefix, predicate):
        # Score one predicate and append it to that algorithm's result file.
        scored = get_results(predicate, dataframe, desired_set, undesired_set)
        write_file_results(scored, prefix + suffix)

    greedy_qbe = GreedySearchQBE(fitness_function, dataframe)
    record('greedy', greedy_qbe.search_best_predicate(max_iterations=100, threshold=0.001))

    tree_qbe = DecisionTreeQBE(dataframe, indices)
    record('tree', tree_qbe.search_best_predicate())

    gp_qbe = GeneticProgrammingQBE(dataframe, fitness_function)
    record('gp', gp_qbe.simple_search(population_size=200,
                                      crossover_rate=0.90,
                                      mutation_rate=0.20,
                                      num_generations=100,
                                      max_gen_without_gain=30,
                                      verbose=False))

In [10]:
# Experiment 1 driver: ten repetitions for each upper bound on the
# example-set size.
for repetition in range(10):
    for max_examples in (1, 15, 50, 100):
        unknown_queries_discovery(dataframe, tuples_count=max_examples)

print('Finished')

KeyboardInterrupt: 

### Experiment 2: Discovering alternative queries

In [7]:
def alternative_query_discovery(dataframe, index_list):
    """Search for an alternative predicate that selects the rows in ``index_list``.

    The rows labelled by ``index_list`` form the desired set and the remaining
    distinct tuples of ``dataframe`` form the undesired set. Only the
    genetic-programming search is run here; the greedy and decision-tree
    variants are disabled in this experiment. The scored predicate is
    appended to ``gp_altq_selectivity_<len(index_list)>.txt``.
    """
    output_name = 'gp' + '_altq_selectivity_{0}.txt'.format(len(index_list))

    examples = dataframe.loc[index_list]
    all_tuples = util.convert_nparray_to_set(dataframe)
    desired_set = util.convert_nparray_to_set(examples)
    undesired_set = all_tuples - desired_set

    fitness_function = FitnessFunction(desired_set, undesired_set, pysql)

    gp_qbe = GeneticProgrammingQBE(dataframe, fitness_function)
    best_predicate = gp_qbe.simple_search(
        population_size=200,
        crossover_rate=0.80,
        mutation_rate=0.20,
        num_generations=100,
        max_gen_without_gain=20,
        verbose=False,
    )

    write_file_results(
        get_results(best_predicate, dataframe, desired_set, undesired_set),
        output_name,
    )

In [8]:
# Reference queries for Experiment 2. Each one is executed to obtain the row
# set for which an alternative predicate is then searched.
original_queries = [
    "SELECT * FROM dataframe WHERE petal_len >= 4.0 AND petal_wid = 1.8 AND target = 1",
    "SELECT * FROM dataframe WHERE sepal_len > 6.5 AND petal_wid >= 1.3 AND petal_wid <= 1.9",
    "SELECT * FROM dataframe WHERE sepal_wid <> 3.0 AND petal_len <> 1.4",
]

In [9]:
# Experiment 2 driver: materialise each reference query, map its rows back to
# labels of ``dataframe``, then run the alternative-query search ten times.
for query in original_queries:
    view = pysql(query)
    indices = util.get_original_indexes_from_view(dataframe, view)

    for repetition in range(10):
        alternative_query_discovery(dataframe, indices)

print('Finished')

Finished


### Experiment 3: Classification Rule

In [None]:
def discover_without_target_class(dataframe, index_list):
    """Run the three QBE searches for a fixed set of example rows and log each result.

    The rows labelled by ``index_list`` form the desired set; the remaining
    distinct tuples of ``dataframe`` form the undesired set. The greedy,
    decision-tree and genetic-programming searches are run in that order.
    Each best predicate is scored with ``get_results`` and appended to
    ``<algorithm>_altq_selectivity_100_15.txt``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to search over.
    index_list : list
        Row labels of the desired tuples; also handed to ``DecisionTreeQBE``.
    """
    # The file suffix is fixed for this experiment, not derived from the
    # number of example rows.
    filename = '_altq_selectivity_100_15.txt'

    desired_output = dataframe.loc[index_list]
    total_set = util.convert_nparray_to_set(dataframe)
    desired_set = util.convert_nparray_to_set(desired_output)
    undesired_set = total_set - desired_set

    fitness_function = FitnessFunction(desired_set, undesired_set, pysql)

    def evaluate_and_log(prefix, best_predicate):
        # Score one predicate and append it to that algorithm's result file.
        results = get_results(best_predicate, dataframe, desired_set, undesired_set)
        write_file_results(results, prefix + filename)

    algorithm = GreedySearchQBE(fitness_function, dataframe)
    evaluate_and_log('greedy',
                     algorithm.search_best_predicate(max_iterations=100, threshold=0.001))

    algorithm = DecisionTreeQBE(dataframe, index_list)
    evaluate_and_log('tree', algorithm.search_best_predicate())

    algorithm = GeneticProgrammingQBE(dataframe, fitness_function)
    evaluate_and_log('gp',
                     algorithm.simple_search(population_size=200, crossover_rate=0.80,
                                             mutation_rate=0.20, num_generations=100,
                                             max_gen_without_gain=20, verbose=False))

In [None]:
# Experiment 3 uses the four measurements only, so the table is rebuilt
# without the ``target`` column. This rebinds the notebook-level ``dataframe``.
dataframe = pd.DataFrame(data=iris['data'],
                         columns=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])

# Configure PandaSQL to query Pandas dataframe
pysql = lambda q: pdsql.sqldf(q, globals())

# Rows 100-149 are the examples to describe.
# NOTE(review): presumably these are all of one Iris class - confirm.
indices = list(range(100, 150))

# A plain loop rather than a list comprehension: the calls are made only for
# their side effects, and a comprehension would build and echo a list of None.
for repetition in range(10):
    discover_without_target_class(dataframe, indices)

In [None]:
# Sanity check: read the current ``dataframe`` back through pandasql and
# display the result as the cell output.
view = pysql("SELECT * FROM dataframe")
view

### Experiment CAR dataset

In [7]:
from datamanagement.dataaccessobject import DataAccessObject, Dataset

In [8]:
# Fetch the car data set through the project's data-access layer.
DAO = DataAccessObject()
dataset = DAO.get_car_dataset()

In [9]:
# Rebind the notebook-level ``dataframe`` to the preprocessed car data.
# NOTE(review): no copy is taken here, so unless ``preprocessed_data`` returns
# a fresh object the column rename below also relabels the frame held by
# ``dataset`` - confirm this is intended.
dataframe = dataset.preprocessed_data
# Replace the column labels with names that can be used in the SQL predicates.
# NOTE(review): '4WD' starts with a digit and is not a valid bare SQL
# identifier - confirm the search code quotes column names.
dataframe.columns = ['Price', 'MPG', 'NumOfCylinders', 'horsepower', 'FuelTankCapacity',
       'RPM', 'Wheelbase', 'RearSeatRoom', 'Weight', 'AutomaticGearbox',
       'PassengerCapacity', 'length', 'width', 'LuggageCapacity', 'Origin',
       'Compact', 'Large', 'Midsize', 'Small', 'Sporty', 'Van', 'Acura',
       'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge',
       'Eagle', 'Ford', 'Geo', 'Honda', 'Hyundai', 'Infiniti', 'Lexus',
       'Lincoln', 'Mazda', 'MercedesBenz', 'Mercury', 'Mitsubishi', 'Nissan',
       'Oldsmobile', 'Plymouth', 'Pontiac', 'Saab', 'Saturn', 'Subaru',
       'Suzuki', 'Toyota', 'Volkswagen', 'Volvo', '4WD', 'Front', 'Rear',
       'DriverPassenger', 'DriverOnly', 'NoAirBags']

# Configure PandaSQL to query Pandas dataframe
# (table names are resolved from the notebook globals when a query is run).
pysql = lambda q: pdsql.sqldf(q, globals())

In [10]:
# Ten repetitions of the unknown-query experiment on the car table, with at
# most 25 example rows per run.
for repetition in range(10):
    unknown_queries_discovery(dataframe, tuples_count=25)
print('Finished')

------------------------------
Iteration 	 Best fitness
1 		 51.6854
2 		 35.9551
3 		 24.7191
4 		 16.8539
5 		 12.3596
6 		 8.9888
7 		 6.7416
8 		 4.4944
9 		 2.2472
10 		 1.1236
11 		 0.0000
-------- Search ended --------
------------------------------
Iteration 	 Best fitness
1 		 87.6712
2 		 76.7123
3 		 65.7534
4 		 57.5342
5 		 50.6849
6 		 45.2055
7 		 41.0959
8 		 36.9863
9 		 32.8767
10 		 28.7671
11 		 24.6575
12 		 20.5479
13 		 17.8082
14 		 15.0685
15 		 12.3288
16 		 9.5890
17 		 8.2192
18 		 6.8493
19 		 5.4795
20 		 4.1096
21 		 2.7397
22 		 1.3699
23 		 0.0000
-------- Search ended --------
------------------------------
Iteration 	 Best fitness
1 		 47.1264
2 		 25.2874
3 		 14.9425
4 		 10.3448
5 		 6.8966
6 		 3.4483
7 		 1.1494
8 		 0.0000
-------- Search ended --------
------------------------------
Iteration 	 Best fitness
1 		 60.9195
2 		 48.2759
3 		 36.7816
4 		 27.5862
5 		 20.6897
6 		 16.0920
7 		 12.6437
8 		 10.3448
9 		 8.0460
10 		 5.7471
11 		 3.44