In [None]:
cd ..

In [None]:
import sys, os
# Make the project's qbe sources importable. The entry is relative, so it is resolved
# against the current working directory at import time, and it uses Windows separators.
sys.path.append('..\\PhDCoding\\qbe')

In [None]:
import random
import numpy as np, pandas as pd, pandasql as pdsql, sklearn.datasets as ds
import util
from qbe.greedyqbe import GreedySearchQBE
from qbe.fitfunction import QueryDiscoveryFitnessFunction
from qbe.deapgpqbe import DEAPGeneticProgrammingQBE
from qbe.treeqbe import DecisionTreeQBE

In [None]:
def get_results(predicate, dataframe, desired_set, undesired_set):
    """Evaluate a discovered predicate on ``dataframe`` and summarise its quality.

    Parameters
    ----------
    predicate : str
        SQL boolean expression; it is appended after ``WHERE`` in a ``SELECT *`` query.
    dataframe : pandas.DataFrame
        Table the predicate is evaluated on. It is exposed to the query under the
        table name ``dataframe``.
    desired_set, undesired_set
        Collections of tuples that should / should not be returned; callers build them
        with ``util.convert_nparray_to_set``.

    Returns
    -------
    dict
        ``{'Recall', 'Specificity', 'Length', 'Query'}`` where recall and specificity come
        from ``util.get_recall`` / ``util.get_specificity``, ``Length`` is one plus the
        number of ``AND``/``OR`` tokens in the predicate, and ``Query`` is the predicate.
    """
    # Query the frame that was passed in rather than whatever the notebook-global
    # `dataframe` happens to be when the function is called.
    best_data_view = pdsql.sqldf("SELECT * FROM dataframe WHERE " + predicate,
                                 {'dataframe': dataframe})
    actual_set = util.convert_nparray_to_set(best_data_view)
    recall = util.get_recall(actual_set, desired_set)
    specificity = util.get_specificity(actual_set, desired_set, undesired_set)

    # Each connective joins two conditions, so conditions = connectives + 1.
    # The match is case-sensitive and whitespace-delimited, as before.
    tokens = predicate.split()
    length = 1 + tokens.count("AND") + tokens.count("OR")
    return {'Recall': recall, 'Specificity': specificity, 'Length': length, 'Query': predicate}

In [None]:
def write_file_results(entry, filename, output_dir='C:\\Users\\d_mart04.WIWI\\Downloads\\'):
    """Append one result record to a semicolon-separated text file.

    Parameters
    ----------
    entry : dict
        Must provide the keys 'Recall', 'Specificity', 'Length' and 'Query'.
    filename : str
        Name of the file inside ``output_dir``; it is created if missing and appended to otherwise.
    output_dir : str, optional
        Directory the file lives in. Defaults to the location the notebook originally
        hard-coded, so existing two-argument calls keep writing to the same place.
    """
    record = '{0};{1};{2};{3}\n'.format(entry['Recall'], entry['Specificity'],
                                        entry['Length'], entry['Query'])
    with open(os.path.join(output_dir, filename), 'a') as rfile:
        rfile.write(record)

In [None]:
# Load the Iris dataset: https://archive.ics.uci.edu/ml/datasets/iris
iris = ds.load_iris()

# Measurements and class label side by side, under short SQL-friendly column names.
dataframe = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=["sepal_len", "sepal_wid", "petal_len", "petal_wid", "target"],
)


def pysql(q):
    """Run SQL text ``q`` through PandaSQL, resolving table names among the notebook globals."""
    return pdsql.sqldf(q, globals())

### Experiment 1: Discovering unknown queries

In [None]:
def unknown_queries_discovery(dataframe, tuples_count):
    """Pick a random set of example rows and let all three algorithms discover a query for it.

    Between 1 and ``tuples_count`` distinct rows are drawn at random. The greedy search,
    the decision tree and the genetic-programming search are then run in that order, and
    each result is appended to ``'greedy' | 'tree' | 'gp' + '_unkq_selectivity_<tuples_count>.txt'``
    through ``write_file_results``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to draw the example rows from and to search queries on.
    tuples_count : int
        Upper bound on the number of example rows; also used in the output file name.
    """
    filename = '_unkq_selectivity_{0}.txt'.format(tuples_count)

    row_count = dataframe.shape[0]
    # Clamp to the table size so sampling without replacement is always possible.
    size_desired_set = random.randint(1, min(tuples_count, row_count))
    # random.sample draws distinct positions; independent randint draws could repeat a row
    # and silently shrink the example set below the size that was just chosen.
    indices = random.sample(range(row_count), size_desired_set)
    # NOTE(review): positions are used as index labels via .loc - this assumes the frame
    # has a default 0..n-1 index; confirm for datasets other than Iris.
    desired_output = dataframe.loc[indices]

    total_set = util.convert_nparray_to_set(dataframe)
    desired_set = util.convert_nparray_to_set(desired_output)
    undesired_set = total_set - desired_set
    queryfitfunction = QueryDiscoveryFitnessFunction(dataframe, indices)

    # Greedy search
    greedy_search = GreedySearchQBE(queryfitfunction, dataframe)
    best_greedy = greedy_search.search_best_predicate(max_iterations=100, threshold=0.001, verbose=False)
    greedy = get_results(best_greedy, dataframe, desired_set, undesired_set)
    write_file_results(greedy, 'greedy' + filename)

    # Decision tree
    decision_tree = DecisionTreeQBE(dataframe, indices)
    best_tree = decision_tree.search_best_predicate()
    tree = get_results(best_tree, dataframe, desired_set, undesired_set)
    write_file_results(tree, 'tree' + filename)

    # Genetic programming
    genetic_programming = DEAPGeneticProgrammingQBE(dataframe, queryfitfunction)
    best_genetic = genetic_programming.simple_search(population_size=200, crossover_rate=0.9,
                                                     mutation_rate=0.5, num_generations=100,
                                                     max_gen_without_gain=30, verbose=False)
    gp = get_results(best_genetic, dataframe, desired_set, undesired_set)
    write_file_results(gp, 'gp' + filename)

In [None]:
# Ten repetitions of the unknown-query experiment with up to 15 example tuples.
# Other settings kept for reference: tuples_count=1, tuples_count=50, tuples_count=100.
for run_number in range(10):
    unknown_queries_discovery(dataframe, tuples_count=15)

print('Finished')

### Experiment 2: Discovering alternative queries

In [None]:
def alternative_query_discovery(dataframe, index_list):
    """Search for predicates that reproduce the rows labelled by ``index_list``.

    The greedy search, the decision tree and the genetic-programming search run in that
    order. Each result is scored with ``get_results`` and appended, via
    ``write_file_results``, to ``<algorithm>_altq_selectivity_<len(index_list)>.txt``.
    """
    suffix = '_altq_selectivity_{0}.txt'.format(len(index_list))

    wanted_rows = dataframe.loc[index_list]
    all_tuples = util.convert_nparray_to_set(dataframe)
    wanted = util.convert_nparray_to_set(wanted_rows)
    unwanted = all_tuples - wanted

    fitness = QueryDiscoveryFitnessFunction(dataframe, index_list)

    def run_greedy():
        return GreedySearchQBE(fitness, dataframe).search_best_predicate(
            max_iterations=100, threshold=0.001, verbose=False)

    def run_tree():
        return DecisionTreeQBE(dataframe, index_list).search_best_predicate()

    def run_gp():
        return DEAPGeneticProgrammingQBE(dataframe, fitness).simple_search(
            population_size=200, crossover_rate=0.90, mutation_rate=0.50,
            num_generations=100, max_gen_without_gain=30, verbose=False)

    # Each searcher is built, run, scored and written before the next one starts.
    for prefix, search in (('greedy', run_greedy), ('tree', run_tree), ('gp', run_gp)):
        summary = get_results(search(), dataframe, wanted, unwanted)
        write_file_results(summary, prefix + suffix)

In [None]:
# Reference queries for Experiment 2: the rows each one returns become the examples
# for which an alternative query is searched.
original_queries = [
    "SELECT * FROM dataframe WHERE petal_len >= 4.0 AND petal_wid = 1.8 AND target = 1",
    "SELECT * FROM dataframe WHERE sepal_len > 6.5 AND petal_wid >= 1.3 AND petal_wid <= 1.9",
    "SELECT * FROM dataframe WHERE sepal_wid <> 3.0 AND petal_len <> 1.4",
]

In [None]:
# For every reference query: execute it, map the returned rows back to index labels of
# `dataframe`, then run the alternative-query discovery ten times on those labels.
for query in original_queries:
    indices = util.get_original_indexes_from_view(dataframe, pysql(query))

    for repetition in range(10):
        alternative_query_discovery(dataframe, indices)

print('Finished')

### Experiment 3: Classification Rule

In [None]:
def discover_without_target_class(dataframe, index_list):
    """Discover predicates describing the rows labelled by ``index_list``.

    Runs the greedy search, the decision tree and the genetic-programming search in that
    order, scores each predicate with ``get_results`` and appends the result to
    ``'greedy' | 'tree' | 'gp' + '_altq_selectivity_100_15.txt'`` via ``write_file_results``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to search on.
    index_list : list
        Index labels of the desired rows (looked up with ``dataframe.loc``).
    """
    # NOTE(review): the file name is fixed and does not depend on len(index_list), so
    # every call appends to the same three files - confirm this is intended.
    filename = '_altq_selectivity_100_15.txt'

    desired_output = dataframe.loc[index_list]
    total_set = util.convert_nparray_to_set(dataframe)
    desired_set = util.convert_nparray_to_set(desired_output)
    undesired_set = total_set - desired_set

    fitness_function = QueryDiscoveryFitnessFunction(dataframe, index_list)

    # Greedy search
    algorithm = GreedySearchQBE(fitness_function, dataframe)
    best_predicate = algorithm.search_best_predicate(max_iterations=100, threshold=0.001, verbose=False)
    results = get_results(best_predicate, dataframe, desired_set, undesired_set)
    write_file_results(results, 'greedy' + filename)

    # Decision tree
    algorithm = DecisionTreeQBE(dataframe, index_list)
    best_predicate = algorithm.search_best_predicate()
    results = get_results(best_predicate, dataframe, desired_set, undesired_set)
    write_file_results(results, 'tree' + filename)

    # Genetic programming
    algorithm = DEAPGeneticProgrammingQBE(dataframe, fitness_function)
    best_predicate = algorithm.simple_search(population_size=200, crossover_rate=0.90,
                                             mutation_rate=0.50, num_generations=100,
                                             max_gen_without_gain=30, verbose=False)
    results = get_results(best_predicate, dataframe, desired_set, undesired_set)
    write_file_results(results, 'gp' + filename)

In [None]:
# Experiment 3 uses the four measurements only; the class label column is left out.
dataframe = pd.DataFrame(data=iris['data'],
                         columns=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])

# Configure PandaSQL to query Pandas dataframe
pysql = lambda q: pdsql.sqldf(q, globals())

# Rows 50..99 are the examples - presumably the second Iris class (target == 1); confirm.
indices = list(range(50, 100))

# Plain loop instead of a list comprehension: the calls are made for their side effects
# only, and a comprehension as the cell's last expression would display a list of None.
for repetition in range(10):
    discover_without_target_class(dataframe, indices)

### Experiment 4: Discovering unknown queries on the CAR dataset

In [None]:
from datamanagement.dataaccessobject import DataAccessObject, Dataset

In [None]:
# Create the project's data access object and fetch the car dataset through it.
# The next cell reads the table from the returned object's `preprocessed_data` attribute.
DAO = DataAccessObject()
dataset = DAO.get_car_dataset()

In [None]:
# Switch the notebook-global `dataframe` to the preprocessed car table and give every
# column an explicit name.
# NOTE(review): unless `preprocessed_data` returns a copy, this renames the columns on the
# object held by `dataset` as well - confirm that is acceptable.
# NOTE(review): '4WD' starts with a digit; confirm the generated SQL predicates handle it.
dataframe = dataset.preprocessed_data
dataframe.columns = [
    # general vehicle attributes
    'Price', 'MPG', 'NumOfCylinders', 'horsepower', 'FuelTankCapacity',
    'RPM', 'Wheelbase', 'RearSeatRoom', 'Weight', 'AutomaticGearbox',
    'PassengerCapacity', 'length', 'width', 'LuggageCapacity', 'Origin',
    # vehicle type (names suggest indicator columns - verify against the preprocessing)
    'Compact', 'Large', 'Midsize', 'Small', 'Sporty', 'Van',
    # manufacturer
    'Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge',
    'Eagle', 'Ford', 'Geo', 'Honda', 'Hyundai', 'Infiniti', 'Lexus',
    'Lincoln', 'Mazda', 'MercedesBenz', 'Mercury', 'Mitsubishi', 'Nissan',
    'Oldsmobile', 'Plymouth', 'Pontiac', 'Saab', 'Saturn', 'Subaru',
    'Suzuki', 'Toyota', 'Volkswagen', 'Volvo',
    # drive train
    '4WD', 'Front', 'Rear',
    # air bags
    'DriverPassenger', 'DriverOnly', 'NoAirBags',
]


def pysql(q):
    """Run SQL text ``q`` through PandaSQL, resolving table names among the notebook globals."""
    return pdsql.sqldf(q, globals())

In [None]:
# Ten repetitions of the unknown-query experiment on the car data (tuples_count=25).
for repetition in range(10):
    unknown_queries_discovery(dataframe, tuples_count=25)
print('Finished')