# Handling Examples Outside the Database 

In [None]:
cd ..

In [None]:
cd ..

In [None]:
import pandas as pd
import numpy as np
import os
import random
import math
import operator

from minisom import MiniSom
from scipy.spatial import distance as spd
from sklearn.metrics import precision_recall_fscore_support

In [None]:
def get_pos_neg(query, data, som, num_selected_items):
    """Rank the rows of ``data`` by their proximity to ``query`` on the map.

    Every sample is sent to its winning neuron via ``som.winner`` and scored
    with the city-block distance between that neuron and the winner of
    ``query``. Ties keep their original row order (the sort is stable).

    Args:
        query: Sample handed to ``som.winner`` to locate the reference neuron.
        data: Iterable of samples, each accepted by ``som.winner``.
        som: Object exposing ``winner(sample)`` that returns grid coordinates.
        num_selected_items: Number of items requested on each side.

    Returns:
        A ``(positives, negatives)`` tuple of 1-based row positions in
        ``data``: ``positives`` lists the closest rows (nearest first) and
        ``negatives`` the farthest rows (farthest first). When more items are
        requested than ``data`` holds, both lists are truncated to the
        available rows instead of raising ``IndexError``.
    """
    reference = list(som.winner(query))

    # Positions are 1-based: the first row of ``data`` is reported as 1.
    distance_by_position = {
        position: spd.cityblock(reference, list(som.winner(sample)))
        for position, sample in enumerate(data, start=1)
    }
    ranked = sorted(distance_by_position.items(), key=operator.itemgetter(1))

    positives = [position for position, _ in ranked[:num_selected_items]]

    # Walk the ranking from the far end. Slicing (rather than indexing with
    # ``ranked[-j]``) makes an oversized request truncate like the positives
    # do; ``max(..., 0)`` keeps a non-positive request returning no negatives.
    farthest_first = ranked[::-1]
    negatives = [position for position, _ in farthest_first[:max(num_selected_items, 0)]]

    return positives, negatives

In [None]:
def experiment(name_data, original_data, preprocessed_data, queries, nexperiments):
    """Score SOM-based retrieval of each query's concept from a held-out example.

    For every query, the rows of ``original_data`` matching it form the
    "concept". In each repetition one concept member is drawn at random,
    removed from the training rows, and used as the probe: the SOM is trained
    on the remaining rows and the rows closest to the probe on the map are
    predicted as concept members. Precision, recall and F1 are computed
    against the concept membership of all rows.

    Args:
        name_data: Dataset label. Not used by the computation; kept so
            existing callers keep working.
        original_data: DataFrame on which each query string is evaluated with
            ``DataFrame.query``; the index labels of the matches define the
            concept.
        preprocessed_data: DataFrame of features used to train the SOM.
            NOTE(review): the code assumes it shares its index with
            ``original_data`` and that the index is 1..n (the probe row is
            fetched positionally as ``id - 1`` and labels are built from
            ``range(1, n + 1)``) - confirm against the dataset.
        queries: Sequence of query strings.
        nexperiments: Number of repetitions per (query, selection factor).

    Returns:
        DataFrame with one row per repetition and the columns ``precision``,
        ``recall``, ``f1score``, ``query_id``, ``selfactor`` and ``estimator``.
    """
    # Keep the full matrix under its own name: the probe row must always be
    # read from the complete data, never from a previous repetition's reduced
    # training matrix.
    full_data = preprocessed_data.values
    all_ids = list(preprocessed_data.index)
    n_rows = preprocessed_data.shape[0]

    results = []
    for query_id, query_expr in enumerate(queries):
        concept = original_data.query(query_expr).index.to_list()
        concept_ids = set(concept)

        # Ground truth does not depend on the repetition, so build it once.
        labels = [int(x in concept_ids) for x in range(1, n_rows + 1)]

        # Square map whose side is derived from the number of samples.
        num_neurons = 5 * (full_data.shape[0] ** 0.543)
        x_size = int(num_neurons ** 0.5) + 1
        y_size = int(num_neurons ** 0.5) + 1

        learning_rate = 0.8
        sigma = max(x_size, y_size) * 0.5

        # NOTE(review): one SOM is created per query and trained further in
        # every repetition, so a probe held out now may have been part of an
        # earlier repetition's training rows - confirm this is intended.
        som = MiniSom(x_size, y_size, full_data.shape[1],
                      sigma=sigma, learning_rate=learning_rate,
                      neighborhood_function='gaussian')

        training_iterations = 1000

        for fac in [1.0, 1.5, 2.0]:
            for _ in range(nexperiments):
                e = random.choice(concept)
                used_ids = [i for i in all_ids if i != e]

                query = full_data[int(e) - 1]
                train_data = preprocessed_data.loc[used_ids].values

                som.train_random(train_data, training_iterations, verbose=False)

                # Never ask for more items than there are training rows.
                n_selected = min(int(len(concept) * fac), len(train_data))
                pos, _ = get_pos_neg(query, train_data, som, n_selected)

                # get_pos_neg reports 1-based positions inside train_data;
                # translate them back to row ids, which differ from the
                # positions for every row after the held-out one.
                pos_ids = {used_ids[p - 1] for p in pos}
                predicted = [int(i in pos_ids) for i in all_ids]

                scores = precision_recall_fscore_support(labels, predicted, average='binary')
                results.append(list(scores[:3]) + [query_id, fac, 'SOM'])

    df = pd.DataFrame(data=results, columns=['precision', 'recall', 'f1score', 'query_id', 'selfactor', 'estimator'])
    return df

In [None]:
# Raw car table: column names get '.' replaced by '_' and are lower-cased,
# and the two 0/1 flag columns are mapped to booleans.
cartable = pd.read_pickle(os.path.join('datasets', 'car_original_dataset.pkl'))
cartable.columns = [name.replace('.', '_').lower() for name in cartable.columns]
cartable['automatic_gearbox'] = cartable['automatic_gearbox'].map({0: False, 1: True})
cartable['origin'] = cartable['origin'].map({0: False, 1: True})

# Feature table that is handed to the experiment for SOM training.
preprocessed_data = pd.read_pickle(os.path.join('datasets', '1993CarsPrep.pkl'))

# Query strings evaluated against ``cartable``; each one defines a concept.
queries = [
    "type != 'Sporty' and origin == 1",
    "automatic_gearbox == 1 and horsepower >= 150",
    "price <= 7000 and mpg >= 26 and automatic_gearbox == 0",
    "manufacturer == 'Ford' or manufacturer == 'Chevrolet'",
]

In [None]:
# Run every query with 10 repetitions per selection factor.
result_exp = experiment(
    name_data='1993Cars',
    original_data=cartable,
    preprocessed_data=preprocessed_data,
    queries=queries,
    nexperiments=10,
)

In [None]:
# Mean scores per (query, selection factor, estimator).
(
    result_exp
    .groupby(['query_id', 'selfactor', 'estimator'])
    .mean()
    [['f1score', 'precision', 'recall']]
)