# SOM-based strategy

### Configuring notebook

In [None]:
cd ..

In [None]:
cd ..

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from minisom import MiniSom

### Define functions for SOM

In [None]:
def create_som(data, x_size, y_size):
    """Build an untrained MiniSom map for ``data``.

    The grid has ``x_size`` by ``y_size`` units and its input length is taken
    from ``data.shape[1]``. Hyperparameters are fixed: the learning rate is
    0.1, sigma is half of the larger grid dimension, and the neighborhood
    function is gaussian.
    """
    return MiniSom(
        x_size,
        y_size,
        data.shape[1],
        sigma=max(x_size, y_size) * 0.5,
        learning_rate=0.1,
        neighborhood_function='gaussian',
    )

In [None]:
def som_classification(som, data, class_assignments):
    """Classify examples as positive if they are close to positive examples in the topological map.

    Each item of ``data`` is mapped to its winning unit through
    ``som.winner``. The item is predicted positive (1) when the label counts
    stored for that unit in ``class_assignments`` contain the label ``1``,
    and negative (0) otherwise, including when the unit has no label counts.

    Args:
        som: Object exposing ``winner(item)``, returning a hashable unit key.
        data: Iterable of items to classify.
        class_assignments: Mapping from unit key to a mapping keyed by label.
            A unit that is missing, or whose value is a list, is treated as
            having no labelled examples.

    Returns:
        A list of 0/1 predictions, one per item of ``data``, in order.
    """
    prediction = []
    for item in data:
        # Look the unit up with .get() rather than indexing: indexing raises
        # KeyError on a plain dict and, on a defaultdict, inserts an empty
        # entry for every unlabelled unit, mutating the caller's mapping.
        labels_at_unit = class_assignments.get(som.winner(item))
        if labels_at_unit is None or isinstance(labels_at_unit, list):
            prediction.append(0)
        else:
            prediction.append(int(1 in labels_at_unit))

    return prediction

### Experiment setup
This experiment takes a selectivity factor $f \in \{0.2, 0.5, 0.8\}$ representing the fraction of data items the user provides as examples. For instance, if the complete set of examples consists of $10$ data items, a selectivity factor $f = 0.2$ represents the case where the user provides only $2$ data items as examples.

In this sense, the idea of this experiment is to verify whether the SOM technique can automatically select additional data items that are similar to the user-provided examples. Returning to the previous example, we check whether the SOM can select the data items in the complete set of examples, which is composed of $10$ data items.

#### SOM parameters
In this experiment, we create a square SOM with parameters defined as follows:
- Number of units (i.e., neurons): We use the rule described in the [Documentation of the MATLAB SOM TOOLBOX](http://www.cis.hut.fi/projects/somtoolbox/documentation/somalg.shtml), that is, $M \approx 5\cdot\sqrt{N}$, where $N$ is the number of training data instances (the code uses the exponent $0.543$ rather than exactly $0.5$).
- Learning rate: 0.1
- Sigma: Half of the highest dimension, that is, $\sigma = \max(x_{size}, y_{size})\cdot 0.5$

In [None]:
def experiment(name_data, original_data, preprocessed_data, queries, nexperiments):
    """Run the SOM-based classification experiment for every query and factor.

    For each query, the rows of ``original_data`` selected by the query form
    the positive class. For each factor in ``[0.2, 0.5, 0.8]`` and each of
    ``nexperiments`` repetitions, the data is split with a stratified
    ``train_test_split``, a fresh SOM is trained on the complete data, the
    training part is used to label the SOM units, and the test part is
    classified and scored.

    Args:
        name_data: Name of the data set. Not used by the function body; kept
            so existing callers keep working.
        original_data: DataFrame on which the query strings are evaluated.
        preprocessed_data: DataFrame whose ``.values`` are fed to the SOM.
        queries: Sequence of query strings accepted by ``DataFrame.query``.
        nexperiments: Number of repetitions per (query, factor) pair.

    Returns:
        DataFrame with one row per repetition and the columns ``precision``,
        ``recall``, ``f1score``, ``estimator``, ``queryid`` and ``factorex``.
    """
    data = preprocessed_data.values
    n_items = data.shape[0]

    # Automatically selecting SOM size. It depends only on the number of data
    # items, so it is computed once instead of once per repetition.
    num_neurons = 5 * (n_items ** 0.543)
    x_size = int(num_neurons ** 0.5) + 1
    y_size = int(num_neurons ** 0.5) + 1
    training_iterations = 1000

    result = []
    for query_id, query in enumerate(queries):
        # A set gives constant-time membership tests while building the labels.
        concept = set(original_data.query(query).index.to_list())
        # Row i of ``data`` is matched against index label i + 1.
        # NOTE(review): this assumes original_data is indexed 1..N in the same
        # row order as preprocessed_data - confirm against the pickled data.
        labels = [int(x in concept) for x in range(1, n_items + 1)]

        for factor_ex in [0.2, 0.5, 0.8]:
            for _ in range(nexperiments):
                # Splitting training and test data.
                # NOTE(review): factor_ex is passed as test_size, so the
                # labelled (training) share is 1 - factor_ex. Confirm that
                # this is the intended meaning of the selectivity factor.
                X_train, X_test, y_train, y_test = train_test_split(
                    data, labels, test_size=factor_ex, stratify=labels)

                # Training a fresh SOM on the complete data; training does
                # not use the labels.
                som = create_som(data, x_size, y_size)
                som.train_random(data, training_iterations, verbose=False)

                # Labelling the SOM units with the training examples and
                # classifying the held-out items.
                class_assignments = som.labels_map(X_train, y_train)
                predicted = som_classification(som, X_test, class_assignments)

                # Evaluating results
                scores = precision_recall_fscore_support(y_test, predicted, average='binary')
                report = list(scores[:3]) + ['SOM', query_id, factor_ex]
                result.append(report)

    df = pd.DataFrame(data=result, columns=['precision', 'recall', 'f1score', 'estimator', 'queryid', 'factorex'])
    return df

#### Data set: [1993 New Car Data](http://jse.amstat.org/datasets/93cars.txt)

In [None]:
# Load the original car table and normalise its column names:
# dots become underscores and everything is lower-cased.
cartable = pd.read_pickle(os.path.join('datasets', 'car_original_dataset.pkl'))
cartable = cartable.rename(columns=lambda col: col.replace('.', '_').lower())

# Recode the two 0/1 indicator columns as booleans.
cartable['origin'] = cartable['origin'].map({0: False, 1: True})
cartable['automatic_gearbox'] = cartable['automatic_gearbox'].map({0: False, 1: True})

# Preprocessed version of the data set, passed to experiment() as the SOM input.
preprocessed_data = pd.read_pickle(os.path.join('datasets', '1993CarsPrep.pkl'))

# Queries evaluated on ``cartable``; each one defines a positive class.
queries = [
    "type != 'Sporty' and origin == 1",
    "automatic_gearbox == 1 and horsepower >= 150",
    "price <= 7000 and mpg >= 26 and automatic_gearbox == 0",
    "manufacturer == 'Ford' or manufacturer == 'Chevrolet'",
]

In [None]:
# Run the experiment on the 1993 cars data with 10 repetitions per setting.
result_exp = experiment(
    name_data='1993Cars',
    original_data=cartable,
    preprocessed_data=preprocessed_data,
    queries=queries,
    nexperiments=10,
)

#### Checking results
We show the average results over the 10 repetitions of the experiment.

In [None]:
# Select the metric columns before averaging: calling mean() on the whole
# group also covers the string column 'estimator', which raises a TypeError
# on pandas >= 2.0.
result_exp.query('estimator=="SOM"').groupby(['factorex'])[['f1score', 'precision', 'recall']].mean()