# Selecting Data Provider
Publication information: Martins, D. M. L., Vossen, G., & de Lima Neto, F. B. (2017, August). Intelligent decision support for data purchase. In Proceedings of the International Conference on Web Intelligence (pp. 396-402).

URL: https://dl.acm.org/doi/10.1145/3106426.3106434

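Publication's BibTeX:

    @inproceedings{martins2017intelligent,
      author    = {Martins, D. M. L. and Vossen, G. and de Lima Neto, F. B.},
      title     = {Intelligent decision support for data purchase},
      booktitle = {Proceedings of the International Conference on Web Intelligence},
      pages     = {396--402},
      year      = {2017},
      month     = aug,
      doi       = {10.1145/3106426.3106434}
    }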

### Configuring notebook

In [None]:
# IPython magic that enables inline matplotlib figures.
# NOTE(review): later cells use `np` without importing it, so it is presumably
# injected into the namespace by this magic -- confirm before removing it.
%pylab inline

In [None]:
cd ..

In [None]:
cd ..

#### Importing libraries

In [None]:
# Source: https://scikit-criteria.readthedocs.io/en/latest/index.html
from skcriteria import Data, MIN, MAX
from skcriteria.madm import closeness, simple
import pandas as pd
import os

#### Loading and configuring data set

In [None]:
# Load the original provider catalogue from the pickled data set
providers = pd.read_pickle(os.path.join('datasets', 'uci-original-no-na.pkl'))
# Year and NumInstances are stored as 32-bit integers
for int_column in ('Year', 'NumInstances'):
    providers[int_column] = providers[int_column].astype('int32')
# Rescale the price column by a fixed factor of 15
providers['Price'] = providers['Price'] / 15

#### We assume the user is interested in data sets of a specific Area

In [None]:
# Keep only the providers whose Area is 'Life'.
# An explicit copy is taken because later cells add columns (TopsisRank,
# Relevance) to this frame; without it pandas regards the frame as a slice of
# `providers` and emits SettingWithCopyWarning on those assignments.
selected_providers = providers[providers.Area == 'Life'].copy()

In [None]:
# Load the preprocessed version of the data set; it is indexed with the same
# labels as `providers` (later cells look rows up via selected_providers.index).
preprocessed_data = pd.read_pickle(os.path.join('datasets', 'uci-preprocessed.pkl'))
# Rescale the price column by a fixed factor of 5
# NOTE(review): the original catalogue divides Price by 15 instead -- confirm the two factors are intentional.
preprocessed_data['Price'] = preprocessed_data['Price']/5

#### Configuring data for TOPSIS

In [None]:
# Criteria that take part in the decision, in the order used by the MCDA model
columns_assessment_view = ['NumInstances', 'Year', 'MissingValues', 'Price']
# Preprocessed values of those criteria, restricted to the selected providers
assessment_view = preprocessed_data.loc[selected_providers.index, columns_assessment_view]
# Names of the candidate providers, in selected_providers row order
names = selected_providers['Name'].values

In [None]:
# Optimisation sense per criterion, positionally matching
# columns_assessment_view: NumInstances and Year are maximised,
# MissingValues and Price are minimised.
criteria = [MAX, MAX, MIN, MIN]
weights = [0.25, 0.25, 0.25, 0.25] # Equal weights for TOPSIS

In [None]:
# Build the scikit-criteria decision matrix: one row per provider (anames) and
# one column per criterion (cnames).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray on every pandas version.
mcda_data = Data(
    assessment_view.values.tolist(),
    criteria,
    weights=weights,
    anames=names,
    cnames=columns_assessment_view,
)

#### Creating TOPSIS

In [None]:
# Run TOPSIS (from scikit-criteria's closeness module) on the decision matrix
model = closeness.TOPSIS()
decision = model.decide(mcda_data)

In [None]:
# Show the extra information stored on the decision object (decision.e_):
# the ideal point, the anti-ideal point and the closeness values.
print("Ideal:", decision.e_.ideal)
print("Anti-Ideal:", decision.e_.anti_ideal)
print("Closeness:", decision.e_.closeness)

In [None]:
# decision.best_alternative_ is used as a row position in selected_providers
best_alternative = selected_providers.iloc[int(decision.best_alternative_)]
# Display every row that carries the best alternative's Name
selected_providers.loc[selected_providers['Name'] == best_alternative['Name']]

In [None]:
# Store the TOPSIS rank of every alternative as a new column.
# NOTE(review): assumes decision.rank_ follows the row order of
# selected_providers (the order used to build mcda_data) -- confirm.
selected_providers['TopsisRank'] = decision.rank_

In [None]:
# Order the providers by ascending TOPSIS rank and display the first ten rows
topsis_result = selected_providers.sort_values('TopsisRank')
topsis_result.iloc[:10]

#### Showing that modeling preferences as hard constraints is of no help in this case

In [None]:
# The user's preferences expressed as hard filters: no missing values, price of
# at most 190, at least 200 instances and a year of 1990 or later.
selected_providers.query("MissingValues =='No' and Price <= 190 and NumInstances >= 200 and Year >= 1990")

## Using the iSM

### Selecting providers using preferences as soft constraints

In [None]:
def evaluate(provider, iteration, budget=190, year=1990, rows=200):
    """Score how well a provider satisfies the user's soft preferences.

    Each preference contributes a signed distance to its threshold, scaled by
    the largest value of the corresponding column in the module-level
    ``selected_providers`` table, so a provider that misses a threshold is
    penalised instead of being discarded.

    Parameters
    ----------
    provider : mapping-like
        One provider record with the keys ``'Price'``, ``'Year'``,
        ``'NumInstances'`` and ``'MissingValues'``.
    iteration : int
        Number of the refinement round. The missing-values preference only
        counts from iteration 2 on, the instance-count preference from
        iteration 3 on.
    budget : number, optional
        Price threshold (default 190); cheaper providers score higher.
    year : number, optional
        Year threshold (default 1990); more recent providers score higher.
    rows : number, optional
        Instance-count threshold (default 200); larger data sets score higher.

    Returns
    -------
    float
        The relevance score, rounded to three decimal places.
    """
    # 1 when the provider reports no missing values, otherwise 0
    relnulval = int(provider['MissingValues'] == 'No')
    # Positive when the provider is below budget / newer / larger than asked
    relprice = budget - provider['Price']
    relyear = provider['Year'] - year
    relrows = provider['NumInstances'] - rows

    # Preferences that are not yet active in this iteration contribute nothing
    if iteration < 2:
        relnulval = 0
    if iteration < 3:
        relrows = 0

    # NOTE: the column maxima are recomputed on every call; they come from the
    # global table, not from `provider`.
    score = (
        relprice / max(selected_providers['Price'])
        + relyear / max(selected_providers['Year'])
        + relrows / max(selected_providers['NumInstances'])
        + relnulval
    )

    # Round through a fixed-point string, then convert back to float
    return float(format(score, '.3f'))

In [None]:
# Score every selected provider, row by row, with all soft constraints active
# (iteration=3 enables the missing-values and instance-count preferences).
selected_providers['Relevance'] = [evaluate(selected_providers.iloc[i], iteration=3) for i in range(selected_providers.shape[0])]

In [None]:
# Ten providers with the highest Relevance, best first
ism_selected = selected_providers.sort_values('Relevance', ascending=False).head(10)
ism_selected

#### Selecting the Pareto front

In [None]:
def identify_pareto(scores):
    """Return the row positions of the non-dominated items in ``scores``.

    ``scores`` is a 2-D NumPy array with one row per item and one column per
    objective; larger values are better for every objective. Row ``i`` is
    dominated when some row is at least as good in every column and strictly
    better in at least one. Identical rows do not dominate each other.

    Returns a NumPy integer array with the zero-based positions of the rows
    on the Pareto front, in ascending order.
    """
    n_items = scores.shape[0]
    # Nothing is known to be dominated before the comparisons start
    dominated = np.zeros(n_items, dtype=bool)
    for position in range(n_items):
        candidate = scores[position]
        # Compare the candidate against every row at once
        at_least_as_good = np.all(scores >= candidate, axis=1)
        strictly_better = np.any(scores > candidate, axis=1)
        # One dominating row is enough to drop the candidate from the front
        dominated[position] = np.any(at_least_as_good & strictly_better)
    return np.arange(n_items)[~dominated]

In [None]:
# The Pareto analysis uses only the first and last decision criteria
# (NumInstances and Price) together with their optimisation sense.
pareto_columns = [columns_assessment_view[pos] for pos in (0, -1)]
pareto_criteria = [criteria[pos] for pos in (0, -1)]
# Independent copy of those two columns for the providers pre-selected by the iSM
data_pareto_analysis = assessment_view.loc[ism_selected.index, pareto_columns].copy(deep=True)

In [None]:
# identify_pareto treats larger values as better, so every criterion that must
# be minimised is replaced by its reciprocal.
# NOTE(review): the reciprocal preserves the ordering only for strictly
# positive values -- confirm the preprocessed columns are positive.
for column_name, sense in zip(data_pareto_analysis.columns, pareto_criteria):
    if sense == MIN:
        data_pareto_analysis[column_name] = 1 / data_pareto_analysis[column_name]

In [None]:
# Row positions (not index labels) of the non-dominated rows
pareto_index = identify_pareto(data_pareto_analysis.values)

In [None]:
# Map the positions back to providers: data_pareto_analysis was built from
# ism_selected.index, so positions in both frames refer to the same rows.
ism_pareto = ism_selected.iloc[pareto_index].sort_values(by='Relevance', ascending=False)
ism_pareto

#### Discriminating providers with SOM

In [None]:
import somoclu

In [None]:
# Feature table for the SOM: preprocessed rows of the selected providers,
# without the Name column.
data = preprocessed_data.loc[selected_providers.index].copy(deep=True)
del data['Name']

# Scale every row (axis 1) to unit Euclidean norm
# NOTE(review): a row whose norm is 0 would be divided by zero -- confirm none exists.
data = np.apply_along_axis(lambda x: x/np.linalg.norm(x), 1, data)

# Map dimensions and initial training radius (20% of the longer map side)
x_size = 8
y_size = 4
sigma = max(x_size, y_size)*0.2
# NOTE(review): built from all providers (not only the selected ones) and not
# referenced by any later cell of this notebook.
labels = [providers.iloc[i]['Name'] for i in range(providers.shape[0])]

som = somoclu.Somoclu(n_columns=x_size, n_rows=y_size, compactsupport=False, initialization='pca')
# %time reports how long the training call takes
%time som.train(data=data, epochs=1000, radius0=sigma)

In [None]:
# U-matrix with best-matching units; providers on the Pareto front are labelled
# with their name, every other provider gets an empty label.
fig = som.view_umatrix(
    bestmatches=True,
    labels=[
        selected_providers.loc[provider_id]['Name'] if provider_id in ism_pareto.index else ''
        for provider_id in selected_providers.index
    ],
    colorbar=True,
    figsize=(x_size*2, y_size*2),
)

In [None]:
# Same U-matrix, but every best-matching unit is labelled with the provider's index label
fig = som.view_umatrix(bestmatches=True, labels=selected_providers.index, colorbar=True, figsize=(x_size*2, y_size*2))

#### Getting similar data providers based on SOM 

In [None]:
def get_neighborhood(centroid_2d_position, step, x_size, y_size):
    """Return the grid cells around a map position that lie inside the map.

    The candidates are the centre cell followed by its eight neighbours at a
    distance of ``step`` along each axis. A candidate is kept when its first
    coordinate is in ``range(x_size)`` and its second in ``range(y_size)``.
    Duplicates are not removed, so ``step=0`` yields the centre nine times.

    Returns a list of ``[first, second]`` coordinate lists in visiting order.
    """
    first, second = centroid_2d_position[0], centroid_2d_position[1]
    # Coordinate values reachable along each axis, keyed by direction
    first_at = {-1: first - step, 0: first, 1: first + step}
    second_at = {-1: second - step, 0: second, 1: second + step}
    # Centre first, then a walk around it
    directions = (
        (0, 0), (0, -1), (-1, -1), (-1, 0), (-1, 1),
        (0, 1), (1, 1), (1, 0), (1, -1),
    )
    valid_first, valid_second = range(x_size), range(y_size)
    return [
        [first_at[d_first], second_at[d_second]]
        for d_first, d_second in directions
        if first_at[d_first] in valid_first and second_at[d_second] in valid_second
    ]

In [None]:
# For every provider on the Pareto front, collect the selected providers whose
# best-matching unit (BMU) lies in the neighbourhood of that provider's own BMU.
# som.bmus has one row per row of selected_providers (the SOM was trained on
# those rows in that order), so the centre must be looked up by the provider's
# position in selected_providers -- not by its position inside ism_pareto.
row_of_provider = {provider_id: row for row, provider_id in enumerate(selected_providers.index)}
similar = {}
for pareto_id in ism_pareto.index:
    # Neighbourhood with step 0: only the map cell of the provider itself
    pareto_cells = get_neighborhood(som.bmus[row_of_provider[pareto_id]], 0, x_size, y_size)
    similar[pareto_id] = [
        other_id
        for other_id, bmu in zip(selected_providers.index, som.bmus)
        if bmu.tolist() in pareto_cells
    ]

In [None]:
# Distinct ids of all providers that appear in any of the similarity lists
ids = list({provider_id for members in similar.values() for provider_id in members})

In [None]:
# Similar providers that were not already among the ten iSM selections
providers.loc[[provider_id for provider_id in ids if provider_id not in ism_selected.index]]

In [None]:
# Display the mapping: Pareto-front provider id -> ids of the providers collected for it from the SOM
similar