In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
cd ..

C:\Users\d_mart04\Google Drive\Doutorado\gitworkspace\phd-query-synthesis\PhDCoding


In [3]:
# Source: https://scikit-criteria.readthedocs.io/en/latest/index.html
from skcriteria import Data, MIN, MAX
from skcriteria.madm import closeness, simple
import pandas as pd
import os

In [4]:
providers = pd.read_pickle(os.path.join('datasets', 'uci-original-no-na.pkl'))
providers['Year'] = providers['Year'].astype('int32')
providers['NumInstances'] = providers['NumInstances'].astype('int32')
providers['Price'] = providers['Price']/15

providers.shape

(81, 10)

In [5]:
selected_providers = providers[providers.Area == 'Life']
selected_providers.shape

(37, 10)

In [6]:
preprocessed_data = pd.read_pickle(os.path.join('datasets', 'uci-preprocessed.pkl'))
preprocessed_data['Price'] = preprocessed_data['Price']/5

In [7]:
def export_to_latex(dataframe, relevance=False):
    latex_cols = ['Name', 'NumAttributes', 'NumInstances', 'Year', 'MissingValues', 'Price']
    if relevance:
        latex_cols = latex_cols + ['Relevance']
    else:
        latex_cols = latex_cols + ['TopsisRank']
        
    print(dataframe[latex_cols][:10].to_latex())

In [8]:
# Use only the attributes that are important to the decision
columns_assessment_view = ['NumInstances', 'Year', 'MissingValues', 'Price']
assessment_view = preprocessed_data.loc[selected_providers.index][columns_assessment_view]
# Get names of the selected options
names = selected_providers.Name.values

In [9]:
criteria = [MAX, MAX, MIN, MIN]
weights = [0.25, 0.25, 0.25, 0.25]

In [10]:
mcda_data = Data(assessment_view.as_matrix().tolist(), criteria, weights=weights, anames=names, cnames=columns_assessment_view)

  """Entry point for launching an IPython kernel.


In [11]:
model = closeness.TOPSIS()
decision = model.decide(mcda_data)
decision

ALT./CRIT.,NumInstances (max) W.0.25,Year (max) W.0.25,MissingValues (min) W.0.25,Price (min) W.0.25,Rank
Abalone,0.0010809,0.32,1,0.152868,36
Arrhythmia,0.000113492,0.44,0,0.0491349,10
Audiology (Standardized),5.47982e-05,0.2,0,0.012566,8
Breast Cancer,7.03807e-05,0.04,0,0.0129302,12
Breast Cancer Wisconsin (Original),0.00017764,0.2,0,0.0639592,21
Breast Cancer Wisconsin (Prognostic),4.75264e-05,0.32,0,0.0336915,11
Breast Cancer Wisconsin (Diagnostic),0.000143878,0.32,1,0.0822072,31
Contraceptive Method Choice,0.000378653,0.4,1,0.145365,34
Covertype,0.150889,0.44,1,0.157968,1
Dermatology,9.11572e-05,0.44,0,0.051466,13


In [12]:
print("Ideal:", decision.e_.ideal)
print("Anti-Ideal:", decision.e_.anti_ideal)
print("Closeness:", decision.e_.closeness)

Ideal: [0.24810041 0.09503479 0.         0.        ]
Anti-Ideal: [7.25942998e-06 0.00000000e+00 6.06339063e-02 8.76954370e-02]
Closeness: [0.11189142 0.27838117 0.28538815 0.27761721 0.24610005 0.27796498
 0.1767051  0.13709868 0.6945268  0.27675635 0.28909131 0.2102109
 0.20943372 0.25607867 0.28402352 0.25754313 0.19869944 0.27333287
 0.19792815 0.19077199 0.0756777  0.16024232 0.20624298 0.25308264
 0.27613711 0.25476927 0.26694235 0.17040871 0.22903104 0.22563521
 0.12661695 0.20862629 0.30263251 0.30782349 0.28884291 0.29951018
 0.34526627]


In [13]:
best_alternative = selected_providers.iloc[int(decision.best_alternative_)]
selected_providers.loc[selected_providers['Name'] == best_alternative['Name']]

Unnamed: 0,Name,DataTypes,DefaultTask,AttributeTypes,NumInstances,NumAttributes,Year,MissingValues,Area,Price
16,Covertype,Multivariate,Classification,"Categorical, Integer",581012,54,1998,No,Life,381.266667


In [14]:
selected_providers['TopsisRank'] = decision.rank_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
topsis_result = selected_providers.sort_values(by='TopsisRank')
topsis_result[:10]

Unnamed: 0,Name,DataTypes,DefaultTask,AttributeTypes,NumInstances,NumAttributes,Year,MissingValues,Area,Price,TopsisRank
16,Covertype,Multivariate,Classification,"Categorical, Integer",581012,54,1998,No,Life,381.266667,1
77,KEGG Metabolic Reaction Network (Undirected),"Multivariate, Univariate, Text","Classification, Regression, Clustering","Integer, Real",65554,29,2011,Yes,Life,307.733333,2
72,Plants,Multivariate,Clustering,Categorical,22632,70,2008,Yes,Life,293.0,3
69,Mammographic Mass,Multivariate,Classification,Integer,961,6,2007,Yes,Life,252.933333,4
75,p53 Mutants,Multivariate,Classification,Real,16772,5409,2010,Yes,Life,420.333333,5
19,Echocardiogram,Multivariate,Classification,"Categorical, Integer, Real",132,12,1989,Yes,Life,92.133333,6
74,Acute Inflammations,Multivariate,Classification,"Categorical, Integer",120,6,2009,No,Life,204.466667,7
4,Audiology (Standardized),Multivariate,Classification,Categorical,226,69,1992,Yes,Life,115.133333,8
25,Hepatitis,Multivariate,Classification,"Categorical, Integer, Real",155,19,1988,Yes,Life,100.666667,9
2,Arrhythmia,Multivariate,Classification,"Categorical, Integer, Real",452,279,1998,Yes,Life,182.066667,10


In [16]:
export_to_latex(topsis_result, relevance=False)

\begin{tabular}{lllrrlrr}
\toprule
{} &                                          Name & NumAttributes &  NumInstances &  Year & MissingValues &       Price &  TopsisRank \\
\midrule
16 &                                     Covertype &           54  &        581012 &  1998 &            No &  381.266667 &           1 \\
77 &  KEGG Metabolic Reaction Network (Undirected) &           29  &         65554 &  2011 &           Yes &  307.733333 &           2 \\
72 &                                        Plants &           70  &         22632 &  2008 &           Yes &  293.000000 &           3 \\
69 &                             Mammographic Mass &            6  &           961 &  2007 &           Yes &  252.933333 &           4 \\
75 &                                   p53 Mutants &         5409  &         16772 &  2010 &           Yes &  420.333333 &           5 \\
19 &                                Echocardiogram &           12  &           132 &  1989 &           Yes &   92.133333 &      

#### Hard constraints

In [17]:
selected_providers.query("MissingValues =='No' and Price <= 190 and NumInstances >= 200 and Year >= 1990")

Unnamed: 0,Name,DataTypes,DefaultTask,AttributeTypes,NumInstances,NumAttributes,Year,MissingValues,Area,Price,TopsisRank


## Using the iSM

### Selecting providers using preferences as soft constraints

In [18]:
def evaluate(provider, iteration):
    budget = 190
    year = 1990
    rows = 200
    
    relnulval = int(provider['MissingValues'] == 'No')
    
    relprice = budget - provider['Price']
    
    relyear = provider['Year'] - year 
    
    relrows = provider['NumInstances'] - rows
    
    if iteration < 2:
        relnulval = 0
        
    if iteration < 3:
        relrows = 0
    
    score = relprice/max(selected_providers['Price']) + relyear/max(selected_providers['Year']) + relrows/max(selected_providers['NumInstances']) + relnulval
    
    return float(str.format('{:.3f}',  score))

In [19]:
selected_providers['Relevance'] = [evaluate(selected_providers.iloc[i], iteration=3) for i in range(selected_providers.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
ism_selected = selected_providers.sort_values(by='Relevance', ascending=False)[:10]
ism_selected

Unnamed: 0,Name,DataTypes,DefaultTask,AttributeTypes,NumInstances,NumAttributes,Year,MissingValues,Area,Price,TopsisRank,Relevance
16,Covertype,Multivariate,Classification,"Categorical, Integer",581012,54,1998,No,Life,381.266667,1,1.549
64,Zoo,Multivariate,Classification,"Categorical, Integer",101,17,1990,No,Life,158.533333,26,1.075
29,Iris,Multivariate,Classification,Real,150,4,1988,No,Life,168.466667,28,1.05
34,Lymphography,Multivariate,Classification,Categorical,148,18,1988,No,Life,169.666667,29,1.047
36,Molecular Biology (Promoter Gene Sequences),"Sequential, Domain-Theory",Classification,Categorical,106,58,1990,No,Life,186.533333,30,1.008
20,Ecoli,Multivariate,Classification,Real,336,8,1996,No,Life,194.266667,24,0.993
74,Acute Inflammations,Multivariate,Classification,"Categorical, Integer",120,6,2009,No,Life,204.466667,7,0.975
53,Soybean (Small),Multivariate,Classification,Categorical,47,35,1987,No,Life,209.466667,32,0.952
56,SPECT Heart,Multivariate,Classification,Categorical,267,22,2001,No,Life,218.2,22,0.938
57,SPECTF Heart,Multivariate,Classification,Integer,267,44,2001,No,Life,225.8,23,0.92


In [21]:
export_to_latex(ism_selected, relevance=True)

\begin{tabular}{lllrrlrr}
\toprule
{} &                                         Name & NumAttributes &  NumInstances &  Year & MissingValues &       Price &  Relevance \\
\midrule
16 &                                    Covertype &           54  &        581012 &  1998 &            No &  381.266667 &      1.549 \\
64 &                                          Zoo &           17  &           101 &  1990 &            No &  158.533333 &      1.075 \\
29 &                                         Iris &            4  &           150 &  1988 &            No &  168.466667 &      1.050 \\
34 &                                 Lymphography &           18  &           148 &  1988 &            No &  169.666667 &      1.047 \\
36 &  Molecular Biology (Promoter Gene Sequences) &           58  &           106 &  1990 &            No &  186.533333 &      1.008 \\
20 &                                        Ecoli &            8  &           336 &  1996 &            No &  194.266667 &      0.993 \\
74 &

#### Select pareto-front 

In [22]:
def identify_pareto(scores):
    # Count number of items
    population_size = scores.shape[0]
    # Create a NumPy index for scores on the pareto front (zero indexed)
    population_ids = np.arange(population_size)
    # Create a starting list of items on the Pareto front
    # All items start off as being labelled as on the Parteo front
    pareto_front = np.ones(population_size, dtype=bool)
    # Loop through each item. This will then be compared with all other items
    for i in range(population_size):
        # Loop through all other items
        for j in range(population_size):
            # Check if our 'i' pint is dominated by out 'j' point
            if all(scores[j] >= scores[i]) and any(scores[j] > scores[i]):
                # j dominates i. Label 'i' point as not on Pareto front
                pareto_front[i] = 0
                # Stop further comparisons with 'i' (no more comparisons needed)
                break
    # Return ids of scenarios on pareto front
    return population_ids[pareto_front]

In [23]:
pareto_columns = [columns_assessment_view[0], columns_assessment_view[-1]]
pareto_criteria = [criteria[0], criteria[-1]]
data_pareto_analysis = assessment_view[pareto_columns].loc[ism_selected.index].copy(deep=True)

In [24]:
for i in range(len(pareto_criteria)):
    if pareto_criteria[i] == MIN:
        data_pareto_analysis[data_pareto_analysis.columns[i]] = 1/data_pareto_analysis[data_pareto_analysis.columns[i]] 

In [25]:
pareto_index = identify_pareto(data_pareto_analysis.values)

In [26]:
ism_pareto = ism_selected.iloc[pareto_index].sort_values(by='Relevance', ascending=False)
ism_pareto

Unnamed: 0,Name,DataTypes,DefaultTask,AttributeTypes,NumInstances,NumAttributes,Year,MissingValues,Area,Price,TopsisRank,Relevance
16,Covertype,Multivariate,Classification,"Categorical, Integer",581012,54,1998,No,Life,381.266667,1,1.549
64,Zoo,Multivariate,Classification,"Categorical, Integer",101,17,1990,No,Life,158.533333,26,1.075
29,Iris,Multivariate,Classification,Real,150,4,1988,No,Life,168.466667,28,1.05
20,Ecoli,Multivariate,Classification,Real,336,8,1996,No,Life,194.266667,24,0.993


#### Discriminating providers with SOM

In [27]:
import somoclu

If you installed Somoclu with pip on Windows, this typically means missing DLLs. Please refer to the documentation.


In [28]:
data = preprocessed_data.loc[selected_providers.index].copy(deep=True)
del data['Name']

data = np.apply_along_axis(lambda x: x/np.linalg.norm(x), 1, data)

x_size = 8
y_size = 4
sigma = max(x_size, y_size)*0.2
labels = [providers.iloc[i]['Name'] for i in range(providers.shape[0])]

som = somoclu.Somoclu(n_columns=x_size, n_rows=y_size, compactsupport=False, initialization='pca')
%time som.train(data=data, epochs=1000, radius0=sigma)



NameError: name 'wrap_train' is not defined

In [29]:
fig = som.view_umatrix(bestmatches=True, labels=[selected_providers.loc[i]['Name'] if i in ism_pareto.index else '' for i in selected_providers.index], colorbar=True, figsize=(x_size*2, y_size*2))

IndexError: too many indices for array

<Figure size 1152x576 with 0 Axes>

In [None]:
fig = som.view_umatrix(bestmatches=True, labels=selected_providers.index, colorbar=True, figsize=(x_size*2, y_size*2))

In [None]:
def get_neighborhood(centroid_2d_position, step, x_size, y_size):
    search_space = [(centroid_2d_position[0], centroid_2d_position[1]),
                    (centroid_2d_position[0], centroid_2d_position[1]-step), 
                    (centroid_2d_position[0]-step, centroid_2d_position[1]-step),
                    (centroid_2d_position[0]-step, centroid_2d_position[1]),
                    (centroid_2d_position[0]-step, centroid_2d_position[1]+step), 
                    (centroid_2d_position[0], centroid_2d_position[1]+step),
                    (centroid_2d_position[0]+step, centroid_2d_position[1]+step),
                    (centroid_2d_position[0]+step, centroid_2d_position[1]),
                    (centroid_2d_position[0]+step, centroid_2d_position[1]-step)]

    neighborhood = []
    for region in search_space:
        if region[0] in range(x_size) and region[1] in range(y_size):
            neighborhood.append(list(region))

    return neighborhood

In [None]:
similar = {}
for j in range(ism_pareto.shape[0]):
    similar[ism_pareto.index[j]] = [selected_providers.index[i] for i in range(selected_providers.shape[0]) if som.bmus[i].tolist() in get_neighborhood(som.bmus[j-1], 0, x_size, y_size)]

In [None]:
ids = set()
for k,v in similar.items():
    for e in v:
        ids.add(e)
ids = list(ids)

In [None]:
providers.loc[[i for i in ids if not i in ism_selected.index]]

In [None]:
similar

In [None]:
export_to_latex(selected_providers.loc[similar[20]], relevance=True)

In [None]:
x = []
for v in similar.values():
    x.extend(v)

export_to_latex(selected_providers.loc[x], relevance=True)

### Collaborative Preferences

In [None]:
from decision.clause import *
from decision.criterion import Criterion
from decision.somselector import SomSelector
from sada.decisionsada import DecisionSADA
from datamanagement.dataaccessobject import Dataset

In [None]:
dataset = Dataset(selected_providers, preprocessed_data.loc[selected_providers.index])

In [None]:
sada = DecisionSADA(dataset)

In [None]:
def print_preferences(preferences):
    for pref in preferences:
        print(pref.to_string())

class DecisionEntry(object):
    def __init__(self, preferences, criteria, optimal_candidates, success):
        self.preferences = preferences
        self.criteria = criteria
        self.optimal_candidates = optimal_candidates
        self.success = success

In [None]:
ism_pareto

In [None]:
def build_historical_data(sada):
    b_1 = DecisionEntry(preferences = [DiadicClause('Volkswagen', Operation.EQUALS, 1), DiadicClause('mpg', Operation.GREATER_THAN_EQUALS, 0.098)],
                        criteria = [Criterion('price', maximize=False, weight=0.6), Criterion('mpg', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = True)
    
    b_2 = DecisionEntry(preferences = [DiadicClause('price', Operation.LESS_THAN_EQUALS, 0.231193), DiadicClause('Sporty', Operation.EQUALS, 1), DiadicClause('Origin', Operation.EQUALS, 0)],
                        criteria = [Criterion('price', maximize=False, weight=0.6), Criterion('horsepower', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = True)

    b_3 = DecisionEntry(preferences = [DiadicClause('driver_passenger', Operation.EQUALS, 1), DiadicClause('Front', Operation.EQUALS, 1)],
                        criteria = [Criterion('price', maximize=False, weight=0.6), Criterion('passenger_capacity', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = 0)

    b_4 = DecisionEntry(preferences = [DiadicClause('price', Operation.LESS_THAN_EQUALS, 0.01), DiadicClause('Small', Operation.EQUALS, 1)],
                        criteria = [Criterion('price', maximize=False, weight=0.6), Criterion('mpg', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = 0)

    b_5 = DecisionEntry(preferences = [DiadicClause('price', Operation.LESS_THAN_EQUALS, 0.03), DiadicClause('Small', Operation.EQUALS, 1)],
                        criteria = [Criterion('length', maximize=False, weight=0.6), Criterion('mpg', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = 1)

    b_6 = DecisionEntry(preferences = [DiadicClause('Rear', Operation.EQUALS, 1), DiadicClause('Sporty', Operation.EQUALS, 1), DiadicClause('passenger_capacity', Operation.EQUALS, 0)],
                        criteria = [Criterion('RPM', maximize=True, weight=0.6), Criterion('horsepower', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = 1)

    b_7 = DecisionEntry(preferences = [DiadicClause('Compact', Operation.EQUALS, 1), DiadicClause('passenger_capacity', Operation.GREATER_THAN_EQUALS, 0.5), DiadicClause('Rear', Operation.EQUALS, 1)],
                        criteria = [Criterion('mpg', maximize=True, weight=0.6), Criterion('price', maximize=False, weight=0.4)],
                        optimal_candidates = None, success = 0)

    b_8 = DecisionEntry(preferences = [DiadicClause('luggage_capacity', Operation.GREATER_THAN_EQUALS, 0.875)],
                        criteria = [Criterion('mpg', maximize=True, weight=0.6), Criterion('fuel_tank_capacity', maximize=True, weight=0.4)],
                        optimal_candidates = None, success = 1)
    
    previous_decisions = [b_1, b_2, b_3, b_4, b_5, b_6, b_7, b_8]
    
    for b in previous_decisions:
        selected, optimal = sada.get_recommendations(b.preferences, b.criteria)
        b.optimal_candidates = optimal
    
    return previous_decisions