# EXPERIMENTS WITH DECISION TREE

## PREPARING THE ENVIRONMENT

Importing the relevant libraries:

In [1]:
import re

import numpy as np
import pandas as pd

Declaring the auxiliary functions:

In [2]:
def extract_features(file):
    return [int(n)
            for n in re.findall(r'\d+(?=_)', file)]

Defining the constants:

In [3]:
COLS = [
    'instance' ,
    'obj_lnsa' ,
    'time_lnsa',
]

FEATURES = [
    'number_items' ,
    'items_sizes'  ,
    'graph_density',
]

NUMBER_ITEMS = {
    1 : 100,
    2 : 200,
}

SIZES = {
    1 :  1,
    2 : 20,
    3 : 50,
}

CONFLICTS = {
    0  :  0.0,
    1  :  0.1,
    2  :  0.2,
    3  :  0.3,
    4  :  0.4,
    5  :  0.5,
    6  :  0.6,
    7  :  0.7,
    8  :  0.8,
    9  :  0.9,
    10 : 0.95,
    11 : 0.99,
}

## PRE-PROCESSING

Loading the data:

In [4]:
data_lb = pd.read_csv('../out/lower_bounds.txt', delimiter=' ')

data_lb.head()

Unnamed: 0,instance,lower_bound
0,instances/train/Correia_Random_2_3_1_7.txt,15070
1,instances/train/Correia_Random_2_1_1_8.txt,10180
2,instances/train/Correia_Random_2_2_9_3.txt,12200
3,instances/train/Correia_Random_1_2_4_9.txt,6060
4,instances/train/Correia_Random_1_2_0_9.txt,6060


In [5]:
data_legado = pd.read_csv('../out/results_test_legado.txt', delimiter=' ', usecols=COLS)

data_legado.rename(columns={'obj_lnsa' : 'HC',
                            'time_lnsa': 'time-hc'}, inplace=True)

data_legado.head()

Unnamed: 0,instance,HC,time-hc
0,instances/test/Correia_Random_1_2_0_3.txt,6310,1
1,instances/test/Correia_Random_1_1_9_7.txt,5330,1
2,instances/test/Correia_Random_1_1_6_7.txt,4950,2
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,1


In [6]:
data_greedy1 = pd.read_csv('../out/results_test_greedy1.txt', delimiter=' ', usecols=COLS)

data_greedy1.rename(columns={'obj_lnsa' : 'HC1',
                             'time_lnsa': 'time-hc1'}, inplace=True)

data_greedy1.head()

Unnamed: 0,instance,HC1,time-hc1
0,instances/test/Correia_Random_1_2_0_3.txt,6310,0
1,instances/test/Correia_Random_1_1_9_7.txt,5290,1
2,instances/test/Correia_Random_1_1_6_7.txt,4940,1
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


In [7]:
data_greedy2 = pd.read_csv('../out/results_test_greedy2.txt', delimiter=' ', usecols=COLS)

data_greedy2.rename(columns={'obj_lnsa' : 'HC2',
                             'time_lnsa': 'time-hc2'}, inplace=True)

data_greedy2.head()

Unnamed: 0,instance,HC2,time-hc2
0,instances/test/Correia_Random_1_2_0_3.txt,6320,0
1,instances/test/Correia_Random_1_1_9_7.txt,5320,0
2,instances/test/Correia_Random_1_1_6_7.txt,4940,0
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


In [8]:
data_best = pd.DataFrame({
    'instance'  : data_greedy1.instance,
    'best'      : data_greedy2['HC2'].where(data_greedy2['HC2'] <= data_greedy1['HC1'],
                                            data_greedy1['HC1']),
    'time-best' : data_greedy2['time-hc2'].where(data_greedy2['time-hc2'] <= data_greedy1['time-hc1'],
                                                 data_greedy1['time-hc1'])
})

data_best.head()

Unnamed: 0,instance,best,time-best
0,instances/test/Correia_Random_1_2_0_3.txt,6310,0
1,instances/test/Correia_Random_1_1_9_7.txt,5290,0
2,instances/test/Correia_Random_1_1_6_7.txt,4940,0
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


In [9]:
data_discrete = pd.read_csv('../out/results_test_discrete.txt', delimiter=' ', usecols=COLS)

data_discrete.rename(columns={'obj_lnsa' : 'DISTREE',
                              'time_lnsa': 'time-discrete'}, inplace=True)

data_discrete.head()

Unnamed: 0,instance,DISTREE,time-discrete
0,instances/test/Correia_Random_1_2_0_3.txt,6310,0
1,instances/test/Correia_Random_1_1_9_7.txt,5310,1
2,instances/test/Correia_Random_1_1_6_7.txt,4940,1
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


Preprocessing the data:

In [10]:
data = (
    data_lb
    .merge(data_legado   , on='instance', how='inner')
    .merge(data_greedy1  , on='instance', how='inner')
    .merge(data_greedy2  , on='instance', how='inner')
    .merge(data_best     , on='instance', how='inner')
    .merge(data_discrete , on='instance', how='inner')
)

data[FEATURES] = data['instance'].apply(lambda x: pd.Series(extract_features(x)))
data['number_items' ] = data['number_items' ].map(NUMBER_ITEMS)
data['items_sizes'  ] = data['items_sizes'  ].map(SIZES)
data['graph_density'] = data['graph_density'].map(CONFLICTS)

data.drop(columns=['instance'], inplace=True)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,HC1,time-hc1,HC2,time-hc2,best,time-best,DISTREE,time-discrete,number_items,items_sizes,graph_density
0,6290,6310,1,6310,0,6320,0,6310,0,6310,0,100,20,0.0
1,4920,5330,1,5290,1,5320,0,5290,0,5310,1,100,1,0.9
2,4920,4950,2,4940,1,4940,0,4940,0,4940,1,100,1,0.6
3,7580,7900,0,7900,0,7900,0,7900,0,7900,0,100,50,0.7
4,4610,4610,1,4610,0,4610,0,4610,0,4610,0,100,1,0.2


Obtaining gaps relative to the lower bound

$$
\text{Percentage Gap} = \frac{\text{[HC, TREE*]} - \text{LB}}{\text{[HC, TREE*]}} \times 100\%
$$

and obtaining the percentage of improvement with respect to the original algorithm

$$
\text{Percentage Gap} = \frac{\text{HC} - \text{TREE*}}{\text{TREE*}} \times 100\%
$$

In [11]:
data['gap-hc1'           ] = ((data.HC1     - data.lower_bound) / data.HC1      * 100).round(3)
data['improving-hc1'     ] = ((data.HC      - data.HC1        ) / data.HC1      * 100).round(3)
data['gap-hc2'           ] = ((data.HC2     - data.lower_bound) / data.HC2      * 100).round(3)
data['improving-hc2'     ] = ((data.HC      - data.HC2        ) / data.HC2      * 100).round(3)
data['gap-best'          ] = ((data.best    - data.lower_bound) / data.best     * 100).round(3)
data['improving-best'    ] = ((data.HC      - data.best       ) / data.best     * 100).round(3)
data['gap-discrete'      ] = ((data.DISTREE - data.lower_bound) / data.DISTREE  * 100).round(3)
data['improving-discrete'] = ((data.HC      - data.DISTREE    ) / data.DISTREE  * 100).round(3)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,HC1,time-hc1,HC2,time-hc2,best,time-best,DISTREE,...,items_sizes,graph_density,gap-hc1,improving-hc1,gap-hc2,improving-hc2,gap-best,improving-best,gap-discrete,improving-discrete
0,6290,6310,1,6310,0,6320,0,6310,0,6310,...,20,0.0,0.317,0.0,0.475,-0.158,0.317,0.0,0.317,0.0
1,4920,5330,1,5290,1,5320,0,5290,0,5310,...,1,0.9,6.994,0.756,7.519,0.188,6.994,0.756,7.345,0.377
2,4920,4950,2,4940,1,4940,0,4940,0,4940,...,1,0.6,0.405,0.202,0.405,0.202,0.405,0.202,0.405,0.202
3,7580,7900,0,7900,0,7900,0,7900,0,7900,...,50,0.7,4.051,0.0,4.051,0.0,4.051,0.0,4.051,0.0
4,4610,4610,1,4610,0,4610,0,4610,0,4610,...,1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## EXPERIMENTS

Experiment on

- Number of items
- Items sizes
- Conflict graph density

In [12]:
experiments_cols = [
    'gap-hc1'           ,
    'improving-hc1'     ,
    'gap-hc2'           ,
    'improving-hc2'     ,
    'gap-best'          ,
    'improving-best'    ,
    'gap-discrete'      ,
    'improving-discrete',
]

experiment_ni = (
    data
    .filter(items=['number_items'] + experiments_cols)
    .groupby('number_items')
    .mean()
    .reset_index()
    .round(3)
)

avg = data[experiments_cols].mean().round(3)
avg['number_items'] = 'avg'

experiment_ni = pd.concat([experiment_ni,
                           avg.to_frame().T],
                           ignore_index=True)

experiment_ni.columns = pd.MultiIndex.from_tuples([
    (''        , 'number_items'      ),
    ('HC1'     , 'gap-hc1'           ),
    (''        , 'improving-hc1'     ),
    ('HC2'     , 'gap-hc2'           ),
    (''        , 'improving-hc2'     ),
    ('BEST'    , 'gap-best'          ),
    (''        , 'improving-best'    ),
    ('DISCRETE', 'gap-discrete'      ),
    (''        , 'improving-discrete'),
])

experiment_ni

Unnamed: 0_level_0,Unnamed: 1_level_0,HC1,Unnamed: 3_level_0,HC2,Unnamed: 5_level_0,BEST,Unnamed: 7_level_0,DISCRETE,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc1,improving-hc1,gap-hc2,improving-hc2,gap-best,improving-best,gap-discrete,improving-discrete
0,100,4.325,0.031,4.336,0.018,4.297,0.061,4.288,0.071
1,200,2.995,0.115,3.02,0.087,2.981,0.129,3.019,0.089
2,avg,3.66,0.073,3.678,0.053,3.639,0.095,3.653,0.08


In [13]:
experiment_is = (
    data
    .filter(items=['items_sizes'] + experiments_cols)
    .groupby('items_sizes')
    .mean()
    .reset_index()
    .round(3)
)

avg = data[experiments_cols].mean().round(3)
avg['items_sizes'] = 'avg'

experiment_is = pd.concat([experiment_is,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_is.columns = pd.MultiIndex.from_tuples([
    (''        , 'number_items'      ),
    ('HC1'     , 'gap-hc1'           ),
    (''        , 'improving-hc1'     ),
    ('HC2'     , 'gap-hc2'           ),
    (''        , 'improving-hc2'     ),
    ('BEST'    , 'gap-best'          ),
    (''        , 'improving-best'    ),
    ('DISCRETE', 'gap-discrete'      ),
    (''        , 'improving-discrete'),
])

experiment_is

Unnamed: 0_level_0,Unnamed: 1_level_0,HC1,Unnamed: 3_level_0,HC2,Unnamed: 5_level_0,BEST,Unnamed: 7_level_0,DISCRETE,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc1,improving-hc1,gap-hc2,improving-hc2,gap-best,improving-best,gap-discrete,improving-discrete
0,1,4.445,0.166,4.477,0.13,4.419,0.194,4.42,0.194
1,20,3.079,0.066,3.102,0.04,3.052,0.094,3.087,0.057
2,50,3.456,-0.013,3.455,-0.012,3.445,-0.002,3.453,-0.01
3,avg,3.66,0.073,3.678,0.053,3.639,0.095,3.653,0.08


In [14]:
experiment_cg = (
    data
    .filter(items=['graph_density'] + experiments_cols)
    .groupby('graph_density')
    .mean()
    .reset_index()
    .round(2)
)

avg = data[experiments_cols].mean().round(2)
avg['graph_density'] = 'avg'

experiment_cg = pd.concat([experiment_cg,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_cg.columns = pd.MultiIndex.from_tuples([
    (''        , 'number_items'      ),
    ('HC1'     , 'gap-hc1'           ),
    (''        , 'improving-hc1'     ),
    ('HC2'     , 'gap-hc2'           ),
    (''        , 'improving-hc2'     ),
    ('BEST'    , 'gap-best'          ),
    (''        , 'improving-best'    ),
    ('DISCRETE', 'gap-discrete'      ),
    (''        , 'improving-discrete'),
])

experiment_cg

Unnamed: 0_level_0,Unnamed: 1_level_0,HC1,Unnamed: 3_level_0,HC2,Unnamed: 5_level_0,BEST,Unnamed: 7_level_0,DISCRETE,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc1,improving-hc1,gap-hc2,improving-hc2,gap-best,improving-best,gap-discrete,improving-discrete
0,0.0,0.53,0.01,0.54,-0.01,0.53,0.01,0.53,0.01
1,0.1,0.53,0.02,0.51,0.03,0.51,0.03,0.51,0.03
2,0.2,0.48,0.01,0.47,0.03,0.46,0.04,0.47,0.03
3,0.3,0.54,0.01,0.53,0.02,0.53,0.02,0.53,0.02
4,0.4,0.62,0.04,0.63,0.04,0.62,0.05,0.62,0.05
5,0.5,0.81,0.05,0.8,0.06,0.8,0.06,0.8,0.06
6,0.6,0.93,0.09,0.92,0.09,0.9,0.11,0.92,0.09
7,0.7,1.45,0.12,1.45,0.13,1.42,0.16,1.42,0.16
8,0.8,1.84,0.15,1.86,0.13,1.82,0.17,1.82,0.17
9,0.9,3.97,0.16,3.97,0.16,3.86,0.27,3.94,0.19


Getting latex from tables:

In [15]:
print(experiment_ni.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC1 &  & HC2 &  & BEST &  & DISCRETE &  \\
number_items & gap-hc1 & improving-hc1 & gap-hc2 & improving-hc2 & gap-best & improving-best & gap-discrete & improving-discrete \\
\midrule
100 & 4.325000 & 0.031000 & 4.336000 & 0.018000 & 4.297000 & 0.061000 & 4.288000 & 0.071000 \\
200 & 2.995000 & 0.115000 & 3.020000 & 0.087000 & 2.981000 & 0.129000 & 3.019000 & 0.089000 \\
avg & 3.660000 & 0.073000 & 3.678000 & 0.053000 & 3.639000 & 0.095000 & 3.653000 & 0.080000 \\
\bottomrule
\end{tabular}



In [16]:
print(experiment_is.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC1 &  & HC2 &  & BEST &  & DISCRETE &  \\
number_items & gap-hc1 & improving-hc1 & gap-hc2 & improving-hc2 & gap-best & improving-best & gap-discrete & improving-discrete \\
\midrule
1 & 4.445000 & 0.166000 & 4.477000 & 0.130000 & 4.419000 & 0.194000 & 4.420000 & 0.194000 \\
20 & 3.079000 & 0.066000 & 3.102000 & 0.040000 & 3.052000 & 0.094000 & 3.087000 & 0.057000 \\
50 & 3.456000 & -0.013000 & 3.455000 & -0.012000 & 3.445000 & -0.002000 & 3.453000 & -0.010000 \\
avg & 3.660000 & 0.073000 & 3.678000 & 0.053000 & 3.639000 & 0.095000 & 3.653000 & 0.080000 \\
\bottomrule
\end{tabular}



In [17]:
print(experiment_cg.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC1 &  & HC2 &  & BEST &  & DISCRETE &  \\
number_items & gap-hc1 & improving-hc1 & gap-hc2 & improving-hc2 & gap-best & improving-best & gap-discrete & improving-discrete \\
\midrule
0.000000 & 0.530000 & 0.010000 & 0.540000 & -0.010000 & 0.530000 & 0.010000 & 0.530000 & 0.010000 \\
0.100000 & 0.530000 & 0.020000 & 0.510000 & 0.030000 & 0.510000 & 0.030000 & 0.510000 & 0.030000 \\
0.200000 & 0.480000 & 0.010000 & 0.470000 & 0.030000 & 0.460000 & 0.040000 & 0.470000 & 0.030000 \\
0.300000 & 0.540000 & 0.010000 & 0.530000 & 0.020000 & 0.530000 & 0.020000 & 0.530000 & 0.020000 \\
0.400000 & 0.620000 & 0.040000 & 0.630000 & 0.040000 & 0.620000 & 0.050000 & 0.620000 & 0.050000 \\
0.500000 & 0.810000 & 0.050000 & 0.800000 & 0.060000 & 0.800000 & 0.060000 & 0.800000 & 0.060000 \\
0.600000 & 0.930000 & 0.090000 & 0.920000 & 0.090000 & 0.900000 & 0.110000 & 0.920000 & 0.090000 \\
0.700000 & 1.450000 & 0.120000 & 1.450000 & 0.130000 & 1.420000 & 0.160000 &