# EXPERIMENTS WITH DECISION TREE

## PREPARING THE ENVIRONMENT

Importing the relevant libraries:

In [1]:
import re

import numpy as np
import pandas as pd

Declaring the auxiliary functions:

In [2]:
def extract_features(file):
    return [int(n)
            for n in re.findall(r'\d+(?=_)', file)]

Defining the constants:

In [3]:
COLS = [
    'instance' ,
    'obj_lnsa' ,
    'time_lnsa',
]

FEATURES = [
    'number_items' ,
    'items_sizes'  ,
    'graph_density',
]

NUMBER_ITEMS = {
    1 : 100,
    2 : 200,
}

SIZES = {
    1 :  1,
    2 : 20,
    3 : 50,
}

CONFLICTS = {
    0  :  0.0,
    1  :  0.1,
    2  :  0.2,
    3  :  0.3,
    4  :  0.4,
    5  :  0.5,
    6  :  0.6,
    7  :  0.7,
    8  :  0.8,
    9  :  0.9,
    10 : 0.95,
    11 : 0.99,
}

## PRE-PROCESSING

Loading the data:

In [4]:
data_lb = pd.read_csv('../out/lower_bounds.txt', delimiter=' ')

data_lb.head()

Unnamed: 0,instance,lower_bound
0,instances/train/Correia_Random_2_3_1_7.txt,15070
1,instances/train/Correia_Random_2_1_1_8.txt,10180
2,instances/train/Correia_Random_2_2_9_3.txt,12200
3,instances/train/Correia_Random_1_2_4_9.txt,6060
4,instances/train/Correia_Random_1_2_0_9.txt,6060


In [5]:
data_legado = pd.read_csv('../out/results_test_legado.txt', delimiter=' ', usecols=COLS)

data_legado.rename(columns={'obj_lnsa' : 'HC',
                            'time_lnsa': 'time-hc'}, inplace=True)

data_legado.head()

Unnamed: 0,instance,HC,time-hc
0,instances/test/Correia_Random_1_2_0_3.txt,6310,1
1,instances/test/Correia_Random_1_1_9_7.txt,5330,1
2,instances/test/Correia_Random_1_1_6_7.txt,4950,2
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,1


In [6]:
data_unregtree = pd.read_csv('../out/results_unregtree_test.txt', delimiter=' ')

data_unregtree.rename(columns={'smart'     : 'UNREGTREE',
                               'time_smart': 'time-unregtree'}, inplace=True)

data_unregtree.head()

Unnamed: 0,instance,time-unregtree,UNREGTREE
0,instances/test/Correia_Random_1_2_0_3.txt,0,6390
1,instances/test/Correia_Random_1_1_9_7.txt,1,5780
2,instances/test/Correia_Random_1_1_6_7.txt,1,5310
3,instances/test/Correia_Random_1_3_7_4.txt,0,7900
4,instances/test/Correia_Random_1_1_2_4.txt,0,4700


In [7]:
data_regtree = pd.read_csv('../out/results_test_iterative.txt', delimiter=' ', usecols=COLS)

data_regtree.rename(columns={'obj_lnsa' : 'REGTREE',
                             'time_lnsa': 'time-regtree'}, inplace=True)

data_regtree.head()

Unnamed: 0,instance,REGTREE,time-regtree
0,instances/test/Correia_Random_1_2_0_3.txt,6310,0
1,instances/test/Correia_Random_1_1_9_7.txt,5290,0
2,instances/test/Correia_Random_1_1_6_7.txt,4940,0
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


In [8]:
data_discrete = pd.read_csv('../out/results_test_discrete.txt', delimiter=' ', usecols=COLS)

data_discrete.rename(columns={'obj_lnsa' : 'DISTREE',
                              'time_lnsa': 'time-discrete'}, inplace=True)

data_discrete.head()

Unnamed: 0,instance,DISTREE,time-discrete
0,instances/test/Correia_Random_1_2_0_3.txt,6310,0
1,instances/test/Correia_Random_1_1_9_7.txt,5310,1
2,instances/test/Correia_Random_1_1_6_7.txt,4940,1
3,instances/test/Correia_Random_1_3_7_4.txt,7900,0
4,instances/test/Correia_Random_1_1_2_4.txt,4610,0


Preprocessing the data:

In [9]:
data = (
    data_lb
    .merge(data_legado   , on='instance', how='inner')
    .merge(data_unregtree, on='instance', how='inner')
    .merge(data_regtree  , on='instance', how='inner')
    .merge(data_discrete , on='instance', how='inner')
)

data[FEATURES] = data['instance'].apply(lambda x: pd.Series(extract_features(x)))
data['number_items' ] = data['number_items' ].map(NUMBER_ITEMS)
data['items_sizes'  ] = data['items_sizes'  ].map(SIZES)
data['graph_density'] = data['graph_density'].map(CONFLICTS)

data.drop(columns=['instance'], inplace=True)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,time-unregtree,UNREGTREE,REGTREE,time-regtree,DISTREE,time-discrete,number_items,items_sizes,graph_density
0,6290,6310,1,0,6390,6310,0,6310,0,100,20,0.0
1,4920,5330,1,1,5780,5290,0,5310,1,100,1,0.9
2,4920,4950,2,1,5310,4940,0,4940,1,100,1,0.6
3,7580,7900,0,0,7900,7900,0,7900,0,100,50,0.7
4,4610,4610,1,0,4700,4610,0,4610,0,100,1,0.2


Obtaining gaps relative to the lower bound

$$
\text{Percentage Gap} = \frac{\text{[HC, TREE*]} - \text{LB}}{\text{[HC, TREE*]}} \times 100\%
$$

and obtaining the percentage of improvement with respect to the original algorithm

$$
\text{Percentage Gap} = \frac{\text{HC} - \text{TREE*}}{\text{TREE*}} \times 100\%
$$

In [10]:
data['gap-hc'             ] = ((data.HC        - data.lower_bound  ) / data.HC        * 100).round(2)
data['gap-unregtree'      ] = ((data.UNREGTREE - data.lower_bound  ) / data.UNREGTREE * 100).round(2)
data['improving-unregtree'] = ((data.HC        - data.UNREGTREE    ) / data.UNREGTREE * 100).round(2)
data['gap-regtree'        ] = ((data.REGTREE   - data.lower_bound  ) / data.REGTREE   * 100).round(2)
data['improving-regtree'  ] = ((data.HC        - data.REGTREE      ) / data.REGTREE   * 100).round(2)
data['gap-discrete'       ] = ((data.DISTREE   - data.lower_bound  ) / data.DISTREE   * 100).round(2)
data['improving-discrete' ] = ((data.HC        - data.DISTREE      ) / data.DISTREE   * 100).round(2)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,time-unregtree,UNREGTREE,REGTREE,time-regtree,DISTREE,time-discrete,number_items,items_sizes,graph_density,gap-hc,gap-unregtree,improving-unregtree,gap-regtree,improving-regtree,gap-discrete,improving-discrete
0,6290,6310,1,0,6390,6310,0,6310,0,100,20,0.0,0.32,1.56,-1.25,0.32,0.0,0.32,0.0
1,4920,5330,1,1,5780,5290,0,5310,1,100,1,0.9,7.69,14.88,-7.79,6.99,0.76,7.34,0.38
2,4920,4950,2,1,5310,4940,0,4940,1,100,1,0.6,0.61,7.34,-6.78,0.4,0.2,0.4,0.2
3,7580,7900,0,0,7900,7900,0,7900,0,100,50,0.7,4.05,4.05,0.0,4.05,0.0,4.05,0.0
4,4610,4610,1,0,4700,4610,0,4610,0,100,1,0.2,0.0,1.91,-1.91,0.0,0.0,0.0,0.0


## EXPERIMENTS

Experiment on

- Number of items
- Items sizes
- Conflict graph density

In [11]:
experiments_cols = [
    'gap-hc'             ,
    'time-hc'            ,
    'gap-unregtree'      ,
    'time-unregtree'     ,
    'improving-unregtree',
    'gap-regtree'        ,
    'time-regtree'       ,
    'improving-regtree'  ,
    'gap-discrete'       ,
    'time-discrete'      ,
    'improving-discrete' ,
]


experiment_ni = (
    data
    .filter(items=['number_items'] + experiments_cols)
    .groupby('number_items')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_ni[experiments_cols].mean().round(3)
avg['number_items'] = 'avg'

experiment_ni = pd.concat([experiment_ni,
                           avg.to_frame().T],
                           ignore_index=True)

experiment_ni.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
    ('DISTREE'  , 'gap-discrete'       ),
    (''         , 'time-discrete'      ),
    (''         , 'improving-discrete' ),
])

experiment_ni

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0,DISTREE,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree,gap-discrete,time-discrete,improving-discrete
0,100,4.36,0.92,7.1,0.33,-2.88,4.31,0.14,0.05,4.29,0.26,0.07
1,200,3.1,6.61,4.98,3.33,-1.99,2.98,2.39,0.13,3.02,2.54,0.09
2,avg,3.73,3.765,6.04,1.83,-2.435,3.645,1.265,0.09,3.655,1.4,0.08


In [12]:
experiment_is = (
    data
    .filter(items=['items_sizes'] + experiments_cols)
    .groupby('items_sizes')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_is[experiments_cols].mean().round(3)
avg['items_sizes'] = 'avg'

experiment_is = pd.concat([experiment_is,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_is.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
    ('DISTREE'  , 'gap-discrete'       ),
    (''         , 'time-discrete'      ),
    (''         , 'improving-discrete' ),
])

experiment_is

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0,DISTREE,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree,gap-discrete,time-discrete,improving-discrete
0,1,4.6,4.71,8.18,2.29,-3.79,4.4,1.46,0.21,4.42,1.52,0.19
1,20,3.14,4.6,5.65,2.4,-2.62,3.07,1.58,0.08,3.09,1.81,0.06
2,50,3.44,1.98,4.29,0.81,-0.89,3.46,0.75,-0.02,3.45,0.88,-0.01
3,avg,3.727,3.763,6.04,1.833,-2.433,3.643,1.263,0.09,3.653,1.403,0.08


In [13]:
experiment_cg = (
    data
    .filter(items=['graph_density'] + experiments_cols)
    .groupby('graph_density')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_cg[experiments_cols].mean().round(2)
avg['graph_density'] = 'avg'

experiment_cg = pd.concat([experiment_cg,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_cg.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
    ('DISTREE'  , 'gap-discrete'       ),
    (''         , 'time-discrete'      ),
    (''         , 'improving-discrete' ),
])

experiment_cg

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0,DISTREE,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree,gap-discrete,time-discrete,improving-discrete
0,0.0,0.54,2.83,1.54,0.83,-1.01,0.53,0.75,0.01,0.53,0.67,0.01
1,0.1,0.55,3.33,1.9,1.0,-1.35,0.53,0.83,0.02,0.52,0.92,0.03
2,0.2,0.5,3.67,1.71,1.0,-1.22,0.48,0.75,0.02,0.47,1.0,0.03
3,0.3,0.55,3.67,2.16,1.58,-1.62,0.51,1.0,0.04,0.53,1.17,0.02
4,0.4,0.67,4.17,2.61,1.5,-1.95,0.63,1.33,0.04,0.62,1.17,0.05
5,0.5,0.86,3.75,2.66,1.67,-1.81,0.81,1.08,0.05,0.8,1.17,0.06
6,0.6,1.01,3.83,3.19,2.08,-2.19,0.92,1.0,0.09,0.92,1.33,0.09
7,0.7,1.57,4.33,3.92,2.33,-2.37,1.41,1.5,0.16,1.42,1.5,0.16
8,0.8,1.98,3.83,5.41,2.33,-3.5,1.83,1.92,0.16,1.82,1.83,0.17
9,0.9,4.12,4.08,8.5,1.92,-4.61,3.88,1.33,0.25,3.94,2.0,0.19


Getting latex from tables:

In [14]:
print(experiment_ni.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} & DISTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree & gap-discrete & time-discrete & improving-discrete \\
\midrule
100 & 4.360000 & 0.920000 & 7.100000 & 0.330000 & -2.880000 & 4.310000 & 0.140000 & 0.050000 & 4.290000 & 0.260000 & 0.070000 \\
200 & 3.100000 & 6.610000 & 4.980000 & 3.330000 & -1.990000 & 2.980000 & 2.390000 & 0.130000 & 3.020000 & 2.540000 & 0.090000 \\
avg & 3.730000 & 3.765000 & 6.040000 & 1.830000 & -2.435000 & 3.645000 & 1.265000 & 0.090000 & 3.655000 & 1.400000 & 0.080000 \\
\bottomrule
\end{tabular}



In [15]:
print(experiment_is.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} & DISTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree & gap-discrete & time-discrete & improving-discrete \\
\midrule
1 & 4.600000 & 4.710000 & 8.180000 & 2.290000 & -3.790000 & 4.400000 & 1.460000 & 0.210000 & 4.420000 & 1.520000 & 0.190000 \\
20 & 3.140000 & 4.600000 & 5.650000 & 2.400000 & -2.620000 & 3.070000 & 1.580000 & 0.080000 & 3.090000 & 1.810000 & 0.060000 \\
50 & 3.440000 & 1.980000 & 4.290000 & 0.810000 & -0.890000 & 3.460000 & 0.750000 & -0.020000 & 3.450000 & 0.880000 & -0.010000 \\
avg & 3.727000 & 3.763000 & 6.040000 & 1.833000 & -2.433000 & 3.643000 & 1.263000 & 0.090000 & 3.653000 & 1.403000 & 0.080000 \\
\bottomrule
\end{tabular}



In [16]:
print(experiment_cg.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} & DISTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree & gap-discrete & time-discrete & improving-discrete \\
\midrule
0.000000 & 0.540000 & 2.830000 & 1.540000 & 0.830000 & -1.010000 & 0.530000 & 0.750000 & 0.010000 & 0.530000 & 0.670000 & 0.010000 \\
0.100000 & 0.550000 & 3.330000 & 1.900000 & 1.000000 & -1.350000 & 0.530000 & 0.830000 & 0.020000 & 0.520000 & 0.920000 & 0.030000 \\
0.200000 & 0.500000 & 3.670000 & 1.710000 & 1.000000 & -1.220000 & 0.480000 & 0.750000 & 0.020000 & 0.470000 & 1.000000 & 0.030000 \\
0.300000 & 0.550000 & 3.670000 & 2.160000 & 1.580000 & -1.620000 & 0.510000 & 1.000000 & 0.040000 & 0.530000 & 1.170000 & 0.020000 \\
0.400000 & 0.670000 & 4.170000 & 2.610000 & 1.500000 & -1.950000 & 0.630000 & 1.330000 & 0.040000 & 0.620000 & 1