# EXPERIMENTS WITH DECISION TREE

## PREPARING THE ENVIRONMENT

Importing the relevant libraries:

In [1]:
import re

import numpy as np
import pandas as pd

Declaring the auxiliary functions:

In [2]:
def extract_features(file):
    return [int(n)
            for n in re.findall(r'\d+(?=_)', file)]

Defining the constants:

In [3]:
COLS = [
    'instance' ,
    'obj_lnsa' ,
    'time_lnsa',
]

FEATURES = [
    'number_items' ,
    'items_sizes'  ,
    'graph_density',
]

NUMBER_ITEMS = {
    1 : 100,
    2 : 200,
}

SIZES = {
    1 :  1,
    2 : 20,
    3 : 50,
}

CONFLICTS = {
    0  :  0.0,
    1  :  0.1,
    2  :  0.2,
    3  :  0.3,
    4  :  0.4,
    5  :  0.5,
    6  :  0.6,
    7  :  0.7,
    8  :  0.8,
    9  :  0.9,
    10 : 0.95,
    11 : 0.99,
}

## PRE-PROCESSING

Loading the data:

In [4]:
data_lb = pd.read_csv('../out/lower_bounds.txt', delimiter=' ')

data_lb.head()

Unnamed: 0,instance,lower_bound
0,instances/train/Correia_Random_2_3_1_7.txt,15070
1,instances/train/Correia_Random_2_1_1_8.txt,10180
2,instances/train/Correia_Random_2_2_9_3.txt,12200
3,instances/train/Correia_Random_1_2_4_9.txt,6060
4,instances/train/Correia_Random_1_2_0_9.txt,6060


In [5]:
data_legado_train = pd.read_csv('../out/results_train_legado.txt', delimiter=' ', usecols=COLS)
data_legado_test  = pd.read_csv('../out/results_test_legado.txt' , delimiter=' ', usecols=COLS)

data_legado = pd.concat([data_legado_train,
                         data_legado_test], ignore_index=True)
data_legado.rename(columns={'obj_lnsa' : 'HC',
                            'time_lnsa': 'time-hc'}, inplace=True)

data_legado.head()

Unnamed: 0,instance,HC,time-hc
0,instances/train/Correia_Random_2_3_1_7.txt,15350,3
1,instances/train/Correia_Random_2_1_1_8.txt,10180,7
2,instances/train/Correia_Random_2_2_9_3.txt,12540,13
3,instances/train/Correia_Random_1_2_4_9.txt,6080,2
4,instances/train/Correia_Random_1_2_0_9.txt,6070,1


In [6]:
data_unregtree_train = pd.read_csv('../out/results_unregtree_train.txt', delimiter=' ')
data_unregtree_test  = pd.read_csv('../out/results_unregtree_test.txt' , delimiter=' ')

data_unregtree = pd.concat([data_unregtree_train,
                            data_unregtree_test], ignore_index=True)
data_unregtree.rename(columns={'smart'     : 'UNREGTREE',
                               'time_smart': 'time-unregtree'}, inplace=True)

data_unregtree.head()

Unnamed: 0,instance,time-unregtree,UNREGTREE
0,instances/train/Correia_Random_2_3_1_7.txt,2,15350
1,instances/train/Correia_Random_2_1_1_8.txt,1,10370
2,instances/train/Correia_Random_2_2_9_3.txt,2,12900
3,instances/train/Correia_Random_1_2_4_9.txt,0,6220
4,instances/train/Correia_Random_1_2_0_9.txt,0,6240


In [7]:
data_regtree_train = pd.read_csv('../out/results_train_final.txt', delimiter=' ', usecols=COLS)
data_regtree_test  = pd.read_csv('../out/results_test_final.txt' , delimiter=' ', usecols=COLS)

data_regtree = pd.concat([data_regtree_train,
                          data_regtree_test], ignore_index=True)
data_regtree.rename(columns={'obj_lnsa' : 'REGTREE',
                             'time_lnsa': 'time-regtree'}, inplace=True)

data_regtree.head()

Unnamed: 0,instance,REGTREE,time-regtree
0,instances/train/Correia_Random_2_3_1_7.txt,15350,2
1,instances/train/Correia_Random_2_1_1_8.txt,10180,2
2,instances/train/Correia_Random_2_2_9_3.txt,12510,2
3,instances/train/Correia_Random_1_2_4_9.txt,6080,0
4,instances/train/Correia_Random_1_2_0_9.txt,6070,0


Preprocessing the data:

In [8]:
data = (
    data_lb
    .merge(data_legado   , on='instance', how='inner')
    .merge(data_unregtree, on='instance', how='inner')
    .merge(data_regtree  , on='instance', how='inner')
)

data[FEATURES] = data['instance'].apply(lambda x: pd.Series(extract_features(x)))
data['number_items' ] = data['number_items' ].map(NUMBER_ITEMS)
data['items_sizes'  ] = data['items_sizes'  ].map(SIZES)
data['graph_density'] = data['graph_density'].map(CONFLICTS)

data.drop(columns=['instance'], inplace=True)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,time-unregtree,UNREGTREE,REGTREE,time-regtree,number_items,items_sizes,graph_density
0,15070,15350,3,2,15350,15350,2,200,50,0.1
1,10180,10180,7,1,10370,10180,2,200,1,0.1
2,12200,12540,13,2,12900,12510,2,200,20,0.9
3,6060,6080,2,0,6220,6080,0,100,20,0.4
4,6060,6070,1,0,6240,6070,0,100,20,0.0


Obtaining gaps relative to the lower bound

$$
\text{Percentage Gap} = \frac{\text{[HC, REGTREE]} - \text{LB}}{\text{[HC, REGTREE]}} \times 100\%
$$

and obtaining the percentage of improvement with respect to the original algorithm

$$
\text{Percentage Gap} = \frac{\text{HC} - \text{REGTREE}}{\text{REGTREE}} \times 100\%
$$

In [9]:
data['gap-hc'             ] = ((data.HC        - data.lower_bound  ) / data.HC        * 100).round(2)
data['gap-unregtree'      ] = ((data.UNREGTREE - data.lower_bound  ) / data.UNREGTREE * 100).round(2)
data['improving-unregtree'] = ((data.HC        - data.UNREGTREE    ) / data.UNREGTREE * 100).round(2)
data['gap-regtree'        ] = ((data.REGTREE   - data.lower_bound  ) / data.REGTREE   * 100).round(2)
data['improving-regtree'  ] = ((data.HC        - data.REGTREE      ) / data.REGTREE   * 100).round(2)

data.head()

Unnamed: 0,lower_bound,HC,time-hc,time-unregtree,UNREGTREE,REGTREE,time-regtree,number_items,items_sizes,graph_density,gap-hc,gap-unregtree,improving-unregtree,gap-regtree,improving-regtree
0,15070,15350,3,2,15350,15350,2,200,50,0.1,1.82,1.82,0.0,1.82,0.0
1,10180,10180,7,1,10370,10180,2,200,1,0.1,0.0,1.83,-1.83,0.0,0.0
2,12200,12540,13,2,12900,12510,2,200,20,0.9,2.71,5.43,-2.79,2.48,0.24
3,6060,6080,2,0,6220,6080,0,100,20,0.4,0.33,2.57,-2.25,0.33,0.0
4,6060,6070,1,0,6240,6070,0,100,20,0.0,0.16,2.88,-2.72,0.16,0.0


## EXPERIMENTS

Experiment on

- Number of items
- Items sizes
- Conflict graph density

In [10]:
experiments_cols = [
    'gap-hc'             ,
    'time-hc'            ,
    'gap-unregtree'      ,
    'time-unregtree'     ,
    'improving-unregtree',
    'gap-regtree'        ,
    'time-regtree'       ,
    'improving-regtree'  ,
]


experiment_ni = (
    data
    .filter(items=['number_items'] + experiments_cols)
    .groupby('number_items')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_ni[experiments_cols].mean().round(2)
avg['number_items'] = 'avg'

experiment_ni = pd.concat([experiment_ni,
                           avg.to_frame().T],
                           ignore_index=True)

experiment_ni.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
])

experiment_ni

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree
0,100,4.4,0.9,6.94,0.22,-2.68,4.36,0.14,0.04
1,200,2.99,6.56,4.91,2.94,-2.04,2.89,2.48,0.1
2,avg,3.7,3.73,5.93,1.58,-2.36,3.62,1.31,0.07


In [11]:
experiment_is = (
    data
    .filter(items=['items_sizes'] + experiments_cols)
    .groupby('items_sizes')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_is[experiments_cols].mean().round(2)
avg['items_sizes'] = 'avg'

experiment_is = pd.concat([experiment_is,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_is.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
])

experiment_is

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree
0,1,4.41,4.72,7.89,1.96,-3.69,4.27,1.52,0.15
1,20,3.2,4.56,5.64,2.01,-2.57,3.12,1.61,0.08
2,50,3.47,1.91,4.24,0.78,-0.81,3.48,0.8,-0.01
3,avg,3.69,3.73,5.92,1.58,-2.36,3.62,1.31,0.07


In [12]:
experiment_cg = (
    data
    .filter(items=['graph_density'] + experiments_cols)
    .groupby('graph_density')
    .mean()
    .reset_index()
    .round(2)
)

avg = experiment_cg[experiments_cols].mean().round(2)
avg['graph_density'] = 'avg'

experiment_cg = pd.concat([experiment_cg,
                           avg.to_frame().T],
                          ignore_index=True)

experiment_cg.columns = pd.MultiIndex.from_tuples([
    (''         , 'number_items'       ),
    ('HC'       , 'gap-hc'             ),
    (''         , 'time-hc'            ),
    ('UNREGTREE', 'gap-unregtree'      ),
    (''         , 'time-unregtree'     ),
    (''         , 'improving-unregtree'),
    ('REGTREE'  , 'gap-regtree'        ),
    (''         , 'time-regtree'       ),
    (''         , 'improving-regtree'  ),
])

experiment_cg

Unnamed: 0_level_0,Unnamed: 1_level_0,HC,Unnamed: 3_level_0,UNREGTREE,Unnamed: 5_level_0,Unnamed: 6_level_0,REGTREE,Unnamed: 8_level_0,Unnamed: 9_level_0
Unnamed: 0_level_1,number_items,gap-hc,time-hc,gap-unregtree,time-unregtree,improving-unregtree,gap-regtree,time-regtree,improving-regtree
0,0.0,0.58,3.02,1.51,0.83,-0.92,0.57,0.73,0.01
1,0.1,0.57,3.32,1.68,0.93,-1.11,0.55,0.87,0.02
2,0.2,0.6,3.6,1.8,1.07,-1.2,0.57,0.92,0.03
3,0.3,0.61,3.48,2.11,1.25,-1.51,0.59,1.05,0.02
4,0.4,0.68,3.87,2.44,1.32,-1.77,0.64,1.28,0.04
5,0.5,0.76,4.1,2.64,1.52,-1.89,0.72,1.23,0.05
6,0.6,0.98,3.98,3.07,1.58,-2.11,0.92,1.17,0.06
7,0.7,1.34,3.9,3.83,1.88,-2.52,1.21,1.43,0.14
8,0.8,2.1,3.88,5.03,1.93,-2.99,1.96,1.6,0.15
9,0.9,4.29,3.82,8.27,1.98,-4.18,4.14,1.63,0.16


Getting latex from tables:

In [13]:
print(experiment_ni.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree \\
\midrule
100 & 4.400000 & 0.900000 & 6.940000 & 0.220000 & -2.680000 & 4.360000 & 0.140000 & 0.040000 \\
200 & 2.990000 & 6.560000 & 4.910000 & 2.940000 & -2.040000 & 2.890000 & 2.480000 & 0.100000 \\
avg & 3.700000 & 3.730000 & 5.930000 & 1.580000 & -2.360000 & 3.620000 & 1.310000 & 0.070000 \\
\bottomrule
\end{tabular}



In [14]:
print(experiment_is.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree \\
\midrule
1 & 4.410000 & 4.720000 & 7.890000 & 1.960000 & -3.690000 & 4.270000 & 1.520000 & 0.150000 \\
20 & 3.200000 & 4.560000 & 5.640000 & 2.010000 & -2.570000 & 3.120000 & 1.610000 & 0.080000 \\
50 & 3.470000 & 1.910000 & 4.240000 & 0.780000 & -0.810000 & 3.480000 & 0.800000 & -0.010000 \\
avg & 3.690000 & 3.730000 & 5.920000 & 1.580000 & -2.360000 & 3.620000 & 1.310000 & 0.070000 \\
\bottomrule
\end{tabular}



In [15]:
print(experiment_cg.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
 & HC &  & UNREGTREE & \multicolumn{2}{r}{} & REGTREE & \multicolumn{2}{r}{} \\
number_items & gap-hc & time-hc & gap-unregtree & time-unregtree & improving-unregtree & gap-regtree & time-regtree & improving-regtree \\
\midrule
0.000000 & 0.580000 & 3.020000 & 1.510000 & 0.830000 & -0.920000 & 0.570000 & 0.730000 & 0.010000 \\
0.100000 & 0.570000 & 3.320000 & 1.680000 & 0.930000 & -1.110000 & 0.550000 & 0.870000 & 0.020000 \\
0.200000 & 0.600000 & 3.600000 & 1.800000 & 1.070000 & -1.200000 & 0.570000 & 0.920000 & 0.030000 \\
0.300000 & 0.610000 & 3.480000 & 2.110000 & 1.250000 & -1.510000 & 0.590000 & 1.050000 & 0.020000 \\
0.400000 & 0.680000 & 3.870000 & 2.440000 & 1.320000 & -1.770000 & 0.640000 & 1.280000 & 0.040000 \\
0.500000 & 0.760000 & 4.100000 & 2.640000 & 1.520000 & -1.890000 & 0.720000 & 1.230000 & 0.050000 \\
0.600000 & 0.980000 & 3.980000 & 3.070000 & 1.580000 & -2.110000 & 0.920000 & 1.170000 & 0.060000 \\
0.700000 & 1.340000 & 3.90000