# Decision Tree (DT)

For the DT classifier, we are using the original data (without preprocessing, i.e, without partitioning the numerical attributes). However, the sklearn DT does not accept missing values. Therefore, we are imputing the missing values.

**Importing libraries**

In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score

For each *fold* of the 10-fold cross validation:
- Impute the missing values (sklearn.tree.DecisionTreeClassifier needs it)
- Train the classifier
- Predict the labels in the test dataset

### All 119 features

In [2]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

attrInfo = pd.read_csv('../data/attrInfo.csv', sep =',', header=0, index_col=None)

res = np.zeros(shape=[10,6])
print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
for i in range(10):
    dfTr = pd.read_csv('../data/folds/dataConcat.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
    dfTs = pd.read_csv('../data/folds/dataConcat.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
    
    idxlabel = dfTr.shape[1] - 1
    
    Xtrain = dfTr.iloc[:,:idxlabel]
    Ytrain = dfTr.iloc[:,idxlabel]
    Xtest = dfTs.iloc[:,:idxlabel]
    Ytest = dfTs.iloc[:,idxlabel]
    
    imp.fit(Xtrain)
    Xtrain = imp.transform(Xtrain)
    Xtest = imp.transform(Xtest)
    
    inicio = time.time()
    clf.fit(Xtrain, Ytrain)
    timeTrain = time.time() - inicio
    
    inicio = time.time()
    Ypred = clf.predict(Xtest)
    timePred = time.time() - inicio
    
    attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
    cost = sum(attrInfo['Time Cost Median'][attrs])
    
    acc = accuracy_score(Ytest, Ypred)
    bacc = balanced_accuracy_score(Ytest, Ypred)
    
    predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
    predictions.to_csv('../ResultsOther/predictions/dataConcat.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)
    
    print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
    res[i][0] = acc
    res[i][1] = bacc
    res[i][2] = clf.get_n_leaves()
    res[i][3] = len(attrs)
    res[i][4] = clf.tree_.node_count
    res[i][5] = cost

print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
print(np.mean(res, axis=0))

Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
30 77 56 153 0.9842342342342343 0.9796801346801348 1.49116044 0.11958742141723633 0.00023794174194335938
27 70 47 139 0.990990990990991 0.9897130216897659 1.118574296 0.10860872268676758 0.0002472400665283203
26 70 51 139 0.9909706546275395 0.9905495925263368 1.2957054239999999 0.11254239082336426 0.0002579689025878906
29 82 56 163 0.9841986455981941 0.9825319802149071 1.481150324 0.11832857131958008 0.00025534629821777344
29 77 52 153 0.9887133182844243 0.9880370307199576 1.471249644 0.11573481559753418 0.00024199485778808594
26 74 50 147 0.9954853273137697 0.9933333333333334 1.118827196 0.12346744537353516 0.0002422332763671875
26 70 52 139 0.9864559819413092 0.9869403496232765 1.86320622 0.12191295623779297 0.00032019615173339844
28 74 53 147 0.9796839729119639 0.9784632034632035 1.118852324 0.11664390563964844 0.00031948089599609375
25 73 53 145 0.9932279909706546 0.9936507936507937 1.

### With only the selected features

In [3]:
algsFS = ['FSbAC', 'KBestChi2', 'KBestMI', 'RFE', 'SFSbackward', 'SFSforward']

In [4]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

for alg in algsFS:
    print(alg)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        attrInfo = pd.read_csv('../data/folds/attrInfo'+alg+'.p'+str(i)+'.csv', sep =',', header=0, index_col=None)
        dfTr = pd.read_csv('../data/folds/dataConcat'+alg+'.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcat'+alg+'.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcat'+alg+'.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n')

FSbAC
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
16 49 18 97 0.9954954954954955 0.9947474747474748 0.362912292 0.01946568489074707 0.00021266937255859375
16 54 20 107 0.9977477477477478 0.9972222222222221 0.372910572 0.020594120025634766 0.00018262863159179688
16 53 20 105 0.9977426636568849 0.9972222222222221 0.3727917 0.020489931106567383 0.00012373924255371094
18 74 17 147 0.9932279909706546 0.9926060606060606 0.37286230800000003 0.01697707176208496 0.00013399124145507812
20 79 22 157 0.9932279909706546 0.9933982683982684 0.549898364 0.02394270896911621 0.0002353191375732422
19 65 23 129 0.9954853273137697 0.9957729468599034 0.912303936 0.025101900100708008 0.00014925003051757812
21 64 22 127 0.9887133182844243 0.9874814751644021 0.54004342 0.01938939094543457 0.00013303756713867188
16 53 22 105 0.9841986455981941 0.9802641802641803 0.37287560000000003 0.0211946964263916 0.00021004676818847656
20 68 21 135 0.9932279909706546 0.9

### Without the *Entry Bytes EB* attributes

In [5]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

attrInfo = pd.read_csv('../data/attrInfoWoEB.csv', sep =',', header=0, index_col=None)

res = np.zeros(shape=[10,6])
print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
for i in range(10):
    dfTr = pd.read_csv('../data/folds/dataConcatWoEB.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
    dfTs = pd.read_csv('../data/folds/dataConcatWoEB.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
    
    idxlabel = dfTr.shape[1] - 1
    
    Xtrain = dfTr.iloc[:,:idxlabel]
    Ytrain = dfTr.iloc[:,idxlabel]
    Xtest = dfTs.iloc[:,:idxlabel]
    Ytest = dfTs.iloc[:,idxlabel]
    
    imp.fit(Xtrain)
    Xtrain = imp.transform(Xtrain)
    Xtest = imp.transform(Xtest)
    
    inicio = time.time()
    clf.fit(Xtrain, Ytrain)
    timeTrain = time.time() - inicio
    
    inicio = time.time()
    Ypred = clf.predict(Xtest)
    timePred = time.time() - inicio
    
    attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
    cost = sum(attrInfo['Time Cost Median'][attrs])
    
    acc = accuracy_score(Ytest, Ypred)
    bacc = balanced_accuracy_score(Ytest, Ypred)
    
    predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
    predictions.to_csv('../ResultsOther/predictions/dataConcatWoEB.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)
    
    print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
    res[i][0] = acc
    res[i][1] = bacc
    res[i][2] = clf.get_n_leaves()
    res[i][3] = len(attrs)
    res[i][4] = clf.tree_.node_count
    res[i][5] = cost

print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
print(np.mean(res, axis=0))

Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
20 68 34 135 0.9864864864864865 0.9824579124579125 1.676521284 0.04555988311767578 0.00023746490478515625
20 69 32 137 0.990990990990991 0.9877104377104378 1.851753168 0.042299747467041016 0.00019168853759765625
20 72 34 143 0.9954853273137697 0.9941919191919193 1.851777632 0.04744386672973633 0.0002193450927734375
19 69 32 137 0.9932279909706546 0.9936304006071448 1.861644412 0.04486560821533203 0.00021719932556152344
20 71 32 141 0.9887133182844243 0.9878038628038629 2.02888164 0.042035818099975586 0.00018525123596191406
19 68 35 135 0.9954853273137697 0.995 1.8616695399999998 0.044850826263427734 0.0002200603485107422
19 67 32 133 0.9932279909706546 0.9916278166278167 1.851776304 0.04444718360900879 0.00022029876708984375
20 71 33 141 0.9729119638826185 0.9690420690420691 2.038773548 0.04037308692932129 0.0001633167266845703
20 70 34 139 0.9909706546275395 0.9881313131313132 1.861738283

### Without the *Entry Bytes EB* attributes + Feature Selection

In [6]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

for alg in algsFS:
    print(alg)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        attrInfo = pd.read_csv('../data/folds/attrInfoWoEB'+alg+'.p'+str(i)+'.csv', sep =',', header=0, index_col=None)
        dfTr = pd.read_csv('../data/folds/dataConcatWoEB'+alg+'.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcatWoEB'+alg+'.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcatWoEB'+alg+'.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n')

FSbAC
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
17 57 27 113 0.990990990990991 0.9880134680134681 1.479443724 0.03138422966003418 0.0002048015594482422
20 66 24 131 0.990990990990991 0.9877104377104378 1.84169466 0.03174591064453125 0.0002713203430175781
16 62 22 123 0.9977426636568849 0.9992248062015503 1.489239104 0.027389049530029297 0.0001995563507080078
20 64 27 127 0.9887133182844243 0.9878223197990639 1.4893118319999998 0.03084874153137207 0.00020194053649902344
16 61 29 121 0.9932279909706546 0.9931457431457431 1.489382568 0.027435779571533203 0.00014448165893554688
16 60 28 119 0.9977426636568849 0.9977777777777778 1.4794443879999999 0.025309324264526367 0.00020623207092285156
17 56 23 111 0.9887133182844243 0.9883089133089134 1.117049324 0.02585005760192871 0.00021028518676757812
17 59 24 117 0.9796839729119639 0.9739121989121989 1.4793723239999998 0.02217698097229004 0.00012803077697753906
19 67 26 133 0.993227990970654

## Testing the selected features for specific class labels (binary classification)

In [2]:
labels = ['armadillo', 'kkrunchy', 'vmprotect', 'upx']

### All features

In [4]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

attrInfo = pd.read_csv('../data/attrInfo.csv', sep =',', header=0, index_col=None)

for label in labels:
    print(label)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        dfTr = pd.read_csv('../data/folds/dataConcatL'+label+'.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcatL'+label+'.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcatL'+label+'.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n')

armadillo
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
3 4 3 7 1.0 1.0 8.4564e-05 0.012722492218017578 0.000186920166015625
3 4 3 7 1.0 1.0 8.4564e-05 0.01266789436340332 0.0002048015594482422
3 4 3 7 1.0 1.0 8.4564e-05 0.013150930404663086 0.0001919269561767578
3 4 3 7 1.0 1.0 8.4564e-05 0.012818098068237305 0.0001995563507080078
3 4 3 7 1.0 1.0 8.4564e-05 0.012827396392822266 0.0001976490020751953
3 4 3 7 1.0 1.0 8.4564e-05 0.013693809509277344 0.00020194053649902344
3 4 3 7 0.9977426636568849 0.9807692307692308 6.1428e-05 0.012836217880249023 0.000171661376953125
3 4 3 7 1.0 1.0 8.4564e-05 0.012960195541381836 0.0002110004425048828
3 4 3 7 1.0 1.0 8.4564e-05 0.013951778411865234 0.0002181529998779297
3 4 3 7 1.0 1.0 8.4564e-05 0.012509346008300781 0.00020003318786621094

Accuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost
[9.99774266e-01 9.98076923e-01 4.00000000e+00 3.00000000e+00
 7.00000000e+00 8.22504000e-05]


kkrunc

### With the features selected for each class label

In [6]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

for label in labels:
    print(label)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        attrInfo = pd.read_csv('../data/folds/attrInfoL'+label+'FS.p'+str(i)+'.csv', sep =',', header=0, index_col=None)
        dfTr = pd.read_csv('../data/folds/dataConcatL'+label+'FS.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcatL'+label+'FS.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcatL'+label+'FS.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n\n')

armadillo
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
4 5 3 9 1.0 1.0 6.1428e-05 0.0012750625610351562 0.00014090538024902344
4 5 3 9 1.0 1.0 6.1428e-05 0.0012784004211425781 0.00012230873107910156
4 5 3 9 1.0 1.0 6.1428e-05 0.0011413097381591797 0.00010657310485839844
4 5 3 9 0.9977426636568849 0.98 4.8264e-05 0.0007443428039550781 9.560585021972656e-05
4 5 3 9 1.0 1.0 6.1428e-05 0.0010075569152832031 9.775161743164062e-05
4 5 3 9 1.0 1.0 6.1428e-05 0.0009391307830810547 8.893013000488281e-05
4 5 3 9 1.0 1.0 6.1428e-05 0.00089263916015625 8.20159912109375e-05
4 5 3 9 1.0 1.0 6.1428e-05 0.0008301734924316406 7.891654968261719e-05
4 5 3 9 1.0 1.0 2.5128e-05 0.0009186267852783203 8.20159912109375e-05
4 5 3 9 1.0 1.0 6.1428e-05 0.0008301734924316406 7.843971252441406e-05

Accuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost
[9.99774266e-01 9.98000000e-01 5.00000000e+00 3.00000000e+00
 9.00000000e+00 5.64816000e-05]



kkrunchy


### WoEB

In [7]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

attrInfo = pd.read_csv('../data/attrInfoWoEB.csv', sep =',', header=0, index_col=None)

for label in labels:
    print(label)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        dfTr = pd.read_csv('../data/folds/dataConcatWoEBL'+label+'.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcatWoEBL'+label+'.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcatWoEBL'+label+'.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n')

armadillo
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
2 3 2 5 1.0 1.0 0.724646 0.006119728088378906 0.00013566017150878906
2 3 2 5 1.0 1.0 0.724646 0.005777120590209961 0.00020170211791992188
2 3 2 5 1.0 1.0 0.724646 0.005443096160888672 0.00013017654418945312
2 3 2 5 1.0 1.0 0.724646 0.0052945613861083984 0.0001270771026611328
2 3 2 5 1.0 1.0 0.724646 0.00833892822265625 0.0002818107604980469
2 3 2 5 1.0 1.0 0.724646 0.005766630172729492 0.0001838207244873047
2 3 2 5 1.0 1.0 0.724646 0.005287647247314453 0.0001277923583984375
2 3 2 5 0.9954853273137697 0.9976019184652278 0.724646 0.005346059799194336 0.00012373924255371094
2 3 2 5 1.0 1.0 0.724646 0.0055887699127197266 0.00013518333435058594
2 3 2 5 1.0 1.0 0.724646 0.0060389041900634766 0.0002486705780029297

Accuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost
[0.99954853 0.99976019 3.         2.         5.         0.724646  ]


kkrunchy
Depth - #Rules - #Attrs - #Operati

### WoEB + FS

In [8]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=100)

for label in labels:
    print(label)
    res = np.zeros(shape=[10,6])
    print('Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred')
    for i in range(10):
        attrInfo = pd.read_csv('../data/folds/attrInfoWoEBL'+label+'FS.p'+str(i)+'.csv', sep =',', header=0, index_col=None)
        dfTr = pd.read_csv('../data/folds/dataConcatWoEBL'+label+'FS.train.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)
        dfTs = pd.read_csv('../data/folds/dataConcatWoEBL'+label+'FS.test.p'+str(i)+'.csv', sep =',', header=None, na_values=-1, index_col=None)

        idxlabel = dfTr.shape[1] - 1

        Xtrain = dfTr.iloc[:,:idxlabel]
        Ytrain = dfTr.iloc[:,idxlabel]
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]

        imp.fit(Xtrain)
        Xtrain = imp.transform(Xtrain)
        Xtest = imp.transform(Xtest)

        inicio = time.time()
        clf.fit(Xtrain, Ytrain)
        timeTrain = time.time() - inicio

        inicio = time.time()
        Ypred = clf.predict(Xtest)
        timePred = time.time() - inicio

        attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
        cost = sum(attrInfo['Time Cost Median'][attrs])

        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)
        
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/dataConcatWoEBL'+label+'FS.p'+str(i)+'.DT.csv', sep=',', header=True, index=False)

        print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count, acc, bacc, cost, timeTrain, timePred)
        res[i][0] = acc
        res[i][1] = bacc
        res[i][2] = clf.get_n_leaves()
        res[i][3] = len(attrs)
        res[i][4] = clf.tree_.node_count
        res[i][5] = cost

    print('\nAccuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost')
    print(np.mean(res, axis=0))
    print('\n\n')

armadillo
Depth - #Rules - #Attrs - #Operations - Accuracy - B.Accuracy - Cost - RuntimeTrain - RuntimePred
4 5 3 9 1.0 1.0 7.14e-05 0.0011734962463378906 0.0001552104949951172
4 5 3 9 1.0 1.0 7.14e-05 0.0011143684387207031 0.00017452239990234375
4 5 3 9 1.0 1.0 7.14e-05 0.0009326934814453125 0.00010204315185546875
4 5 3 9 0.9977426636568849 0.98 4.8264e-05 0.0007343292236328125 9.274482727050781e-05
4 5 3 9 1.0 1.0 7.14e-05 0.0008573532104492188 8.96453857421875e-05
4 5 3 9 1.0 1.0 7.14e-05 0.0012793540954589844 0.0001327991485595703
4 5 3 9 1.0 1.0 7.14e-05 0.0007991790771484375 8.368492126464844e-05
4 5 3 9 1.0 1.0 7.14e-05 0.0007939338684082031 8.535385131835938e-05
4 5 3 9 1.0 1.0 2.5128e-05 0.0012125968933105469 9.512901306152344e-05
4 5 3 9 1.0 1.0 7.14e-05 0.0008509159088134766 8.916854858398438e-05

Accuracy - B.Accuracy - #Rules - #Attrs - #Operations - Cost
[9.99774266e-01 9.98000000e-01 5.00000000e+00 3.00000000e+00
 9.00000000e+00 6.44592000e-05]



kkrunchy
Depth - #Rules

## Does the features learned in the synthetic data work well in the real-world data?

In [12]:
filesTest = ['2019-09', '2019-10', '2019-11', '2019-12', '2020-01', '2020-02', '2020-03', '2020-04', '2020-05']

### All 119 features

In [13]:
dfTr = pd.read_csv('../dataPrivate/2019-08.Merged.csv', sep =',', header=None, na_values=-1, index_col=0)

idxlabel = dfTr.shape[1] - 1
Xtrain = dfTr.iloc[:,:idxlabel]
Ytrain = dfTr.iloc[:,idxlabel]

imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(Xtrain)
Xtrain = imp.transform(Xtrain)

clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=200)
clf.fit(Xtrain, Ytrain)
attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count)

accs = []
baccs = []
for file in filesTest:
    dfTs = pd.read_csv('../dataPrivate/'+file+'.Merged.csv', sep =',', header=None, na_values=-1, index_col=0)
    Xtest = dfTs.iloc[:,:idxlabel]
    Ytest = dfTs.iloc[:,idxlabel]
    Xtest = imp.transform(Xtest)
    Ypred = clf.predict(Xtest)
    acc = accuracy_score(Ytest, Ypred)
    bacc = balanced_accuracy_score(Ytest, Ypred)   
    print(acc, bacc)
    accs.append(acc)
    baccs.append(bacc)
    predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
    predictions.to_csv('../ResultsOther/predictions/'+file+'.Merged.DT.csv', sep=',', header=True, index=False)
    
print('Accuracy:', np.mean(accs), np.std(accs))
print('B.Accuracy:', np.mean(baccs), np.std(accs))

27 166 76 331
0.9932846412129184 0.9264271054219063
0.9893494943699347 0.9109608109151014
0.9888835380046552 0.886370816981576
0.9895093188497451 0.9103015478755925
0.9940901017277571 0.9085278190348401
0.9906548501672472 0.8890991809697197




0.9935810810810811 0.9577304604157385
0.9846595077702928 0.8387680584732022
0.9869930319814186 0.8029350265249295
Accuracy: 0.9901117294627834 0.0029838741802020053
B.Accuracy: 0.8923467585125119 0.0029838741802020053


### With the selected features

In [14]:
for alg in algsFS:
    print(alg)
    dfTr = pd.read_csv('../dataPrivate/2019-08.Merged'+alg+'.csv', sep =',', header=None, na_values=-1, index_col=0)

    idxlabel = dfTr.shape[1] - 1
    Xtrain = dfTr.iloc[:,:idxlabel]
    Ytrain = dfTr.iloc[:,idxlabel]

    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(Xtrain)
    Xtrain = imp.transform(Xtrain)

    clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=200)
    clf.fit(Xtrain, Ytrain)
    attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
    print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count)

    accs = []
    baccs = []
    for file in filesTest:
        dfTs = pd.read_csv('../dataPrivate/'+file+'.Merged'+alg+'.csv', sep =',', header=None, na_values=-1, index_col=0)
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]
        Xtest = imp.transform(Xtest)
        Ypred = clf.predict(Xtest)
        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)   
        print(acc, bacc)
        accs.append(acc)
        baccs.append(bacc)
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/'+file+'.Merged'+alg+'.DT.csv', sep=',', header=True, index=False)
    
    print('Accuracy:', np.mean(accs), np.std(accs))
    print('B.Accuracy:', np.mean(baccs), np.std(accs))
    print('\n')

FSbAC
28 200 25 399
0.9947730970265499 0.9002874055438097
0.989959119271319 0.8718303752617889
0.9930973593386307 0.8273586889538975
0.9965775082772218 0.8458936029785923
0.9974487324398514 0.8805858818870369
0.9915514328080278 0.8470101258282577
0.9950675675675675 0.9075941666262412
0.9896284932968719 0.7822928322461965
0.9882437019832053 0.7310510010833418
Accuracy: 0.992927445778805 0.003078696100303556
B.Accuracy: 0.8437671200454625 0.003078696100303556


KBestChi2
26 200 29 399
0.9890269652809027 0.9007845517908019
0.9805995840206555 0.8558457354718415
0.9823019503973032 0.8433891904286783
0.98582642014806 0.8217632354969711
0.9912158889068303 0.874982437163946
0.9942756646780924 0.883504298031402
0.9925 0.7780321193082733
0.9864603481624759 0.8587262353145658




0.9850991602644273 0.739807681092707
Accuracy: 0.9874784424287496 0.004363053719932012
B.Accuracy: 0.8396483871221319 0.004363053719932012


KBestMI
19 200 29 399
0.9935615632247569 0.9055658136737319
0.9921824571469554 0.893325083631262
0.9951842041897424 0.9061268845544767
0.9968007142591422 0.8691995390029045
0.9968997254965284 0.8908657078268449
0.994551536259871 0.8993869908759545
0.9970270270270271 0.9013407205389056
0.995164410058027 0.8936846763580848




0.9911381097016259 0.8781845737736932
Accuracy: 0.9947233052626306 0.001984810483901444
B.Accuracy: 0.8930755544706508 0.001984810483901444


RFE
18 200 27 399
0.9937692547336356 0.9407159271059565
0.9874847593774654 0.8960471684368174
0.9946624929769644 0.8682931031739699
0.9958334883374874 0.8487795079886604
0.9975456160180849 0.8872703198148925
0.994758439946205 0.90739697187756
0.9966891891891891 0.8637868712869206
0.9939638497965717 0.831178433468167




0.9909951759871359 0.8065079048321719
Accuracy: 0.9939669184847487 0.0029002100972992054
B.Accuracy: 0.8722195786650129 0.0029002100972992054


SFSbackward
24 200 28 399
0.9901346533282565 0.9281375324083877
0.9883095460087499 0.910900954483683
0.9849506380929448 0.8912431574052792
0.9877608719913694 0.8524027582362533
0.9916034232197642 0.8771835317377183
0.9944480844167041 0.9249730547847409
0.9907432432432433 0.9347822238500653
0.9716867871673448 0.8418846368292081




0.9678399142397713 0.7668987136736285
Accuracy: 0.9852752401897943 0.008703941095968644
B.Accuracy: 0.880934062600996 0.008703941095968644


SFSforward
17 200 29 399
0.9913115718785697 0.8975165727557096
0.9828587821846088 0.8487444008989503
0.9910105144875191 0.834876127447739
0.9787210297235966 0.8260543202181575
0.9946391086710803 0.8662977309999705
0.9917583364943618 0.8367186815127848
0.9879054054054054 0.892509009392302
0.9820249449743214 0.7229414535660791
0.944184384491692 0.6470209967466081
Accuracy: 0.9827126753679062 0.014509307932558375
B.Accuracy: 0.8191865881709224 0.014509307932558375




### Without the *Entry Bytes EB* attributes

In [15]:
dfTr = pd.read_csv('../dataPrivate/2019-08.MergedWoEB.csv', sep =',', header=None, na_values=-1, index_col=0)

idxlabel = dfTr.shape[1] - 1
Xtrain = dfTr.iloc[:,:idxlabel]
Ytrain = dfTr.iloc[:,idxlabel]

imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(Xtrain)
Xtrain = imp.transform(Xtrain)

clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=200)
clf.fit(Xtrain, Ytrain)
attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count)

accs = []
baccs = []
for file in filesTest:
    dfTs = pd.read_csv('../dataPrivate/'+file+'.MergedWoEB.csv', sep =',', header=None, na_values=-1, index_col=0)
    Xtest = dfTs.iloc[:,:idxlabel]
    Ytest = dfTs.iloc[:,idxlabel]
    Xtest = imp.transform(Xtest)
    Ypred = clf.predict(Xtest)
    acc = accuracy_score(Ytest, Ypred)
    bacc = balanced_accuracy_score(Ytest, Ypred)   
    print(acc, bacc)
    accs.append(acc)
    baccs.append(bacc)
    predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
    predictions.to_csv('../ResultsOther/predictions/'+file+'.MergedWoEB.DT.csv', sep=',', header=True, index=False)

print('Accuracy:', np.mean(accs), np.std(accs))
print('B.Accuracy:', np.mean(baccs), np.std(accs))

16 200 39 399
0.9879885077365087 0.878035765061791
0.9782328049917521 0.8103032698506084
0.97869010353961 0.7907247911212975
0.9665563037089394 0.7852944448425406
0.9806232843533021 0.7900015828428646
0.9898617193696334 0.8524789212893936
0.9796283783783784 0.8331217667848777
0.9734542786633762 0.7366551483013202
0.971270323387529 0.6796973201934031
Accuracy: 0.9784784115698921 0.007034713664836186
B.Accuracy: 0.7951458900320107 0.007034713664836186




### Without the *Entry Bytes EB* attributes + Feature Selection

In [16]:
for alg in algsFS:
    print(alg)
    dfTr = pd.read_csv('../dataPrivate/2019-08.MergedWoEB'+alg+'.csv', sep =',', header=None, na_values=-1, index_col=0)

    idxlabel = dfTr.shape[1] - 1
    Xtrain = dfTr.iloc[:,:idxlabel]
    Ytrain = dfTr.iloc[:,idxlabel]

    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(Xtrain)
    Xtrain = imp.transform(Xtrain)

    clf = DecisionTreeClassifier(random_state=0, max_depth=None, max_leaf_nodes=200)
    clf.fit(Xtrain, Ytrain)
    attrs = np.unique(clf.tree_.feature[clf.tree_.feature > -2])
    print(clf.get_depth(), clf.get_n_leaves(), len(attrs), clf.tree_.node_count)

    accs = []
    baccs = []
    for file in filesTest:
        dfTs = pd.read_csv('../dataPrivate/'+file+'.MergedWoEB'+alg+'.csv', sep =',', header=None, na_values=-1, index_col=0)
        Xtest = dfTs.iloc[:,:idxlabel]
        Ytest = dfTs.iloc[:,idxlabel]
        Xtest = imp.transform(Xtest)
        Ypred = clf.predict(Xtest)
        acc = accuracy_score(Ytest, Ypred)
        bacc = balanced_accuracy_score(Ytest, Ypred)   
        print(acc, bacc)
        accs.append(acc)
        baccs.append(bacc)
        predictions = pd.DataFrame(data=list(zip(Ytest, Ypred)), columns=["True", "Predicted"])
        predictions.to_csv('../ResultsOther/predictions/'+file+'.MergedWoEB'+alg+'.DT.csv', sep=',', header=True, index=False)
        
    print('Accuracy:', np.mean(accs), np.std(accs))
    print('B.Accuracy:', np.mean(baccs), np.std(accs))
    print('\n')

FSbAC
21 200 30 399
0.9778462390529267 0.8705399889582929
0.9729254823208778 0.8127198784372877
0.973513123043583 0.7686193790315851
0.9514898999293181 0.7798326547342346
0.9675117067657032 0.8110977866063406
0.9832752853546673 0.8308072139234055
0.9740878378378378 0.8515745296754766
0.9644834255986127 0.7388589322473645
0.9701983205288548 0.6348657369549214




Accuracy: 0.9705923689369312 0.008508406238204407
B.Accuracy: 0.7887684556187676 0.008508406238204407


KBestChi2
24 200 31 399
0.9885423517601856 0.9113772663238601
0.9779100623969017 0.8618476789565009
0.9755197046311903 0.8316421899566785
0.962501395037387 0.8441535859152597
0.9831422573873728 0.8842653895799054
0.9878961343494603 0.9119782289460449
0.9824324324324324 0.9064044088027197
0.9747882345094377 0.807128830013066




0.9513310702161872 0.7141782225125451
Accuracy: 0.9760070714133948 0.011485894769473577
B.Accuracy: 0.8525528667785089 0.011485894769473577


KBestMI
16 200 30 399
0.9776385475440479 0.8821459479126403
0.9727461808792943 0.812307008405444
0.9743157556786259 0.7785982019815775
0.9536847587515346 0.7744219512670732
0.9681898918133376 0.7847717535685292
0.9787234042553191 0.8120076606889394
0.9736824324324325 0.802453814874069
0.9645834722870673 0.6961541004348791




0.9719135251027337 0.6303331410156711
Accuracy: 0.9706086631938214 0.007251838517678866
B.Accuracy: 0.7747992866832025 0.007251838517678866


RFE
19 200 32 399
0.9875038942157914 0.8804128006132145
0.9780893638384852 0.8139218937149971
0.9784894453808491 0.7945397459007967
0.9659238867601652 0.7922396987280751
0.9866300662037785 0.821281178397068
0.9888616848856857 0.8348262697187755
0.9821621621621621 0.8522562006775382
0.9765557260054692 0.7369767250265552




0.9486510630695015 0.6340640591450022
Accuracy: 0.9769852547246543 0.012020170772411975
B.Accuracy: 0.7956131746580025 0.012020170772411975


SFSbackward
19 200 30 399
0.9760116307244973 0.9098336988407478
0.9679050419565374 0.8421865876385174
0.9742756240468737 0.8234005979409952
0.9494438450950485 0.8099832909457713
0.968125302761182 0.8814888350339801
0.9792406634711541 0.8951301841924292
0.9723986486486487 0.8614742938594042
0.9656172880677649 0.7762981570355014
0.944041450777202 0.6561389290975725
Accuracy: 0.9663399439498789 0.011296573886337246
B.Accuracy: 0.8284371749538799 0.011296573886337246


SFSforward
21 200 29 399
0.9786077745854823 0.9197606243307108
0.9636376676468479 0.854815415238683
0.9749578617866602 0.8492943295618002
0.9399947918604219 0.8200289273613889
0.9483610528015501 0.8803888702541137
0.9415497086106418 0.8633359328228948
0.9639189189189189 0.8940064706352768
0.9383712399119589 0.7725706812381965
0.938038234768626 0.6701266049708342
Accuracy: 0.95415969454