In [1]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from itertools import chain, combinations, permutations
from skmultilearn.problem_transform import LabelPowerset
from sklearn.dummy import DummyClassifier

from collections import OrderedDict

In [2]:
# Load the features-plus-descriptions table and index it by the
# 'Feature Type and Number' column (chained, so the cell stays idempotent).
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
)

In [3]:
# Names of the six per-heuristic indicator columns: H0_Best ... H5_Best.
H_Best = ['H{}_Best'.format(idx) for idx in range(6)]

# Raw tables: the multilabel ("strict") data and the multiclass data.
df_multilabel_strict = pd.read_csv('../data/multilabel_raw_data_strict.csv')
df_multiclass = pd.read_csv('../data/multiclass_target_raw_data.csv')

# Targets: the indicator columns (multilabel) and 'Best Heuristic' (multiclass).
y_strict = df_multilabel_strict.loc[:, H_Best]
y_best = df_multiclass.loc[:, 'Best Heuristic']

# Features: everything that is not a target column.
X_strict = df_multilabel_strict.drop(columns=H_Best)
X_orig = df_multiclass.drop(columns=H_Best + ['Best Heuristic'])
X_orig.head()

Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,D30,D31,D32,D33,D34,D35,D36,D37,D38,D39
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797


In [4]:
# Display the multilabel target frame (the H_Best columns of the strict data).
y_strict

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,1,0,1,1
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
6113,1,0,0,0,0,0
6114,1,0,0,0,0,0
6115,1,0,0,0,0,0
6116,1,0,0,0,0,0


In [5]:
# The same targets as a plain NumPy array (the form the counting helpers index row by row).
np.array(y_strict)

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1],
       [1, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [6]:
def get_unique_combos(X, y):
    """Return the label-combination mapping discovered by a LabelPowerset fit.

    A ``LabelPowerset`` wrapped around a ``DummyClassifier`` is fitted on
    ``(X, y)``; only the transformer's ``unique_combinations_`` attribute is
    returned. As printed later in this notebook, its keys are comma-separated
    label indices (e.g. ``'1,2,4,5'``) and its values are integer class ids.
    """
    powerset_model = LabelPowerset(DummyClassifier(strategy='stratified'))
    powerset_model.fit(X, y)
    combos = powerset_model.unique_combinations_
    return combos

def print_mapping(dictionary):
    """Print each entry of ``dictionary`` on its own line as ``key : value``."""
    for name, value in dictionary.items():
        print('{} : {}'.format(name, value))
        
def str_keys_to_list(string):
    """Parse a comma-separated label key into a list of integer label indices.

    ``'1,2,4,5'`` becomes ``[1, 2, 4, 5]`` and ``'0'`` becomes ``[0]``.

    An empty (or whitespace-only) key returns ``[]`` instead of raising.
    LabelPowerset builds its keys by joining the active label indices with
    commas, so a sample with no active label is stored under ``''``.

    Raises
    ------
    ValueError
        If a non-empty key contains a token that is not an integer
        (e.g. ``'1,,2'`` or ``'a'``).
    """
    if not string.strip():
        return []
    return [int(num) for num in string.split(',')]

In [7]:
# Mapping from each label combination found in y_strict to its powerset class id.
unique_strict = get_unique_combos(X_strict, y_strict)

In [8]:
def get_powerset_counts(target, powerset):
    """Count how many rows of ``target`` match each label combination exactly.

    Parameters
    ----------
    target : array-like of shape (n_samples, n_labels)
        Indicator matrix, one row per sample. Each row is compared for exact
        equality against a 0/1 pattern built from a key of ``powerset``.
    powerset : mapping
        Keys are comma-separated label indices (e.g. ``'1,2,4,5'``);
        the values are not used.

    Returns
    -------
    dict
        ``{combo: number of matching rows}`` in the iteration order of
        ``powerset``.

    Raises
    ------
    ValueError
        If ``target`` is non-empty but not two-dimensional.
    IndexError
        If a key names a label index outside the columns of ``target``.
    """
    target = np.asarray(target)
    if target.size == 0:
        # Nothing to match against: every combination has a count of zero.
        return {combo: 0 for combo in powerset}
    if target.ndim != 2:
        raise ValueError(
            'target must be 2-D (n_samples, n_labels), got shape {}'.format(target.shape))

    # The pattern width is taken from the data rather than hard-coded, so the
    # same function works for any number of label columns.
    n_labels = target.shape[1]
    store_counts = {}
    for combo in powerset:
        pattern = np.zeros(n_labels, dtype=int)
        for i in str_keys_to_list(combo):
            pattern[i] = 1
        # One vectorised comparison per combination replaces the per-row
        # np.array_equal loop.
        store_counts[combo] = int((target == pattern).all(axis=1).sum())
    return store_counts


In [9]:
# Count how often each label combination occurs in the strict targets.
arr_y_strict = np.array(y_strict)
counts_strict = get_powerset_counts(arr_y_strict, unique_strict)

# Order combinations from rarest to most frequent (sorted() is stable, so ties
# keep the mapping's original order), then tabulate them.
sorted_counts_strict = OrderedDict(sorted(counts_strict.items(), key= lambda k: k[1]))
counts_strict_df = pd.DataFrame(sorted_counts_strict.values(), index=sorted_counts_strict.keys(), columns=['strict'])
print(counts_strict_df)

           strict
1,2,3,5         3
1,2,5           5
1,2,3,4         7
1,3,4           7
2,3,4           9
1,3,5          10
1,2,3          12
3,4            12
2,3,5          13
2,4,5          15
2,5            16
1,2,4          26
2,3,4,5        26
2,3            29
1,2            31
3,4,5          31
1,5            31
2,4            31
1,3            34
1,2,4,5        38
1,4,5          63
1,3,4,5        69
1,4            75
1,2,3,4,5      84
3,5            85
4,5           141
2             347
4             476
1             594
3             620
5             624
0            2554


In [10]:
# Show which powerset class id each original label combination was assigned.
print_mapping(unique_strict)

0 : 0
1,2,4,5 : 1
1,2,4 : 2
3 : 3
4,5 : 4
2,3 : 5
5 : 6
1 : 7
1,2,3,4,5 : 8
1,4 : 9
2 : 10
1,3,4,5 : 11
1,2,3 : 12
2,3,4,5 : 13
4 : 14
1,2 : 15
3,5 : 16
2,4,5 : 17
2,5 : 18
2,3,4 : 19
1,3 : 20
3,4,5 : 21
1,5 : 22
1,4,5 : 23
1,3,5 : 24
3,4 : 25
1,2,3,4 : 26
1,3,4 : 27
2,3,5 : 28
2,4 : 29
1,2,5 : 30
1,2,3,5 : 31


In [11]:
# Print the raw target matrix for reference. The bare `np.array(y_strict)`
# expression that used to precede this line was discarded (only a cell's last
# expression is displayed), so it has been removed.
print(np.array(y_strict))

[[1 0 0 0 0 0]
 [0 1 1 0 1 1]
 [1 0 0 0 0 0]
 ...
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]]


In [12]:
def modify_target_labels(target, powerset):
    """Re-encode 6-label targets as 8 labels by fusing two co-occurring pairs.

    For every label combination in ``powerset``:

    * if labels 1 and 2 are both active they are replaced by new label 6;
    * if labels 4 and 5 are both active they are replaced by new label 7;
    * all other active labels keep their index.

    Each row of ``target`` that equals the combination's 0/1 pattern receives
    the re-encoded labels. Rows matching no combination stay all-zero.

    Parameters
    ----------
    target : array-like of shape (n_samples, 6)
        Indicator matrix (a DataFrame is accepted and converted).
    powerset : mapping
        Keys are comma-separated label indices (e.g. ``'1,2,4,5'``);
        the values are not used.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 8), dtype int

    Side effects
    ------------
    Prints the number of rows that matched some combination, as a sanity
    check against ``n_samples``.

    Raises
    ------
    ValueError
        If ``target`` is non-empty and not of shape ``(n_samples, 6)``.
        Previously a wrong width silently produced an all-zero result.
    """
    target = np.asarray(target)
    N = len(target)
    target_newlabels = np.zeros(shape=(N, 8), dtype=int)
    count = 0
    if N:
        if target.ndim != 2 or target.shape[1] != 6:
            raise ValueError(
                'target must have shape (n_samples, 6), got {}'.format(target.shape))
        for combo in powerset:
            int_list = str_keys_to_list(combo)  # parse the key once
            merged = list(int_list)
            # (first, second, fused): the pair is replaced by the fused label.
            # Membership is tested on the original list, removal is on the copy.
            for first, second, fused in ((1, 2, 6), (4, 5, 7)):
                if first in int_list and second in int_list:
                    merged.remove(first)
                    merged.remove(second)
                    merged.append(fused)
            pattern = np.zeros(6, dtype=int)
            for i in int_list:
                pattern[i] = 1
            # Vectorised exact-row match instead of a Python loop over all rows.
            rows = np.flatnonzero((target == pattern).all(axis=1))
            for j in merged:
                target_newlabels[rows, j] = 1
            count += int(rows.size)

    print(count)
    return target_newlabels


In [13]:
# Re-encode the strict targets; the printed number is how many rows matched a known combination.
target_newlabels = modify_target_labels(np.array(y_strict), unique_strict)

6118


In [14]:
# Same re-encoding as the previous cell, converting with DataFrame.to_numpy()
# instead of np.array(); target_newlabels is simply overwritten.
target_newlabels = modify_target_labels(y_strict.to_numpy(), unique_strict)

6118


In [15]:
# Inspect the re-encoded label matrix returned by modify_target_labels.
target_newlabels

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [16]:
# Wrap the re-encoded labels in a DataFrame (default integer column names).
target_new = pd.DataFrame(target_newlabels)

In [17]:
# Display the re-encoded targets as a frame.
target_new

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
6113,1,0,0,0,0,0,0,0
6114,1,0,0,0,0,0,0,0
6115,1,0,0,0,0,0,0,0
6116,1,0,0,0,0,0,0,0


In [18]:
def get_unique_combos(X, y):
    """Fit a LabelPowerset on ``(X, y)`` and return its ``unique_combinations_``.

    NOTE: this is a verbatim re-definition of the helper already defined in
    cell 6 of this notebook; it is kept so the cell still defines the name.
    """
    label_powerset = LabelPowerset(DummyClassifier(strategy='stratified'))
    label_powerset.fit(X, y)
    found = label_powerset.unique_combinations_
    return found

In [19]:
# Label combinations present after the first re-encoding (targets are now target_new).
unique_new = get_unique_combos(X_strict, target_new)

In [20]:
# Show the combination -> class id mapping for the re-encoded labels.
print_mapping(unique_new)

0 : 0
6,7 : 1
4,6 : 2
3 : 3
7 : 4
2,3 : 5
5 : 6
1 : 7
3,6,7 : 8
1,4 : 9
2 : 10
1,3,7 : 11
3,6 : 12
2,3,7 : 13
4 : 14
6 : 15
3,5 : 16
2,7 : 17
2,5 : 18
2,3,4 : 19
1,3 : 20
3,7 : 21
1,5 : 22
1,7 : 23
1,3,5 : 24
3,4 : 25
3,4,6 : 26
1,3,4 : 27
2,3,5 : 28
2,4 : 29
5,6 : 30
3,5,6 : 31


In [21]:
def get_powerset_counts_new(target, powerset):
    """Count how many rows of ``target`` match each label combination exactly.

    The pattern width is read from ``target`` instead of being hard-coded, so
    the result does not depend on which width-specific definition of this
    function was executed last.

    Parameters
    ----------
    target : array-like of shape (n_samples, n_labels)
        Indicator matrix, one row per sample.
    powerset : mapping
        Keys are comma-separated label indices (e.g. ``'3,6,7'``);
        the values are not used.

    Returns
    -------
    dict
        ``{combo: number of matching rows}`` in the iteration order of
        ``powerset``.

    Raises
    ------
    ValueError
        If ``target`` is non-empty but not two-dimensional.
    IndexError
        If a key names a label index outside the columns of ``target``.
    """
    target = np.asarray(target)
    if target.size == 0:
        # Nothing to match against: every combination has a count of zero.
        return {combo: 0 for combo in powerset}
    if target.ndim != 2:
        raise ValueError(
            'target must be 2-D (n_samples, n_labels), got shape {}'.format(target.shape))

    n_labels = target.shape[1]
    store_counts = {}
    for combo in powerset:
        pattern = np.zeros(n_labels, dtype=int)
        for i in str_keys_to_list(combo):
            pattern[i] = 1
        # Vectorised exact-row comparison; replaces the per-row np.array_equal loop.
        store_counts[combo] = int((target == pattern).all(axis=1).sum())
    return store_counts

In [22]:
# Frequency of each combination after the first re-encoding, rarest first.
counts_new = get_powerset_counts_new(np.array(target_new), unique_new)
sorted_counts_new = OrderedDict(sorted(counts_new.items(), key= lambda k: k[1]))

# Tabulate; the index holds the combination keys.
counts_new_df = pd.DataFrame(sorted_counts_new.values(),index=sorted_counts_new.keys(), columns=['strict_new'])
counts_new_df

Unnamed: 0,strict_new
356,3
56,5
346,7
134,7
234,9
135,10
36,12
34,12
235,13
27,15


In [23]:
def modify_target_labels_round2(target, powerset):
    """Second re-encoding pass: fuse labels 3 and 7 into a new label 8.

    For every label combination in ``powerset``, if labels 3 and 7 are both
    active they are replaced by label 8; all other active labels keep their
    index. Each row of ``target`` that equals the combination's 0/1 pattern
    receives the re-encoded labels. Rows matching no combination stay all-zero.

    Parameters
    ----------
    target : array-like of shape (n_samples, 8)
        Indicator matrix (a DataFrame is accepted and converted).
    powerset : mapping
        Keys are comma-separated label indices (e.g. ``'2,3,7'``);
        the values are not used.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 9), dtype int

    Side effects
    ------------
    Prints the number of rows that matched some combination, as a sanity
    check against ``n_samples``.

    Raises
    ------
    ValueError
        If ``target`` is non-empty and not of shape ``(n_samples, 8)``.
        Previously a wrong width silently produced an all-zero result.
    """
    target = np.asarray(target)
    N = len(target)
    target_newlabels = np.zeros(shape=(N, 9), dtype=int)
    count = 0
    if N:
        if target.ndim != 2 or target.shape[1] != 8:
            raise ValueError(
                'target must have shape (n_samples, 8), got {}'.format(target.shape))
        for combo in powerset:
            int_list = str_keys_to_list(combo)  # parse the key once
            merged = list(int_list)
            if 3 in int_list and 7 in int_list:
                merged.remove(3)
                merged.remove(7)
                merged.append(8)
            pattern = np.zeros(8, dtype=int)
            for i in int_list:
                pattern[i] = 1
            # Vectorised exact-row match instead of a Python loop over all rows.
            rows = np.flatnonzero((target == pattern).all(axis=1))
            for j in merged:
                target_newlabels[rows, j] = 1
            count += int(rows.size)

    print(count)
    return target_newlabels


In [24]:
# Second pass over the already re-encoded labels; the printed number is the
# count of rows that matched a known combination.
target_new2 = modify_target_labels_round2(target_newlabels, unique_new)
target_new2_df = pd.DataFrame(target_new2)

6118


In [25]:
# Label combinations present after the second re-encoding.
unique_new2 = get_unique_combos(X_strict, target_new2_df)

In [26]:
def get_powerset_counts_new(target, powerset):
    """Count how many rows of ``target`` match each label combination exactly.

    This name is also defined in an earlier cell. The pattern width is read
    from ``target`` instead of being hard-coded, so this definition gives
    correct counts for any number of label columns, regardless of which cell
    ran last.

    Parameters
    ----------
    target : array-like of shape (n_samples, n_labels)
        Indicator matrix, one row per sample.
    powerset : mapping
        Keys are comma-separated label indices (e.g. ``'2,8'``);
        the values are not used.

    Returns
    -------
    dict
        ``{combo: number of matching rows}`` in the iteration order of
        ``powerset``.

    Raises
    ------
    ValueError
        If ``target`` is non-empty but not two-dimensional.
    IndexError
        If a key names a label index outside the columns of ``target``.
    """
    target = np.asarray(target)
    if target.size == 0:
        # Nothing to match against: every combination has a count of zero.
        return {combo: 0 for combo in powerset}
    if target.ndim != 2:
        raise ValueError(
            'target must be 2-D (n_samples, n_labels), got shape {}'.format(target.shape))

    n_labels = target.shape[1]
    store_counts = {}
    for combo in powerset:
        pattern = np.zeros(n_labels, dtype=int)
        for i in str_keys_to_list(combo):
            pattern[i] = 1
        # Vectorised exact-row comparison; replaces the per-row np.array_equal loop.
        store_counts[combo] = int((target == pattern).all(axis=1).sum())
    return store_counts

In [27]:
# Frequency of each combination after the second re-encoding, rarest first.
counts_new2 = get_powerset_counts_new(target_new2, unique_new2)
sorted_counts_new2 = OrderedDict(sorted(counts_new2.items(), key= lambda k: k[1]))



In [28]:
# Inspect the round-2 combination counts (ascending by count).
sorted_counts_new2

OrderedDict([('3,5,6', 3),
             ('5,6', 5),
             ('3,4,6', 7),
             ('1,3,4', 7),
             ('2,3,4', 9),
             ('1,3,5', 10),
             ('3,6', 12),
             ('3,4', 12),
             ('2,3,5', 13),
             ('2,7', 15),
             ('2,5', 16),
             ('4,6', 26),
             ('2,8', 26),
             ('2,3', 29),
             ('6', 31),
             ('8', 31),
             ('1,5', 31),
             ('2,4', 31),
             ('1,3', 34),
             ('6,7', 38),
             ('1,7', 63),
             ('1,8', 69),
             ('1,4', 75),
             ('6,8', 84),
             ('3,5', 85),
             ('7', 141),
             ('2', 347),
             ('4', 476),
             ('1', 594),
             ('3', 620),
             ('5', 624),
             ('0', 2554)])

In [29]:
# Tabulate the round-2 counts. Note the column is still labelled 'strict_new',
# the same label used for the round-1 table.
counts_new_df2 = pd.DataFrame(sorted_counts_new2.values(),index=sorted_counts_new2.keys(), columns=['strict_new'])
counts_new_df2

Unnamed: 0,strict_new
356,3
56,5
346,7
134,7
234,9
135,10
36,12
34,12
235,13
27,15


In [30]:
# Attach the round-2 label columns to the features by joining on the row index.
df_merged = X_strict.merge(target_new2_df, left_index=True, right_index=True)

In [31]:
# Quick look at the merged frame: feature columns followed by the new label columns.
df_merged.head()

Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,D39,0,1,2,3,4,5,6,7,8
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.18797,1,0,0,0,0,0,0,0,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.18797,0,0,0,0,0,0,1,1,0
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.18797,1,0,0,0,0,0,0,0,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.18797,1,0,0,0,0,0,0,0,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.18797,1,0,0,0,0,0,0,0,0


In [32]:
# Persist features + re-encoded labels. index=None is presumably treated like
# index=False (row index not written) — confirm against the pandas version in use.
df_merged.to_csv('../data/raw_data_modified_classes.csv', index=None)