In [1]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from itertools import chain, combinations, permutations
from skmultilearn.problem_transform import LabelPowerset
from sklearn.dummy import DummyClassifier

from collections import OrderedDict
import time
# Wall-clock timestamp taken when the notebook starts; the code that reads it
# is not in the visible cells (presumably a total-runtime report — TODO confirm).
t_start = time.time()

In [2]:
# Load the feature descriptions and index them by 'Feature Type and Number'.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
)

In [3]:
# Raw inputs: the multilabel file carries one H*_Best indicator column per
# heuristic; the multiclass file additionally carries 'Best Heuristic'.
df_multilabel = pd.read_csv('../data/multilabel_raw_data_tol.csv')
df_multiclass = pd.read_csv('../data/multiclass_target_raw_data.csv')
H_Best = ['H0_Best', 'H1_Best', 'H2_Best', 'H3_Best', 'H4_Best', 'H5_Best']

# Targets: multilabel indicator columns and the single best-heuristic column.
y_tol = df_multilabel[H_Best]
y_best = df_multiclass['Best Heuristic']

# Feature matrices: everything that is not a target column.
X_tol = df_multilabel.drop(H_Best, axis=1)

X_orig = df_multiclass.drop(H_Best + ['Best Heuristic'], axis=1)
X_orig.head()

Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,D30,D31,D32,D33,D34,D35,D36,D37,D38,D39
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.0,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797


In [4]:
# Display the multilabel target: one indicator column per entry of H_Best.
y_tol

Unnamed: 0,H0_Best,H1_Best,H2_Best,H3_Best,H4_Best,H5_Best
0,1,0,0,0,0,0
1,0,1,1,0,1,1
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
6113,1,0,0,0,0,0
6114,1,0,0,0,0,0
6115,1,0,0,0,0,0
6116,1,0,0,0,0,0


In [5]:
# Same target as a plain NumPy array (one row per sample, columns in H_Best order).
np.array(y_tol)

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1],
       [1, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [7]:
# NOTE(review): the only visible definition of get_unique_combos is further down
# (cell In [18]). On a fresh kernel run top to bottom this cell would raise
# NameError unless an earlier, non-visible cell defines it — consider moving the
# definition above this cell.
unique_tol = get_unique_combos(X_tol, y_tol)

In [8]:
def get_powerset_counts(target, powerset):
    """Count how many rows of ``target`` equal each label combination.

    Parameters
    ----------
    target : array-like of shape (n_samples, n_labels)
        Indicator matrix, one row per sample.
    powerset : dict
        Mapping whose keys identify label combinations. Each key is decoded
        into a list of label indices by ``str_keys_to_list`` (a helper defined
        elsewhere in the notebook).

    Returns
    -------
    dict
        ``{combo_key: count}`` in the iteration order of ``powerset``, where
        ``count`` is the number of rows of ``target`` that are exactly equal
        to the indicator pattern of that combination (a plain ``int``).

    Raises
    ------
    IndexError
        If a decoded label index does not fit in the pattern width.

    Notes
    -----
    The pattern width follows ``target`` instead of being fixed at 6, so the
    function also works for targets with a different number of label columns.
    """
    rows = np.asarray(target)
    is_matrix = rows.ndim == 2
    # Fall back to the historical width of 6 when the target has no 2-D shape
    # to infer it from (e.g. an empty list).
    n_labels = rows.shape[1] if is_matrix else 6

    store_counts = {}
    for combo in powerset.keys():
        pattern = np.zeros(n_labels, dtype=int)
        for i in str_keys_to_list(combo):
            pattern[i] = 1
        if is_matrix:
            # One vectorized comparison per combo instead of a Python loop
            # over every row.
            store_counts[combo] = int((rows == pattern).all(axis=1).sum())
        else:
            # A non 2-D target has no rows that can equal a 1-D pattern.
            store_counts[combo] = 0
    return store_counts


In [9]:
# Tally how often each label combination occurs in the tolerance target, then
# list the combinations from rarest to most frequent.
arr_y_tol = np.array(y_tol)
counts_tol = get_powerset_counts(arr_y_tol, unique_tol)

sorted_counts_tol = OrderedDict(
    sorted(counts_tol.items(), key=lambda item: item[1])
)
counts_tol_df = pd.DataFrame(
    sorted_counts_tol.values(),
    index=sorted_counts_tol.keys(),
    columns=['tolerance'],
)
print(counts_tol_df)

           tolerance
1,2,3,5            7
1,2,5              7
2,5                9
2,3,4             10
3,4               10
1,3,4             12
2,3,5             15
2,4,5             15
1,5               21
1,2,3             22
2,4               25
1,2,3,4           29
1,3,5             35
3,4,5             39
2,3,4,5           41
1,2               45
2,3               48
1,3               54
1,2,4,5           68
1,2,4             76
1,4               85
3,5              103
1,4,5            117
4,5              133
1,3,4,5          145
2                242
4                360
1,2,3,4,5        382
5                421
1                463
3                525
0               2554


In [10]:
# NOTE(review): print_mapping is not defined in any visible cell — presumably it
# comes from a cell missing from this export (TODO confirm). The output below
# lists each combination key of unique_tol next to its integer id.
print_mapping(unique_tol)

0 : 0
1,2,4,5 : 1
3 : 2
4,5 : 3
2,3 : 4
5 : 5
1,3 : 6
1,2,3,4,5 : 7
2 : 8
2,3,5 : 9
1,2,3 : 10
3,5 : 11
2,3,4,5 : 12
1,3,4,5 : 13
1 : 14
4 : 15
1,4,5 : 16
1,3,5 : 17
2,4,5 : 18
3,4,5 : 19
1,4 : 20
1,5 : 21
1,2 : 22
2,4 : 23
1,2,4 : 24
1,3,4 : 25
1,2,3,4 : 26
2,5 : 27
2,3,4 : 28
3,4 : 29
1,2,3,5 : 30
1,2,5 : 31


In [11]:
# Print the raw multilabel target as a NumPy array. The bare expression that
# used to precede this line was dead code: its value was discarded because it
# was not the last statement of the cell.
print(np.array(y_tol))

[[1 0 0 0 0 0]
 [0 1 1 0 1 1]
 [1 0 0 0 0 0]
 ...
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]]


In [12]:
def modify_target_labels(target, powerset):
    """Re-encode a 6-label indicator matrix into an 8-label one.

    For every combination key in ``powerset`` the label indices are decoded
    with ``str_keys_to_list`` (defined elsewhere in the notebook). When a
    combination contains both labels 1 and 2 they are replaced by label 6;
    when it contains both 4 and 5 they are replaced by label 7. Every row of
    ``target`` equal to the combination's 6-wide indicator pattern gets the
    re-encoded labels set to 1 in the output.

    Parameters
    ----------
    target : sequence of rows
        Indicator rows compared against 6-wide patterns.
    powerset : dict
        Mapping whose keys identify the label combinations to look for.

    Returns
    -------
    numpy.ndarray of shape (len(target), 8), dtype int
        Re-encoded indicator matrix. Rows matching no combination stay zero.

    Side effects
    ------------
    Prints the number of (row, combination) matches that were found.
    """
    n_rows = len(target)
    # (first label, second label, label that replaces the pair)
    merge_rules = ((1, 2, 6), (4, 5, 7))
    target_labels_new = np.zeros(shape=(n_rows, 8), dtype=int)
    matched = 0

    for key in powerset.keys():
        original_labels = str_keys_to_list(key)
        # Decoded a second time so the merged list is independent of the original.
        merged_labels = str_keys_to_list(key)
        for first, second, replacement in merge_rules:
            if first in original_labels and second in original_labels:
                merged_labels.remove(first)
                merged_labels.remove(second)
                merged_labels.append(replacement)

        # Indicator pattern of the combination in the original 6-label space.
        pattern = np.zeros(6, dtype=int)
        for label in original_labels:
            pattern[label] = 1

        for row_idx in range(n_rows):
            if not np.array_equal(target[row_idx], pattern):
                continue
            for label in merged_labels:
                target_labels_new[row_idx][label] = 1
            matched += 1

    print(matched)
    return target_labels_new


In [13]:
# Re-encode the target into the 8-column scheme; the call also prints how many
# rows matched a known label combination.
target_labels_new = modify_target_labels(
    target=np.array(y_tol),
    powerset=unique_tol,
)

6118


In [14]:
# Inspect the re-encoded 8-column label matrix.
target_labels_new

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [15]:
# Column names for the 8-label target: 'Class 0' through 'Class 7'.
col_labels = list(map('Class {}'.format, range(8)))
col_labels

['Class 0',
 'Class 1',
 'Class 2',
 'Class 3',
 'Class 4',
 'Class 5',
 'Class 6',
 'Class 7']

In [16]:
# Wrap the re-encoded label matrix in a DataFrame with one named column per class.
target_new = pd.DataFrame(data=target_labels_new, columns=col_labels)

In [17]:
# Inspect the re-encoded target as a DataFrame ('Class 0' ... 'Class 7' columns).
target_new

Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Class 5,Class 6,Class 7
0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
6113,1,0,0,0,0,0,0,0
6114,1,0,0,0,0,0,0,0
6115,1,0,0,0,0,0,0,0
6116,1,0,0,0,0,0,0,0


In [18]:
def get_unique_combos(X, y):
    """Return the label-combination mapping discovered by a LabelPowerset fit.

    A ``LabelPowerset`` transformer wrapping a stratified ``DummyClassifier``
    is fitted on ``(X, y)`` purely to obtain its ``unique_combinations_``
    attribute; no predictions are made with it here.

    Parameters
    ----------
    X : feature matrix passed straight to ``LabelPowerset.fit``.
    y : multilabel indicator target passed straight to ``LabelPowerset.fit``.

    Returns
    -------
    The fitted transformer's ``unique_combinations_``. As printed elsewhere in
    this notebook, it maps comma-joined label-index keys to integer ids.
    """
    powerset_model = LabelPowerset(DummyClassifier(strategy='stratified'))
    powerset_model.fit(X, y)
    return powerset_model.unique_combinations_

In [19]:
# Label combinations that occur in the re-encoded 8-class target.
unique_new = get_unique_combos(X=X_tol, y=target_new)

In [20]:
# NOTE(review): print_mapping is not defined in any visible cell — presumably it
# comes from a cell missing from this export (TODO confirm). The output below
# lists each combination key of unique_new next to its integer id.
print_mapping(unique_new)

0 : 0
6,7 : 1
3 : 2
7 : 3
2,3 : 4
5 : 5
1,3 : 6
3,6,7 : 7
2 : 8
2,3,5 : 9
3,6 : 10
3,5 : 11
2,3,7 : 12
1,3,7 : 13
1 : 14
4 : 15
1,7 : 16
1,3,5 : 17
2,7 : 18
3,7 : 19
1,4 : 20
1,5 : 21
6 : 22
2,4 : 23
4,6 : 24
1,3,4 : 25
3,4,6 : 26
2,5 : 27
2,3,4 : 28
3,4 : 29
3,5,6 : 30
5,6 : 31


In [21]:
def get_powerset_counts_new(target, powerset):
    """Count how many rows of ``target`` equal each label combination.

    Parameters
    ----------
    target : array-like of shape (n_samples, n_labels)
        Indicator matrix, one row per sample.
    powerset : dict
        Mapping whose keys identify label combinations. Each key is decoded
        into a list of label indices by ``str_keys_to_list`` (a helper defined
        elsewhere in the notebook).

    Returns
    -------
    dict
        ``{combo_key: count}`` in the iteration order of ``powerset``, where
        ``count`` is the number of rows of ``target`` that are exactly equal
        to the indicator pattern of that combination (a plain ``int``).

    Raises
    ------
    IndexError
        If a decoded label index does not fit in the pattern width.

    Notes
    -----
    The pattern width follows ``target`` instead of being fixed at 8, so the
    function no longer needs a per-width copy.
    """
    rows = np.asarray(target)
    is_matrix = rows.ndim == 2
    # Fall back to the historical width of 8 when the target has no 2-D shape
    # to infer it from (e.g. an empty list).
    n_labels = rows.shape[1] if is_matrix else 8

    store_counts = {}
    for combo in powerset.keys():
        pattern = np.zeros(n_labels, dtype=int)
        for i in str_keys_to_list(combo):
            pattern[i] = 1
        if is_matrix:
            # One vectorized comparison per combo instead of a Python loop
            # over every row.
            store_counts[combo] = int((rows == pattern).all(axis=1).sum())
        else:
            # A non 2-D target has no rows that can equal a 1-D pattern.
            store_counts[combo] = 0
    return store_counts

In [22]:
# Tally how often each combination occurs in the re-encoded target and show
# the combinations from rarest to most frequent.
counts_new = get_powerset_counts_new(np.array(target_new), unique_new)
sorted_counts_new = OrderedDict(
    sorted(counts_new.items(), key=lambda item: item[1])
)

counts_new_df = pd.DataFrame(
    sorted_counts_new.values(),
    index=sorted_counts_new.keys(),
    columns=['tol_new'],
)
counts_new_df

Unnamed: 0,tol_new
356,7
56,7
25,9
234,10
34,10
134,12
235,15
27,15
15,21
36,22


In [23]:
# Attach the 8 class-indicator columns to the feature matrix, aligning on row index.
df_merged = pd.merge(
    X_tol,
    target_new,
    how='inner',
    left_index=True,
    right_index=True,
)
df_merged


Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,D38,D39,Class 0,Class 1,Class 2,Class 3,Class 4,Class 5,Class 6,Class 7
0,0.833070,0.99682,0.833070,0.767890,0,0.769480,0.069952,0.16057,6,1.2734,...,0.073308,0.18797,1,0,0,0,0,0,0,0
1,0.833070,0.99682,0.833070,0.769480,0,0.771070,0.068363,0.16057,6,1.2734,...,0.067669,0.18797,0,0,0,0,0,0,1,1
2,0.833070,0.99682,0.833070,0.767890,0,0.769480,0.069952,0.16057,6,1.2734,...,0.069549,0.18797,1,0,0,0,0,0,0,0
3,0.833070,0.99682,0.833070,0.767890,0,0.769480,0.069952,0.16057,6,1.2734,...,0.080827,0.18797,1,0,0,0,0,0,0,0
4,0.833070,0.99682,0.833070,0.767890,0,0.769480,0.069952,0.16057,6,1.2734,...,0.078947,0.18797,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6113,0.008403,0.77311,0.058824,0.008403,0,0.025210,0.042017,0.93277,8,3.1597,...,0.008772,0.90351,1,0,0,0,0,0,0,0
6114,0.017391,0.77391,0.052174,0.017391,0,0.043478,0.043478,0.91304,8,3.0783,...,0.000000,0.90826,1,0,0,0,0,0,0,0
6115,0.026786,0.79464,0.026786,0.017857,0,0.035714,0.044643,0.91964,8,3.0268,...,0.000000,0.91304,1,0,0,0,0,0,0,0
6116,0.017857,0.79464,0.017857,0.008929,0,0.026786,0.044643,0.92857,8,3.0357,...,0.000000,0.91379,1,0,0,0,0,0,0,0


In [24]:
# Persist features plus re-encoded class columns. index=False (the documented
# boolean form, instead of the falsy None) keeps the row index out of the CSV.
df_merged.to_csv('../data/multilabel_modified_classes.csv', index=False)

In [25]:
# Parked code: a second relabelling round kept inside a string literal so that it
# never executes; only the name `not_using` is bound.
# NOTE(review): this is dead code — consider deleting it and relying on version
# control history. If it is ever revived, note that it redefines
# get_powerset_counts_new and overwrites the same output CSV as the cell above.
not_using = """def modify_target_labels_round2(target, powerset):
    N = len(target)
    target_labels_new = np.zeros(shape=(N,9), dtype=int)
    count = 0
    for combo in powerset.keys():
        int_list = str_keys_to_list(combo)
        int_list_copy = str_keys_to_list(combo)
        if 3 in int_list and 7 in int_list:
            int_list_copy.remove(3)
            int_list_copy.remove(7)
            int_list_copy.append(8)
        pattern = np.zeros(8,dtype=int)
        for i in int_list:
            pattern[i] = 1
        for a in range(N):
            if np.array_equal(target[a], pattern):
                for j in int_list_copy:
                    target_labels_new[a][j] = 1
                count += 1
                
    print(count)
    return target_labels_new

col_labels = ['Class ' + str(i) for i in range(9)]
print(col_labels)

target_new2 = modify_target_labels_round2(target_labels_new, unique_new)
target_new2_df = pd.DataFrame(target_new2, columns=col_labels)
print(target_new2_df.head())
unique_new2 = get_unique_combos(X_tol, target_new2_df)

def get_powerset_counts_new(target, powerset):
    store_counts = {combo: 0 for combo in powerset.keys()}
    for combo in powerset.keys():
        int_list = str_keys_to_list(combo)
        pattern = np.zeros(9,dtype=int)
        for i in int_list:
            pattern[i] = 1
        for a in range(len(target)):
            if np.array_equal(target[a], pattern):
                store_counts[combo] += 1
    return store_counts

counts_new2 = get_powerset_counts_new(target_new2, unique_new2)
sorted_counts_new2 = OrderedDict(sorted(counts_new2.items(), key= lambda k: k[1]))

print(sorted_counts_new2)

counts_new_df2 = pd.DataFrame(sorted_counts_new2.values(),index=sorted_counts_new2.keys(), columns=['tol_new'])
counts_new_df2

df_merged = X_tol.merge(target_new2_df, left_index=True, right_index=True)
print(df_merged.head())

df_merged.to_csv('../data/multilabel_modified_classes.csv', index=None)"""