In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from skmultilearn.adapt import MLkNN
from skmultilearn.model_selection import IterativeStratification
from skmultilearn.dataset import available_data_sets
from skmultilearn.dataset import load_dataset

import time

import warnings
warnings.filterwarnings('ignore')

#  Particle.py

In [2]:
class Particle:
    """A single PSO particle: a position/velocity pair plus its best-known memories.

    Parameters
    ----------
    length : int
        Dimensionality of the search space.
    pos_max, pos_min : float
        Upper / lower bound applied to every position component.
    vel_max, vel_min : float
        Upper / lower bound applied to every velocity component.
    w, c1, c2 : float
        Inertia weight, cognitive (pbest) coefficient and social (gbest) coefficient.
    problem : object
        Must provide ``worst_fitness()``; its return value seeds all fitness fields.
    """

    def __init__(self, length, pos_max, pos_min, vel_max, vel_min, w, c1, c2, problem):
        self.length = length
        self.pos_max = pos_max
        self.pos_min = pos_min
        self.vel_max = vel_max
        self.vel_min = vel_min
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.problem = problem

        # Sample uniformly inside [pos_min, pos_max). The pos_min offset is required:
        # without it the particle starts in [0, pos_max - pos_min), which lies outside
        # the search box whenever pos_min != 0.
        self.position = pos_min + np.random.rand(length) * (pos_max - pos_min)
        self.velocity = np.zeros(length)
        self.fitness = self.problem.worst_fitness()

        # Personal best, filled in by the swarm after the first evaluation.
        self.pbest_pos = np.zeros(length)
        self.pbest_fit = self.problem.worst_fitness()

        # Local copy of the swarm-wide best, pushed in by the swarm.
        self.gbest_pos = np.zeros(length)
        self.gbest_fit = self.problem.worst_fitness()

    def update(self):
        """Advance one PSO step: update the velocity, then the position, clamping both."""
        # Two independent random vectors, drawn in the same order as the update formula.
        cognitive = self.c1 * np.random.rand(self.length) * (self.pbest_pos - self.position)
        social = self.c2 * np.random.rand(self.length) * (self.gbest_pos - self.position)
        self.velocity = np.clip(self.w * self.velocity + cognitive + social,
                                self.vel_min, self.vel_max)

        # Move, then keep the particle inside the search box.
        self.position = np.clip(self.position + self.velocity,
                                self.pos_min, self.pos_max)

In [3]:
class Swarm:
    """A population of ``Particle`` objects optimised with global-best PSO.

    Parameters
    ----------
    n_particles : int
        Number of particles; must be at least 1.
    length, pos_max, pos_min, vel_max, vel_min
        Forwarded unchanged to every ``Particle``.
    problem : object
        Must provide ``fitness(position)`` and ``is_better(a, b)`` (plus whatever
        ``Particle`` needs from it).
    n_iterations : int
        Number of evaluate/update rounds performed by ``iterate``.
    w, c1, c2 : float, optional
        Inertia weight and cognitive / social coefficients. The defaults are the
        values that used to be hard-coded here.

    Raises
    ------
    ValueError
        If ``n_particles`` is smaller than 1.
    """

    def __init__(self, n_particles, length, pos_max, pos_min, vel_max, vel_min, problem, n_iterations,
                 w=0.8, c1=1.46, c2=1.46):
        if n_particles < 1:
            # iterate() reads population[0]; fail here with a clear message instead.
            raise ValueError('n_particles must be at least 1')

        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.problem = problem

        self.population = [Particle(length = length,
                                    pos_max = pos_max, pos_min = pos_min,
                                    vel_max = vel_max, vel_min = vel_min,
                                    w = w, c1 = c1, c2 = c2, problem = problem)
                           for _ in range(n_particles)]

    def iterate(self):
        """Run ``n_iterations`` PSO rounds.

        Returns
        -------
        tuple
            ``(gbest_pos, gbest_fit)`` as stored on the first particle (every
            particle holds the same copy of the global best).
        """
        for i in range(self.n_iterations):

            print('Iterate ', i, end = '  ')
            # All particles share the same gbest, so particle 0 is representative.
            gbest_fit = self.population[0].gbest_fit
            gbest_index = 0
            gbest_updated = False
            print('gbest value is ', gbest_fit)

            for index, particle in enumerate(self.population):
                # Evaluate each particle, update pbest
                particle.fitness = self.problem.fitness(particle.position)

                if self.problem.is_better(particle.fitness, particle.pbest_fit):
                    particle.pbest_fit = particle.fitness
                    particle.pbest_pos = np.copy(particle.position)

                # Track the best pbest seen in this round.
                if self.problem.is_better(particle.pbest_fit, gbest_fit):
                    gbest_fit = particle.pbest_fit
                    gbest_index = index
                    gbest_updated = True

            if gbest_updated:
                # Broadcast the new global best; each particle gets its own array copy.
                leader = self.population[gbest_index]
                for particle in self.population:
                    particle.gbest_fit = leader.pbest_fit
                    particle.gbest_pos = np.copy(leader.pbest_pos)

            # now update particle position:
            for particle in self.population:
                particle.update()

        return self.population[0].gbest_pos, self.population[0].gbest_fit

#  Problem.py

In [4]:
class Problem:
    """Base optimisation problem: holds the search direction and compares fitness values."""

    def __init__(self, minimize):
        # True -> smaller fitness wins; False -> larger fitness wins.
        self.minimize = minimize

    def fitness(self, solution):
        """Placeholder objective; this base implementation always returns 1."""
        return 1

    def is_better(self, first, second):
        """Return whether ``first`` is strictly better than ``second`` for this direction."""
        return first < second if self.minimize else first > second

    def worst_fitness(self):
        """Return the worst possible fitness: +inf when minimising, -inf otherwise."""
        return float('inf' if self.minimize else '-inf')

In [5]:
class FS(Problem):
    """Single-label feature-selection problem scored by KNN accuracy on a random hold-out split.

    Parameters
    ----------
    minimize : bool
        Search direction. The fitness is an accuracy, so callers normally pass False.
    X : ndarray
        Feature matrix, indexed as ``X[:, columns]``.
    y : array-like
        Targets, passed to ``train_test_split`` together with ``X``.
    """

    def __init__(self, minimize, X, y):
        # Delegate to the base class. The previous `self.minimize = minimize,` stored a
        # 1-tuple, which is always truthy, so the search direction was silently forced
        # to "minimise" whatever the caller passed.
        super().__init__(minimize)
        self.X = X
        self.y = y
        self.threshold = 0.6   # a feature is selected when its position component exceeds this

    def fitness(self, solution):
        """Return the hold-out accuracy of KNN on the features selected by ``solution``.

        Returns ``worst_fitness()`` when no component of ``solution`` exceeds the threshold.
        """
        feature_selected = np.where(solution > self.threshold)[0]
        if len(feature_selected) == 0:
            return self.worst_fitness()

        X = self.X[:, feature_selected]
        X_train, X_test, y_train, y_test = train_test_split(X, self.y)

        # Fit the scaler on the training part only and reuse its statistics for the test
        # part; refitting on the test part would scale the two splits inconsistently.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        clf = KNN()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        return accuracy_score(y_test, y_pred)

In [6]:
# Standard PSO
# Fitness is MLKNN classification hamming loss.

class FS_ML(Problem):
    """Multi-label feature-selection problem scored by the mean MLkNN hamming loss over 5 folds.

    Parameters
    ----------
    minimize : bool
        Search direction. The fitness is a loss, so callers normally pass True.
    X : ndarray
        Feature matrix, indexed as ``X[:, columns]`` and ``X[row_indices]``.
    y : ndarray
        Label matrix, indexed as ``y[row_indices]``.
    """

    def __init__(self, minimize, X, y):
        # Delegate to the base class. The previous `self.minimize = minimize,` stored a
        # 1-tuple, which is always truthy regardless of the requested direction.
        super().__init__(minimize)
        self.X = X
        self.y = y
        self.threshold = 0.6   # a feature is selected when its position component exceeds this

    def fitness(self, solution):
        """Return the hamming loss averaged over the stratified folds.

        Returns ``worst_fitness()`` when no component of ``solution`` exceeds the threshold.
        """
        feature_selected = np.where(solution > self.threshold)[0]
        if len(feature_selected) == 0:
            return self.worst_fitness()

        X = self.X[:, feature_selected]
        y = self.y

        n_splits = 5
        clf = MLkNN(k=3)
        scaler = StandardScaler()
        k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state=42)

        hamming_losses = 0
        for train_idx, test_idx in k_fold.split(X, y):

            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Scale with training-fold statistics only; refitting the scaler on the
            # held-out fold would scale the two parts inconsistently.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            hamming_losses += hamming_loss(y_test, y_pred)

        return hamming_losses/n_splits

In [7]:
# Super label PSO
# Fitness is use super+sub classification hamming loss

class FS_ML_super(Problem):
    """Multi-label feature-selection problem scored by the super/sub-label pipeline.

    The fitness is the hamming loss of ``calc_preds`` averaged over 5 stratified folds.

    Parameters
    ----------
    minimize : bool
        Search direction. The fitness is a loss, so callers normally pass True.
    X : ndarray
        Feature matrix, indexed as ``X[:, columns]`` and ``X[row_indices]``.
    y : ndarray
        Label matrix, indexed as ``y[row_indices]``.
    no_cls : int, optional
        Number of label clusters (super labels) forwarded to ``calc_preds``.
        Defaults to 2.
    """

    def __init__(self, minimize, X, y, no_cls=2):
        # Delegate to the base class. The previous `self.minimize = minimize,` stored a
        # 1-tuple, which is always truthy regardless of the requested direction.
        super().__init__(minimize)
        self.X = X
        self.y = y
        self.no_cls = no_cls
        self.threshold = 0.6   # a feature is selected when its position component exceeds this

    def fitness(self, solution):
        """Return the hamming loss averaged over the stratified folds.

        Returns ``worst_fitness()`` when no component of ``solution`` exceeds the threshold.
        """
        feature_selected = np.where(solution > self.threshold)[0]
        if len(feature_selected) == 0:
            return self.worst_fitness()

        X = self.X[:, feature_selected]
        y = self.y

        n_splits = 5
        k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state=42)

        hamming_losses = 0
        for train_idx, test_idx in k_fold.split(X, y):

            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train_df, X_test_df, y_train_df, y_test_df = Convert_to_df(X_train, X_test, y_train, y_test)
            # calc_preds requires the cluster count; omitting it raised a TypeError.
            y_test_pred, y_test = calc_preds(X_train_df, y_train_df, X_test_df, y_test_df, self.no_cls)

            hamming_losses += hamming_loss(y_test, y_test_pred)

        return hamming_losses/n_splits

# Super_labels.py

In [8]:
# For each subgroup (cluster), get the indexes of the labels that belong to it.

def ClusterIndicesNumpy(clustNum, labels_array): #numpy 
    """Return the positions in ``labels_array`` whose entry equals ``clustNum``."""
    membership_mask = labels_array == clustNum
    matching_positions = np.where(membership_mask)
    # np.where returns one index array per dimension; only the first is used.
    return matching_positions[0]

In [9]:
# Convert each subgroup's labels(binary nparray) into super label(list)
# If all labels are 0, super label is 0; otherwise, super label is assigned to 1

def convert(subgroup_label):
    """Collapse a subgroup's label columns into one super label per row.

    Parameters
    ----------
    subgroup_label : pandas.DataFrame
        One row per instance, one column per original label.

    Returns
    -------
    list of int
        1 for every row that contains at least one entry equal to 1, otherwise 0.
    """
    label_matrix = subgroup_label.to_numpy()
    # Vectorised form of "does any label in this row equal 1", replacing the
    # per-cell Python loop.
    return (label_matrix == 1).any(axis=1).astype(int).tolist()

In [10]:
# Convert original y labels into y_super labels
# subgroup_labels are dataframe with original labels + super labels

def label_convert(y_train_, no_cls):  # Here y is dataframe
    """Cluster the label columns with KMeans and derive one super label per cluster.

    Parameters
    ----------
    y_train_ : pandas.DataFrame
        Label matrix, one column per original label.
    no_cls : int
        Number of clusters, i.e. number of super labels.

    Returns
    -------
    y_s : ndarray
        Super-label matrix with one column per cluster (column order = cluster index).
    subgroups : list of pandas.DataFrame
        Per cluster: the original label columns of that cluster plus a trailing
        super-label column named ``'s<i>'``.
    dict_clst_col : dict
        Maps cluster index -> positions of the label columns assigned to it.
    """
    subgroups = []
    super_labels = []
    # Labels are clustered, not instances, hence the transpose.
    kmeans = KMeans(n_clusters=no_cls, random_state=0).fit(y_train_.T)
    dict_clst_col = dict()  # dictionary to record key(cluster index) and value(cluster columns)

    for i in range(no_cls):
        super_name = 's' + str(i)
        cluster = ClusterIndicesNumpy(i, kmeans.labels_)  # Column numbers(indexes) of all the labels in each cluster
        dict_clst_col[i] = cluster

        # Take an explicit copy: adding a column to a bare .iloc slice is chained
        # assignment and can touch or warn about the parent frame.
        subgroup_label = y_train_.iloc[:, cluster].copy()
        super_values = convert(subgroup_label)   # one 0/1 value per row
        super_labels.append(pd.DataFrame(super_values, columns = [super_name]))

        # Assign the plain list so the values land positionally. Assigning the
        # RangeIndex-ed frame instead is index-aligned and yields NaN whenever
        # y_train_ does not carry a default 0..n-1 index.
        subgroup_label[super_name] = super_values
        subgroups.append(subgroup_label)

    # Combine all super label columns: this is the original y converted to super labels.
    y_s = pd.concat(super_labels, axis=1).to_numpy()

    return y_s, subgroups, dict_clst_col

In [11]:
# After the split, the indexes of X_train, X_test, y_train and y_test are out of order, i.e. no longer ascending.
# Disordered indexes make index-based processing (e.g. zero_idx) tricky,
# so the indexes need to be reset first.

def convert_index(X_train, X_test, y_train, y_test):
    """Return the four frames with their index replaced by a fresh 0..n-1 range.

    The old index is dropped, not kept as a column; the inputs are left untouched.
    """
    renumbered = [frame.reset_index(drop=True)
                  for frame in (X_train, X_test, y_train, y_test)]
    return tuple(renumbered)

When doing super- and sub-classification on the training set, k-fold is not necessary. 
Only the trained classifiers are required.

In [12]:
# Get trained super_classifier

def super_classifier(X_train_, y_train_, no_cls):
    """Train and return the MLkNN model that predicts the super labels.

    The features are standardised with a scaler fitted on ``X_train_``; the targets
    are the super labels that ``label_convert`` derives from ``y_train_``.
    """
    X_scaled = StandardScaler().fit_transform(X_train_)
    # Only the super-label matrix is needed here; subgroups and column map are ignored.
    y_super, _, _ = label_convert(y_train_, no_cls)

    model = MLkNN(k=3)
    model.fit(X_scaled, y_super)
    return model

In [13]:
# For each subgroup, collect all the zero super labels
# The len of total_zeros is the number of subgroups, also the number of super labels

def zeros(y_s):
    """For every column of ``y_s``, list the row positions whose entry equals 0.

    Returns a list with one entry per column (super label); each entry is the
    ascending list of row indexes (instances) that are zero in that column.
    """
    n_rows, n_cols = y_s.shape[0], y_s.shape[1]
    return [
        [row for row in range(n_rows) if y_s[row][col] == 0]
        for col in range(n_cols)
    ]

In [14]:
# In each subgroup, if a particular row of y_s is zero, the corresponding X features also needs to be removed.
# For each subgroup, get the indexes of zeros in one y_s, and remove these same indexes from X feature instances.
# Each subgroup contains its own X, means different subgroup contain diffenrent number of instances
# Collect each removed X and return.

def remove_zeros(X, y_s):  # y_s is ndarray
    """Build one copy of ``X`` per super label, without the rows where that label is 0.

    Rows are dropped by index label, using the row positions reported by ``zeros``.
    """
    return [pd.DataFrame(X).drop(zero_rows) for zero_rows in zeros(y_s)]

From the original X and y, compute the super-label matrix y_s, which is derived from the true y.
Each subgroup contains its original y labels plus its y_s label.
Check each y_s: if it is 0, then remove the whole row, i.e. remove its corresponding original labels and its X.
So the remaining original labels, as well as the X of each subgroup, are different, since the indexes of the zeros in each super label are different.

def sub_classifiers trains, for each subgroup, on X_ (X with the zero indexes of y_s removed) and y_ (the subgroup's original labels with the zero indexes of y_s removed).
After training, collect all the sub-classifiers (clfs).

In [15]:
# Get trained sub classifiers

def sub_classifiers(X_train_, y_train_, no_cls):
    """Train one MLkNN model per label cluster and return them in cluster order.

    Each model is fitted only on the instances whose super label for that cluster
    is non-zero, using the cluster's original label columns as targets.
    """
    # y_super: super labels derived from the original y labels.
    y_super, subgroups, _ = label_convert(y_train_, no_cls)
    zero_rows_per_group = zeros(y_super)
    scaler = StandardScaler()
    # Every subgroup gets its own X because the zero rows differ between super labels.
    X_per_group = remove_zeros(X_train_, y_super)

    def _fit_group(subgroup, zero_rows, X_group):
        # Drop the zero instances from y, then strip the trailing super-label column.
        y_group = subgroup.drop(zero_rows)
        y_group = y_group.drop(y_group.columns[-1:], axis = 1)

        model = MLkNN(k=3)
        model.fit(scaler.fit_transform(X_group), y_group.to_numpy())
        return model

    return [_fit_group(subgroup, zero_rows, X_group)
            for subgroup, zero_rows, X_group in zip(subgroups, zero_rows_per_group, X_per_group)]

Now already gained clf, which is classifier for super classification, and clfs which are for all the sub-classifications.
Then will apply clf and clfs on training set, to see the training_loss, and then apply on test set, to get test_loss.
Finally, compare.

1. Apply clf, clfs, Xs on training set.

In [16]:
# Do super classification on dataset

def super_classification(clf_super, X_test_):
    """Predict super labels for ``X_test_`` and return them as a dense array.

    The result is what ``zeros`` consumes downstream.
    """
    # NOTE(review): the scaler is fitted on the data being predicted, not on the data
    # the classifier was trained with - confirm this is intended.
    X_scaled = StandardScaler().fit_transform(X_test_)
    sparse_prediction = clf_super.predict(X_scaled)
    return sparse_prediction.toarray()

In [17]:
# Do sub-classification on sub-datasets (original X + subgroup original labels)

def sub_classification(clfs, X_test_, y_test_s_pred):
    """Run every sub-classifier on the instances whose predicted super label is non-zero.

    Returns
    -------
    total_test_zeros : list of list of int
        Per super label, the row positions predicted as 0 (skipped by the sub-classifier).
    y_test_sub_preds : list
        Per super label, the sub-classifier's prediction for the remaining rows.
    """
    # Rows predicted as zero for a super label are excluded from that subgroup's X.
    total_test_zeros = zeros(y_test_s_pred)
    X_per_group = remove_zeros(X_test_, y_test_s_pred)

    scaler = StandardScaler()
    y_test_sub_preds = [
        model.predict(scaler.fit_transform(X_group))
        for model, X_group in zip(clfs, X_per_group)
    ]

    return total_test_zeros, y_test_sub_preds    # both are lists

After super- and sub-classification are done on the test/validation dataset, the next step is to revert and rebuild all the predicted sub-labels together. 
The predicted subgroups do not contain all the original instances, because the all-zero instances are removed before sub-classification. So when reverting, we need to find out which instances are all-zero (those whose predicted super label is zero); these can be reverted to [0,0,0,...].
If the predicted super label is not 0, then this instance's subgroup labels are in the corresponding y_sub_pred. 

len(total_zeros) is number of subgroups, aka number of columns for super_y_labels
for each subgroup, y.shape[0] is the rows, aka instances in original y, 
if index of the instance is included in column in total_zeros, that means when revert to original labels, we can impute all the subgroup labels of this instance to all zeros

How to impute the zero super label's corresponding sub-labels to zeros? We need to know how many sub-labels in each cluster. That is how many labels in each y_sub_pred

In [18]:
# Fill out zeros back into each y_sub_pred

def fill_zeros(total_test_zeros, y_test_sub_preds, y_test_):
    """Re-insert all-zero rows into every subgroup prediction.

    Parameters
    ----------
    total_test_zeros : list of list of int
        Per subgroup, the row positions whose predicted super label was 0.
    y_test_sub_preds : list
        Per subgroup, a sparse prediction (anything with ``.toarray()``) holding one
        row for each instance that was NOT listed in ``total_test_zeros``, in order.
    y_test_ : array-like
        Only ``y_test_.shape[0]`` (the total number of instances) is used.

    Returns
    -------
    list of list of ndarray
        Per subgroup, one 1-D label row per original instance.

    Raises
    ------
    ValueError
        If a prediction's row count does not match the number of non-zero instances.
    """
    n_rows = y_test_.shape[0]
    total_filled_preds = []

    for zero_rows, sub_pred in zip(total_test_zeros, y_test_sub_preds):
        predicted = sub_pred.toarray()

        # Mark the rows that actually went through the sub-classifier.
        was_predicted = np.ones(n_rows, dtype=bool)
        was_predicted[np.asarray(zero_rows, dtype=np.intp)] = False

        n_expected = int(was_predicted.sum())
        if predicted.shape[0] != n_expected:
            raise ValueError('sub-prediction has %d rows but %d instances were not zeroed'
                             % (predicted.shape[0], n_expected))

        # One scatter instead of deleting the head of the array for every row,
        # which copied the whole array each time.
        filled = np.zeros((n_rows, predicted.shape[1]),
                          dtype=np.result_type(predicted.dtype, np.int64))
        filled[was_predicted] = predicted
        total_filled_preds.append(list(filled))

    return total_filled_preds

In [19]:
# Sort the columns of y_test_preds, i.e. revert to the original column order of y_test (before clustering).
# Clusters are not necessarily equal in size, so each entry of total_filled_preds is converted to np.array separately.
# If cluster 0 has 2 labels and another cluster has 4, a single conversion causes a passed-value index issue.
# Reference to backup_0113 version.

def sort_preds(total_filled_preds, y_train_, no_cls):
    """Assemble the per-cluster predictions into one frame with columns in ascending order.

    The cluster -> column mapping is recomputed with ``label_convert`` on the
    training labels; each cluster's block is labelled with those column positions.
    """
    _, _, dict_clst_col = label_convert(y_train_, no_cls)

    per_cluster_frames = [
        pd.DataFrame(np.array(filled), columns = dict_clst_col[cluster_id])
        for cluster_id, filled in enumerate(total_filled_preds)
    ]
    y_test_pred = pd.concat(per_cluster_frames, axis = 1)

    # Sort by column label: transpose, sort the (now row) index, transpose back.
    return y_test_pred.T.sort_index(ascending=True).T

In [20]:
# Based on splited X_train, y_train, calculate the y_test_pred on x_test

def calc_preds(X_train, y_train, X_test, y_test, no_cls):
    """Train the super/sub-label pipeline on the training split and predict the test split.

    Returns
    -------
    tuple
        ``(predicted test labels with columns in sorted order, y_test with a reset index)``.
    """
    # Same effect as convert_index: give every frame a fresh 0..n-1 index.
    train_X = X_train.reset_index(drop=True)
    train_y = y_train.reset_index(drop=True)
    test_X = X_test.reset_index(drop=True)
    test_y = y_test.reset_index(drop=True)

    # Fit both stages on the training split.
    super_model = super_classifier(train_X, train_y, no_cls)
    sub_models = sub_classifiers(train_X, train_y, no_cls)

    # Predict super labels, then sub labels for the instances that survived.
    super_pred = super_classification(super_model, test_X)
    zero_rows, sub_preds = sub_classification(sub_models, test_X, super_pred)

    # Restore the skipped rows and the original column order.
    filled = fill_zeros(zero_rows, sub_preds, test_y)
    return sort_preds(filled, train_y, no_cls), test_y

#  Util.py

In [21]:
# K-fold only accept X, y in numpy form, and return X_train, X_test, y_train, y_test in each loop(split).
# Convert X_train, X_test, y_train, y_test from numpy to dataframe, for super_label calculation call.

def Convert_to_df(X_train, X_test, y_train, y_test):
    """Wrap each of the four inputs in a ``pandas.DataFrame``, preserving their order."""
    X_train_df, X_test_df, y_train_df, y_test_df = map(
        pd.DataFrame, (X_train, X_test, y_train, y_test))
    return X_train_df, X_test_df, y_train_df, y_test_df

In [25]:
# # Experimental function.
# # To find out if cluster is not equal, what will happen.

# def get_not_three(X, y):
#     for i in range(5000):
#         X_train, X_test, y_train, y_test = train_test_split(X, y)
#         X_train_, X_test_, y_train_, y_test_ = convert_index(X_train, X_test, y_train, y_test)
#         kmeans = KMeans(n_clusters=2).fit(y_train_.T)
#         cluster = ClusterIndicesNumpy(0, kmeans.labels_)
#         if len(cluster) == 3:
#             print(i, cluster)
#             return X_train_, X_test_, y_train_, y_test_, kmeans
#     print('No cluster equals to one.')

In [36]:
# # Read arff file.

# def read_arff(file):
#     with open(file, encoding="utf-8") as f:
#         header = []
#         for line in f:
#             if line.startswith("@attribute"):
#                 header.append(line.split()[1])
#             elif line.startswith("@data"):
#                 break
#         df = pd.read_csv(f, header=None)
#         df.columns = header
#     return df

In [23]:
# pip install scipy

In [24]:
# from scipy.io import arff
# import pandas as pd

# data = arff.loadarff('datasets/emotions.arff')
# df = pd.DataFrame(data[0])
# df

# Main.py

In [22]:
# Unique first elements of the keys reported by available_data_sets().
{entry[0] for entry in available_data_sets()}

{'Corel5k',
 'bibtex',
 'birds',
 'delicious',
 'emotions',
 'enron',
 'genbase',
 'mediamill',
 'medical',
 'rcv1subset1',
 'rcv1subset2',
 'rcv1subset3',
 'rcv1subset4',
 'rcv1subset5',
 'scene',
 'tmc2007_500',
 'yeast'}

In [23]:
# Dataset names used to build the ARFF paths and record-file names in the main cells.
datasets = ['birds-train', 'CAL500', 'emotions', 'enron', 'flags', 'medical', 'scene', 'yeast', 'Corel5k', 'mediamill']
# locations[i] is the column index at which datasets[i] is split: columns before it
# are used as X, columns from it onwards as y (see the main cells).
locations = [260, 68, 72, 1001, 19, 1449, 294, 103, 499, 120]
# Presumably the number of label columns per dataset, parallel to `datasets` -
# not referenced by the visible cells; TODO confirm.
n_labels = [19, 174, 6, 53, 7, 45, 6, 14, 374, 101]
# Presumably the full grid of cluster counts (no_cls) to try - TODO confirm.
no_clses = [2, 4, 6, 8, 10]

In [24]:
# Dataset names grouped into size buckets; datasets_medium[0] is what the
# full-feature comparison cell passes to load_dataset.
datasets_small = ['emotions', 'scene']
datasets_medium = ['yeast', 'birds', 'genbase']
datasets_large = ['medical', 'enron']
datasets_huge = ['mediamill', 'bibtex', 'Corel5k']
# Cluster counts (no_cls values) per bucket; no_clses_medium is the grid iterated
# in the full-feature comparison cell.
no_clses_small = [2]
no_clses_medium = [2, 4, 6]
no_clses_large = [2, 4, 6, 8, 10]
no_clses_huge = [2, 4, 6, 8, 10]

Standard classification with full features vs Super classification with full features. 

In [28]:
# Main entry
# Compares plain MLkNN with the super/sub-label pipeline, both on the full feature set,
# over 5 stratified folds, and writes the per-fold and average hamming losses to a file.

X, y, feature_names, label_names = load_dataset(datasets_medium[0], 'undivided')
X = pd.DataFrame.sparse.from_spmatrix(X).to_numpy()
y = pd.DataFrame.sparse.from_spmatrix(y).to_numpy()

n_splits = 5
k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state=42)

standard_hams = []
super_hams = []    # exactly one {no_cls: hamming loss} dict per fold
f_ratios = []
to_print = ''
fold_count = 0

for train_idx, test_idx in k_fold.split(X, y):
    fold_count += 1

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # get super classification acc(hl)
    # The frames do not depend on no_cls, so they are built once per fold.
    X_train_df, X_test_df, y_train_df, y_test_df = Convert_to_df(X_train, X_test, y_train, y_test)
    dict_cls_ham = dict()   # no_cls:super_ham dictionary
    for no_cls in no_clses_medium:
        y_test_pred_super, y_test_super = calc_preds(X_train_df, y_train_df, X_test_df, y_test_df, no_cls)
        dict_cls_ham[no_cls] = hamming_loss(y_test_super, y_test_pred_super)
    # Append once per fold. Appending inside the cluster loop stored the same dict
    # len(no_clses_medium) times, so super_hams[k] no longer meant "fold k" and the
    # averages below mixed up the folds.
    super_hams.append(dict_cls_ham)

    # get standard classficaition acc(hl)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # Reuse the training statistics; refitting on the test fold scales it inconsistently.
    X_test_scaled = scaler.transform(X_test)

    # The baseline is scored on the scaled features. The earlier unscaled refit
    # overwrote this prediction, which made the scaled fit dead code and compared
    # an unscaled baseline with a scaled super pipeline.
    clf = MLkNN(k=3)
    clf.fit(X_train_scaled, y_train)
    y_test_pred = clf.predict(X_test_scaled)
    standard_ham = hamming_loss(y_test, y_test_pred)
    standard_hams.append(standard_ham)

    # to write the results
    to_print += '--------------Fold %d----------------\n' % fold_count
    to_print += 'Standard classification with full feature hamming loss: %.4f\n' % standard_ham
    for no_cls in no_clses_medium:
        to_print += 'Super classification with full feature hamming loss with %d clusters: %.4f\n' % (no_cls, dict_cls_ham[no_cls])

# NOTE: the "Accuracy" wording below is kept for compatibility with existing record
# files; the reported values are hamming losses.
to_print += '--------------Average----------------\n'
to_print += 'Ave Standard Classification Accuracy: %.4f\n' % np.average(standard_hams)
for no_cls in no_clses_medium:
    fold_losses = [fold_hams[no_cls] for fold_hams in super_hams]
    to_print += 'Ave Super Classification with %d clusters: %.4f\n' % (no_cls, np.average(fold_losses))

# The context manager closes the file even if the write fails.
with open('records/record_' + datasets_medium[0] + '_full_standard_super_clf.txt', 'w') as f:
    f.write(to_print)

yeast:undivided - does not exists downloading
Downloaded yeast-undivided


In [16]:
# data = read_arff('datasets/' + datasets[6] + '.arff')
# data

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Att291,Att292,Att293,Att294,Beach,Sunset,FallFoliage,Field,Mountain,Urban
0,0.646467,0.666435,0.685047,0.699053,0.652746,0.407864,0.150309,0.535193,0.555689,0.580782,...,0.157332,0.247298,0.014025,0.029709,1,0,0,0,1,0
1,0.770156,0.767255,0.761053,0.745630,0.742231,0.688086,0.708416,0.757351,0.760633,0.740314,...,0.251454,0.137833,0.082672,0.036320,1,0,0,0,0,1
2,0.793984,0.772096,0.761820,0.762213,0.740569,0.734361,0.722677,0.849128,0.839607,0.812746,...,0.017166,0.051125,0.112506,0.083924,1,0,0,0,0,0
3,0.938563,0.949260,0.955621,0.966743,0.968649,0.869619,0.696925,0.953460,0.959631,0.966320,...,0.019267,0.031290,0.049780,0.090959,1,0,0,0,0,0
4,0.512130,0.524684,0.520020,0.504467,0.471209,0.417654,0.364292,0.562266,0.588592,0.584449,...,0.198151,0.238796,0.164270,0.184290,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,0.875782,0.901653,0.926227,0.721366,0.795826,0.867642,0.794125,0.899067,0.908963,0.895336,...,0.215147,0.279607,0.254413,0.134350,0,0,0,0,0,1
2403,0.657706,0.669877,0.692338,0.713920,0.727374,0.750354,0.684372,0.718770,0.719916,0.730645,...,0.217201,0.199491,0.048747,0.041638,0,0,0,0,0,1
2404,0.952281,0.944987,0.905556,0.836604,0.875916,0.957034,0.953938,0.967956,0.819636,0.707311,...,0.028002,0.031900,0.017547,0.019734,0,0,0,0,0,1
2405,0.883990,0.899004,0.901019,0.904298,0.846402,0.858145,0.851362,0.852472,0.876665,0.908187,...,0.239041,0.256158,0.226332,0.223070,0,0,0,0,0,1


Full feature standard classification acc vs Super classification PSO selected acc

In [54]:
# Main entry
# Feature selection with PSO driven by the super/sub-label fitness (FS_ML_super),
# compared against MLkNN on the full feature set over 5 stratified folds.

# Load through scikit-multilearn, like the full-feature comparison cell does:
# read_arff only exists as commented-out code, so calling it fails on a fresh kernel.
X, y, feature_names, label_names = load_dataset(datasets[6], 'undivided')
X = pd.DataFrame.sparse.from_spmatrix(X).to_numpy()
y = pd.DataFrame.sparse.from_spmatrix(y).to_numpy()
n_features = X.shape[1]

n_splits = 5
k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state = 42)

full_hams = []
sel_hams = []
PSO_durations = []
f_ratios = []
to_print = ''
fold_count = 0

for train_idx, test_idx in k_fold.split(X, y):
    print('Fold ', fold_count)
    fold_count += 1

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # get full acc
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # Reuse the training statistics; refitting on the test fold scales it inconsistently.
    X_test_scaled = scaler.transform(X_test)

    clf = MLkNN(k=3)
    clf.fit(X_train_scaled, y_train)
    y_test_pred = clf.predict(X_test_scaled)
    full_ham = hamming_loss(y_test, y_test_pred)
    full_hams.append(full_ham)

    #  perform FS
    start_PSO = time.time_ns()    # marking start time of PSO
    problem = FS_ML_super(minimize=True, X=X_train, y=y_train)

    # parameter for PSO
    pop_size = 3
    n_iterations = 5
    swarm = Swarm(n_particles=pop_size, length=n_features, pos_max=1.0, pos_min=0,
                       vel_max=0.2, vel_min=-0.2, problem=problem, n_iterations=n_iterations)
    best_sol, best_fit = swarm.iterate()
    end_PSO = time.time_ns()      # marking ending time of PSO
    duration_PSO = round((end_PSO - start_PSO)/1000000000, 2)   # nanoseconds -> seconds
    PSO_durations.append(duration_PSO)

    # process the final solution
    sel_fea = np.where(best_sol > problem.threshold)[0]
    # Evaluate the selected subset on the same scaled features as the full-feature
    # baseline; StandardScaler works per column, so slicing the scaled matrices is
    # equivalent to scaling the sliced ones.
    clf.fit(X_train_scaled[:, sel_fea], y_train)
    y_test_pred = clf.predict(X_test_scaled[:, sel_fea])
    fold_ham = hamming_loss(y_true=y_test, y_pred=y_test_pred)
    sel_hams.append(fold_ham)
    f_ratios.append(len(sel_fea)/n_features)

    # to write the results
    to_print += '--------------Fold %d----------------\n' % fold_count
    to_print += 'Full feature hamming loss: %.4f\n' % full_ham
    to_print += 'Fold selected hamming loss: %.4f\n' % fold_ham
    to_print += 'Time of PSO: %.4f\n' % duration_PSO
    to_print += 'Selection ratio: %.2f\n' % (len(sel_fea)/n_features)
    to_print += 'Selected features: %s\n' % (', '.join([str(ele) for ele in sel_fea]))

# NOTE: the "Accuracy" wording below is kept for compatibility with existing record
# files; the reported values are hamming losses.
to_print += '--------------Average----------------\n'
to_print += 'Ave Full Accuracy: %.4f\n' % np.average(full_hams)
to_print += 'Ave Selection Accuracy: %.4f\n' % np.average(sel_hams)
to_print += 'Ave time of PSO: %.4f\n' % np.average(PSO_durations)
to_print += 'Ave Feature Ratio: %.2f\n' % np.average(f_ratios)

# The context manager closes the file even if the write fails.
with open('records/record_' + datasets[6] + '_super_PSO.txt', 'w') as f:
    f.write(to_print)

Fold  0
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.10470910472386
Iterate  2  gbest value is  0.10166311485847494
Iterate  3  gbest value is  0.10166311485847494
Iterate  4  gbest value is  0.10166311485847494
Fold  1
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.1047325435269018
Iterate  2  gbest value is  0.10452233554469884
Iterate  3  gbest value is  0.10154203061304254
Iterate  4  gbest value is  0.10154203061304254
Fold  2
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.09831199379037225
Iterate  2  gbest value is  0.09831199379037225
Iterate  3  gbest value is  0.09804015334541763
Iterate  4  gbest value is  0.09804015334541763
Fold  3
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.09906150874510802
Iterate  2  gbest value is  0.09849433193971643
Iterate  3  gbest value is  0.09849433193971643
Iterate  4  gbest value is  0.09849433193971643
Fold  4
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.104098

Standard PSO FS time cost vs Super PSO FS time cost

In [55]:
# Main entry
# Feature selection with PSO driven by the plain MLkNN fitness (FS_ML),
# compared against MLkNN on the full feature set over 5 stratified folds.

# Load through scikit-multilearn, like the full-feature comparison cell does:
# read_arff only exists as commented-out code, so calling it fails on a fresh kernel.
X, y, feature_names, label_names = load_dataset(datasets[6], 'undivided')
X = pd.DataFrame.sparse.from_spmatrix(X).to_numpy()
y = pd.DataFrame.sparse.from_spmatrix(y).to_numpy()
n_features = X.shape[1]

n_splits = 5
k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state = 42)

full_hams = []
sel_hams = []
PSO_durations = []
f_ratios = []
to_print = ''
fold_count = 0

for train_idx, test_idx in k_fold.split(X, y):
    print('Fold ', fold_count)
    fold_count += 1

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # get full acc
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # Reuse the training statistics; refitting on the test fold scales it inconsistently.
    X_test_scaled = scaler.transform(X_test)

    clf = MLkNN(k=3)
    clf.fit(X_train_scaled, y_train)
    y_test_pred = clf.predict(X_test_scaled)
    full_ham = hamming_loss(y_test, y_test_pred)
    full_hams.append(full_ham)

    #  perform FS
    start_PSO = time.time_ns()    # marking start time of PSO
    problem = FS_ML(minimize=True, X=X_train, y=y_train)

    # parameter for PSO
    pop_size = 3
    n_iterations = 5
    swarm = Swarm(n_particles=pop_size, length=n_features, pos_max=1.0, pos_min=0,
                       vel_max=0.2, vel_min=-0.2, problem=problem, n_iterations=n_iterations)
    best_sol, best_fit = swarm.iterate()
    end_PSO = time.time_ns()      # marking ending time of PSO
    duration_PSO = round((end_PSO - start_PSO)/1000000000, 2)   # nanoseconds -> seconds
    PSO_durations.append(duration_PSO)

    # process the final solution
    sel_fea = np.where(best_sol > problem.threshold)[0]
    # Evaluate the selected subset on the same scaled features as the full-feature
    # baseline; StandardScaler works per column, so slicing the scaled matrices is
    # equivalent to scaling the sliced ones.
    clf.fit(X_train_scaled[:, sel_fea], y_train)
    y_test_pred = clf.predict(X_test_scaled[:, sel_fea])
    fold_ham = hamming_loss(y_true=y_test, y_pred=y_test_pred)
    sel_hams.append(fold_ham)
    f_ratios.append(len(sel_fea)/n_features)

    # to write the results
    to_print += '--------------Fold %d----------------\n' % fold_count
    to_print += 'Full feature hamming loss: %.4f\n' % full_ham
    to_print += 'Fold selected hamming loss: %.4f\n' % fold_ham
    to_print += 'Time of PSO: %.4f\n' % duration_PSO
    to_print += 'Selection ratio: %.2f\n' % (len(sel_fea)/n_features)
    to_print += 'Selected features: %s\n' % (', '.join([str(ele) for ele in sel_fea]))

# NOTE: the "Accuracy" wording below is kept for compatibility with existing record
# files; the reported values are hamming losses.
to_print += '--------------Average----------------\n'
to_print += 'Ave Full Accuracy: %.4f\n' % np.average(full_hams)
to_print += 'Ave Selection Accuracy: %.4f\n' % np.average(sel_hams)
to_print += 'Ave time of PSO: %.4f\n' % np.average(PSO_durations)
to_print += 'Ave Feature Ratio: %.2f\n' % np.average(f_ratios)

# The context manager closes the file even if the write fails.
with open('records/record_' + datasets[6] + '_standardPSO.txt', 'w') as f:
    f.write(to_print)

Fold  0
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.10053697819836607
Iterate  2  gbest value is  0.10053697819836607
Iterate  3  gbest value is  0.10053697819836607
Iterate  4  gbest value is  0.10053697819836607
Fold  1
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.10302277941047297
Iterate  2  gbest value is  0.10302277941047297
Iterate  3  gbest value is  0.10302277941047297
Iterate  4  gbest value is  0.10138353021981936
Fold  2
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.0962167647246712
Iterate  2  gbest value is  0.0962167647246712
Iterate  3  gbest value is  0.0962167647246712
Iterate  4  gbest value is  0.0962167647246712
Fold  3
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.10309103367896781
Iterate  2  gbest value is  0.10309103367896781
Iterate  3  gbest value is  0.10309103367896781
Iterate  4  gbest value is  0.09910224523650166
Fold  4
Iterate  0  gbest value is  inf
Iterate  1  gbest value is  0.102335