In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from copy import deepcopy
from random import randint 
import random 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, plot_confusion_matrix, roc_auc_score
from imblearn.metrics import geometric_mean_score
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.mdo import MDO

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
retail = pd.read_csv('cust_summary_clustered.csv', index_col=0)

In [4]:
retail = retail.rename(columns={'cluster':'class'})
retail.head(1)

Unnamed: 0,firstorder_grosssales,firstorder_units,class,loyaltyaccount_No,loyaltyaccount_Yes,gender_female,gender_male,gender_unknown,shipcountry_Albania,shipcountry_Armenia,...,category_Childrens,category_Infant,category_Junior,category_Mens,category_Miscellaneous,category_Nursery,category_Womens,divisioncode_ACCESSORY,divisioncode_APPAREL,divisioncode_FOOTWEAR
0,64.82,2,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [5]:
retail['class'] = retail['class'].replace({2: 1, 1: 2})

In [6]:
print(retail['class'].value_counts())

0    1932069
1     249970
2      22834
Name: class, dtype: int64


In [7]:
retail['class'].value_counts(normalize=True)

0    0.876272
1    0.113372
2    0.010356
Name: class, dtype: float64

In [8]:
print((retail['class'].value_counts()*0.10).astype('int64').to_frame())

    class
0  193206
1   24997
2    2283


In [9]:
retail_0 = retail[(retail['class'] == 0)] 
retail_0 = retail_0.sample(n= 193206)
retail_1 = retail[(retail['class'] == 1)] 
retail_1 = retail_1.sample(n= 24997)
retail_2 = retail[(retail['class'] == 2)] 
retail_2 = retail_2.sample(n= 2283)

In [10]:
retail_down = pd.concat([retail_0, retail_1, retail_2])
retail_down = shuffle(retail_down)
retail_down.name = 'Retail - sample'
retail_down['class'].value_counts(normalize=True)

0    0.876273
1    0.113372
2    0.010354
Name: class, dtype: float64

In [11]:
retail_new = pd.read_csv('clusters_new.csv', index_col=0)
retail_new = retail_new.rename(columns={'cluster':'class'})
retail_new.head()

Unnamed: 0,class,firstorder_grosssales,firstorder_units,month_April,month_August,month_December,month_February,month_January,month_July,month_June,...,category_Nursery,category_Womens,divisioncode_ACCESSORY,divisioncode_APPAREL,divisioncode_FOOTWEAR,loyaltyaccount_No,loyaltyaccount_Yes,gender_female,gender_male,gender_unknown
0,0,66.1,2,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
1,0,66.1,2,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
2,2,17.45,2,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
3,0,36.14,2,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,1,0,0
4,0,253.48,3,0,0,1,0,0,0,0,...,0,1,0,0,1,1,0,1,0,0


In [12]:
print(retail_new['class'].value_counts())

0    1333494
2     749227
1     122134
Name: class, dtype: int64


In [13]:
retail_new['class'].value_counts(normalize=True)

0    0.604799
2    0.339808
1    0.055393
Name: class, dtype: float64

In [14]:
X= retail_new.drop('class',axis=1).copy()
y = retail_new['class'].copy()
y = y.astype('category')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2, shuffle=y, stratify=y)

In [15]:
retail_new = pd.DataFrame(X_test, columns = X.columns)
retail_new['class'] = y_test

In [16]:
print(retail_new['class'].value_counts())

0    133350
2     74923
1     12213
Name: class, dtype: int64


In [17]:
retail_new.name = 'Retail - new'

In [18]:
retail_new.head()

Unnamed: 0,firstorder_grosssales,firstorder_units,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,...,category_Womens,divisioncode_ACCESSORY,divisioncode_APPAREL,divisioncode_FOOTWEAR,loyaltyaccount_No,loyaltyaccount_Yes,gender_female,gender_male,gender_unknown,class
654230,77.92,1,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,0,2
1418204,26.21,2,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,1,0,0,0
1940088,71.1,1,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,1,0,0
1765465,26.98,2,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,1,0,0,0
650403,99.68,2,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,1,0,0


In [19]:
# train test/ scaling
def data_prep (data, seed):
  X= data.drop('class',axis=1).copy()
  y = data['class'].copy()
  y = y.astype('category')
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed, shuffle=y, stratify=y) # add ssed

  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
  X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [20]:
def random_under_minority (data, imbalance_level, seed):
  if imbalance_level == 'none':
    X_train_scaled, X_test_scaled, y_train, y_test = data_prep(data, 1)
    X_train_scaled = X_train_scaled.reset_index(drop=True).values
    X_test_scaled = X_test_scaled.reset_index(drop=True).values
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
  
  else:
    X_train_scaled, X_test_scaled, y_train, y_test = data_prep(data, 1)
    y_train = y_train.reset_index(drop=True)
    X_train_scaled['class'] = y_train

    class_counts = X_train_scaled['class'].value_counts().to_frame()

    minority_full = pd.DataFrame()
    for c in range(1,len(class_counts)):
      maj_count = class_counts.iloc[0,0]
      min_count = class_counts.iloc[c,0]

      majority = X_train_scaled[(X_train_scaled['class'] == 0)]
      minority = X_train_scaled[(X_train_scaled['class']== c)]

      if imbalance_level == 'absolute':
          downsample = 6
  
      else:
        if imbalance_level == 'high':
          imbalance = 0.05
      
        elif imbalance_level == 'extreme':
          imbalance = 0.01

        downsample = (maj_count * imbalance).round().astype('int')
    
      if imbalance_level == 'extreme' and downsample < 8:
        downsample = 8

      if downsample < 6:
        downsample = 6 
 
      if downsample >= min_count:
        minority_sample = minority
      
      
      else:
        minority_sample = minority.sample(n= downsample)

      minority_full = pd.concat([minority_full, minority_sample])
    final = pd.concat([majority, minority_full])
    final = shuffle(final)

    X_train_scaled = final.drop('class',axis=1).copy()
    y_train = final['class'].copy()
    y_train = y_train.astype('category')

    X_train_scaled = X_train_scaled.reset_index(drop=True).values
    X_test_scaled = X_test_scaled.reset_index(drop=True).values
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
  return X_train_scaled, X_test_scaled, y_train, y_test

In [21]:
def SMOTE_sampling (data, imbalance, seed):
  X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(data, imbalance, seed)

  SMOTE_sample = SMOTE()
  X_train_scaled, y_train = SMOTE_sample.fit_resample(X_train_scaled, y_train)
  
  return X_train_scaled, X_test_scaled, y_train, y_test

In [22]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

class SingularMatrixException(Exception):
    def __init__(self):
        Exception.__init__(self,"Singular data matrix... use subspace") 

def _msqrt(X):
    '''Computes the square root matrix of symmetric square matrix X.'''
    (L, V) = np.linalg.eig(X)
    return V.dot(np.diag(np.sqrt(L))).dot(V.T) 


class SwimMaha:

    def __init__(self, sd=0.25, minClass=None, subSpaceSampling=False):
        self.sd = sd
        self.minClass = minClass
        self.subSpaceSampling = subSpaceSampling

    # the data passed is transposed, so the rows are the features, and the columns are the instances
    def mahaSampling(self, data, labels, numSamples):

        if self.minClass == None:
            self.minClass     = np.argmin(np.bincount(labels.astype(int)))

        syntheticInstances  = []
        data_maj_orig       = data[np.where(labels!=self.minClass)[0], :]
        data_min_orig       = data[np.where(labels==self.minClass)[0], :]
        data_min_orig = data_min_orig+0.0001*np.random.rand((data_min_orig.shape)[0],(data_min_orig.shape)[1])

        if(np.sum(labels==self.minClass)==1):
            data_min_orig = data_min_orig.reshape(1,len(data_min_orig))
            # trnMinData    = trnMinData.reshape(1,len(trnMinData))

        ## STEP 1: CENTRE
        ## CENTRE THE MAJORITY CLASS AND CENTRE THE MINORITY CLASS WITH RESPECT TO THE MAJORITY CLASS
        scaler = StandardScaler(with_std=False)
        T_maj  = np.transpose(scaler.fit_transform(data_maj_orig))
        T_min  = np.transpose(data_min_orig) 

        ## STEP 2: WHITEN
        C_inv = None
        C     = np.cov(T_maj) # the covariance matrix - of the majority class

        # CALCULATE THE RANK OF THE MAJORITY CLASS DATA MATRIX AND INVERT IT IF POSSIBLE
        data_rank = np.linalg.matrix_rank(data_maj_orig) 
        if data_rank < T_maj.shape[0]: # there are linearly dependent column, so inverse will be singular
            if self.subSpaceSampling == False:
                print("The majority class has linearly dependent columns. Rerun the sampling subSpaceSampling=True. Return original data.")
                return data, labels
            else:

                QR = np.linalg.qr(data_maj_orig)
                indep = QR[1].diagonal() > 0
                data = data[:,indep]
                print("The majority class has linearly dependent columns. Resampled data will be in the " + str(sum(indep==True)) + " independent columns of the orginal " + str(data_maj_orig.shape[1]) + "-dimensional data.")

        else:
            try:
                C_inv = np.linalg.inv(C) # inverse of the covariance matrix
            except np.linalg.LinAlgError as e:
                if 'Singular matrix' in str(e):
                    print("Majority class data is singular. Degrading to random oversampling with Gaussian jitter")
                    X_new = data_min_orig[np.random.choice(data_min_orig.shape[0], numSamples, replace=True), :]
                    X_new = X_new + (0.1 * np.random.normal(0, data_maj_orig.std(0), X_new.shape))
                    y_new = np.repeat(self.minClass, numSamples)
                    data   = X_new
                    labels = y_new
                    return data, labels
        
        try:
            M     = _msqrt(C_inv) # C_inv is the inverse of the covariance matrix, and M is the matrix for the whitening transform
            M_inv = np.linalg.inv(M) # this is the inverse of the M matrix, we'll use it for getting the data back.

            W_min      = M.dot(T_min) # whitening transform - whiten the minority class
            W_maj      = M.dot(T_maj) # whitening transform - whiten the majority class
        except:
            print("value excpetion... synthetic instances not generated")
            return data, labels

        ## STEP 3: FIND THE MEANS AND FEATURE BOUNDS TO USE IN THE GENERATION PROCESS
        min_means  = W_min.mean(1)
        min_stds   = W_min.std(1)
        min_ranges_bottom = min_means - self.sd*min_stds
        min_ranges_top    = min_means + self.sd*min_stds

 
        ## STEP 4: GENERATE SYNTHETIC INSTANCES
        # RANDOMLY REPLICATE THE WHITENED MINORITY CLASS INSTNACES <numSamples> TIMES TO GENERATE SYNTHETIC INSTANCES FROM
        smpInitPts = W_min[:, np.random.choice(W_min.shape[1], numSamples)]
        for smpInd in range(smpInitPts.shape[1]): # repeat "times" times, so we get a balanced dataset
            new_w_raw = []
            new       = None
            new_w     = None
            smp       = smpInitPts[:, smpInd]
            for dim in range(len(min_means)):
                new_w_raw.append(random.uniform(smp[dim]-self.sd*min_stds[dim], smp[dim]+self.sd*min_stds[dim]))

            ## Step 5: SCALE BACK TO THE ORIGINAL SPACE
            new_w = np.array(new_w_raw) / ((np.linalg.norm(new_w_raw)/np.linalg.norm(smp)))
            new   = M_inv.dot(np.array(new_w))
               
            syntheticInstances.append(new)
            
        new_data   = np.array(syntheticInstances)
        new_labels = [self.minClass]*len(syntheticInstances)

        return new_data, new_labels

In [23]:
def SWIM_sampling (data, imbalance, seed, num_classes):
  X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(data, imbalance, seed)

  X_train_scaled_new = deepcopy(X_train_scaled)
  y_train_new = deepcopy(y_train)
  for i in range(1, num_classes):
    numSamples = np.sum(y_train==0)-np.sum(y_train==i)
    label = np.array(y_train[(y_train==0)|(y_train==i)])
    data = np.array(X_train_scaled[(y_train==0)|(y_train==i)])
    sw = SwimMaha(sd= 2, minClass=i)
    data_new, new_labels = sw.mahaSampling(data, label, numSamples)
    X_train_scaled_new = np.concatenate([X_train_scaled_new, data_new])
    y_train_new = np.append(y_train_new, new_labels)
  
  return X_train_scaled_new, X_test_scaled, y_train_new, y_test

In [24]:
class SingularMatrixException(Exception):
    def __init__(self):
        Exception.__init__(self,"Singular data matrix... use subspace") 

class MAHAKIL(object):
    def __init__(self, pfp=0.5):
        self.data_t = None  # Save the initial defect sample
        self.pfp = pfp  # Proportion of expected defect samples
        self.T = 0  # Number of defect samples to be generated
        self.new = []  # Store newly generated samples

    # Core method
    # return : data_new, label_new
    def fit_sample(self, data, label, num_classes):
        
        for i in range(1, num_classes):
            
            label_f = np.array(label[label==0])
            label_t = np.array(label[label== i])
            
            data_f = np.array(data[label==0])
            data_t = np.array(data[label== i])
            
            data_t = data_t+0.00001*np.random.rand((data_t.shape)[0],(data_t.shape)[1])
            self.T = int(len(data_f) / (1 - self.pfp) - len(data_f))
            self.data_t = np.array(data_t)
            
            # Calculate the Mahalanobis distance
            d = 0
            d = self.mahalanobis_distance(self.data_t)
        
            d = pd.DataFrame (d,columns=['Malhabonis Distance'])
            d = d.reset_index(drop=False)
            d = d.values.tolist()
        

            # Descending order
            d.sort(key=lambda x: x[1], reverse=True)
            # Divide the set of positive examples into two
            k = len(d)
            d_index = [d[i][0] for i in range(k)]
            d_index = [ int(d) for d in d_index ]
            data_t_sorted = [data_t[i] for i in d_index]
            
            mid = int(k/2)
            bin1 = [data_t_sorted[i] for i in range(0, mid)]
            bin2 = [data_t_sorted[i] for i in range(mid, k)]
            # Loop iteration to generate new samples
            l_ = len(bin1)
            mark = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535, 131071, 262143, 524287]
            p = self.T / (l_ +0.0)
            is_full = True
            g = mark.index([m for m in mark if m > p][0]) + 1
            cluster = 2 ** (g - 1)  # Number of children of the last generation
            if (self.T - mark[g-2]*l_) < cluster:
                # Explain that adding more generations is better than keeping a few
                is_full = False
                g -= 1
                k = 0
            else:
                k = l_ - round((self.T - mark[g-2]*l_)/cluster)
            self.generate_new_sample(bin1, bin2, g, l_, k, is_full)
            # Return data and labels
            label_new = np.ones(len(self.new))+ (i-1)
            data = np.append(data, self.new, axis=0)
            data = pd.DataFrame(data)
            label = np.append(label, label_new, axis=0)
        return data, label 

    def mahalanobis_distance(self, x):
        x_mu = x - np.mean(x)
        cov = np.cov(x.T)
        inv_covmat = np.linalg.inv(cov)
        left = np.dot(x_mu, inv_covmat)
        mahal = np.dot(left, x_mu.T).diagonal()
        return mahal


    # Generate new samples
    def generate_new_sample(self, bin1, bin2, g, l, k, is_full):
        # bin1, bin2 are arrays
        # g Hereditary remaining algebra
        # l bin1 number of items
        # k The number of each node to be cropped in the last generation
        # is_full whether it overflows, that is, the last generation is counted, whether it exceeds T, or is not full
        assert len(bin1) <= len(bin2)
        if g >= 2 or (g == 1 and is_full is False):
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            self.new.extend(lv_0)
            self.generate_new_sample(lv_0, bin1, g-1, l, k, is_full)
            self.generate_new_sample(lv_0, bin2, g-1, l, k, is_full)
        if g == 1 and is_full:
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            del lv_0[-1: (-k-1): -1]
            self.new.extend(lv_0)

In [25]:
def MAHAKIL_sampling (data, imbalance, seed, num_classes):
  X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(data, imbalance, seed)

  mk = MAHAKIL(pfp=0.5)
  X_train_scaled, y_train = mk.fit_sample(X_train_scaled, y_train, num_classes)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [26]:
def MDO_sampling (data, imbalance, seed, num_classes):
  X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(data, imbalance, seed)

  mk = MAHAKIL(pfp=0.5)
  X_train_scaled, y_train = mk.fit_sample(X_train_scaled, y_train, num_classes)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [27]:
from itertools import combinations
def a_value(probabilities, zero_label=0, one_label=1):
    # Obtain a list of the probabilities for the specified zero label class
    expanded_points = []
    for instance in probabilities:
        if instance[0] == zero_label or instance[0] == one_label:
            expanded_points.append((instance[0], instance[1],zero_label))
    sorted_ranks = sorted(expanded_points, key=lambda x: x[1])

    n0, n1, sum_ranks = 0, 0, 0
    # Iterate through ranks and increment counters for overall count and ranks of class 0
    for index, point in enumerate(sorted_ranks):
        if point[0] == zero_label:
            n0 += 1
            sum_ranks += index + 1  # Add 1 as ranks are one-based
        elif point[0] == one_label:
            n1 += 1
        else:
            pass  # Not interested in this class
        if n0 ==0:
          n0 = 1
        if n1 == 0:
          n1 = 1


    return (sum_ranks - (n0*(n0+1)/2.0)) / float(n0 * n1)  # Eqn 3

def MAUC(data, num_classes):
    # Find all pairwise comparisons of labels
    class_pairs = [x for x in combinations(range(num_classes), 2)]

    # Have to take average of A value with both classes acting as label 0 as this
    # gives different outputs for more than 2 classes
    sum_avals = 0
    for pairing in class_pairs:
        sum_avals += (a_value(data, zero_label=pairing[0], one_label=pairing[1]) +
                      a_value(data, zero_label=pairing[1], one_label=pairing[0])) / 2.0

    return sum_avals * (2 / float(num_classes * (num_classes-1)))  # Eqn 7

In [28]:
def classifier_results (data, imbalance, sampling_method, model, seed):

  num_classes = len(data['class'].value_counts())
  
  if sampling_method == 'none':
    X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(data, imbalance, seed)

  if sampling_method == 'SMOTE':
    X_train_scaled, X_test_scaled, y_train, y_test = SMOTE_sampling (data, imbalance, seed)
  
  if sampling_method == 'SWIM':
    X_train_scaled, X_test_scaled, y_train, y_test = SWIM_sampling (data, imbalance, seed, num_classes)
  
  if sampling_method == 'MAHAKIL':
    X_train_scaled, X_test_scaled, y_train, y_test = MAHAKIL_sampling(data, imbalance, seed, num_classes)
  
  if sampling_method == 'MDO':
    X_train_scaled, X_test_scaled, y_train, y_test = MDO_sampling(data, imbalance, seed, num_classes)

  if model == 'naive_bayes':
    model_func = GaussianNB()

  elif model == 'K_neighbours':
    model_func = KNeighborsClassifier(n_neighbors=3)
  
  elif model == 'Random_forest':
    model_func = RandomForestClassifier()
  
  elif model == 'SVM':
    model_func = SVC(kernel='rbf', gamma=1, C=1, probability=True, decision_function_shape='ovo')

  model_base = model_func
  model_base.fit(X_train_scaled, y_train)

  y_pred = model_base.predict(X_test_scaled)
  y_pred_probs = model_base.predict_proba(X_test_scaled)

  probabilities = []
  for i in range(len(y_pred)):
    element = (y_pred_probs[i])
    element = np.insert(element, 0, y_pred[i])
    probabilities.append(element)
  
  score_MAUC = MAUC(probabilities, num_classes)

  f1 = f1_score(y_test, y_pred, average=None)
  score_f1 = sum(f1[1:])/ (len(f1)-1)
  GMS = geometric_mean_score(y_test, y_pred, average='macro')


  return [data.name, imbalance, sampling_method, model, score_f1, score_MAUC, GMS]

In [37]:
results = classifier_results(retail_new, 'none', 'MDO','naive_bayes', 1)

In [36]:
results

['Retail - new',
 'none',
 'SWIM',
 'naive_bayes',
 0.43254517510333595,
 0.49994055360159956,
 0.6752426915434973]

In [34]:
results

['Retail - new',
 'none',
 'none',
 'naive_bayes',
 0.43215869264052736,
 0.49994013413906596,
 0.6746967761676528]

In [32]:
results

['Retail - new',
 'none',
 'SMOTE',
 'naive_bayes',
 0.38730789342376387,
 0.4999334597543541,
 0.6436038034955949]

In [38]:
results

['Retail - new',
 'none',
 'MDO',
 'naive_bayes',
 0.3596657693875504,
 0.4999447321400773,
 0.6174721794083832]

In [71]:
all_results = []
datasets = [WineW3, WineR3, vowel_3, WineW5, WineR5, ecoli, heart, glass]
imbalance = ['none','high','extreme']
sampling_method = ['none','SMOTE', 'MAHAKIL','SWIM', 'MDO']
models = ['naive_bayes', 'Random_forest']
for data in datasets:
  for i in imbalance: 
    for s in sampling_method:
      for m in models:
        for seed in range(3):
          try:
            results = classifier_results (data, i, s, m, seed) 
            all_results.append(results)
          except:
            pass

df = pd.DataFrame(all_results, columns=['Data', 'Imbalance_level','Sampling_method', 'Model', 'F1', 'MAUC', 'G_mean'])
df.to_csv('multi.csv')

NameError: name 'WineW3' is not defined

In [72]:
df = pd.read_csv('multi.csv', index_col=0)
df.head()

Unnamed: 0,Data,Imbalance_level,Sampling_method,Model,F1,MAUC,G_mean
0,Wine white - 3 class,none,none,naive_bayes,0.184659,0.48692,0.561472
1,Wine white - 3 class,none,none,naive_bayes,0.184659,0.48692,0.561472
2,Wine white - 3 class,none,none,naive_bayes,0.184659,0.48692,0.561472
3,Wine white - 3 class,none,none,Random_forest,0.417467,0.452179,0.625266
4,Wine white - 3 class,none,none,Random_forest,0.445628,0.455872,0.637041


In [10]:
test = df.groupby(
   ['Data','Sampling_method','Imbalance_level', 'Model']
).agg(
    {  
         'F1': "mean",  
         'MAUC': 'mean',
         'G_mean': "mean",  
    }
).round(decimals=3)
test = test.reset_index()
test

Unnamed: 0,Data,Sampling_method,Imbalance_level,Model,F1,MAUC,G_mean
0,Ecoli 1,MAHAKIL,extreme,Random_forest,0.723,0.418,0.871
1,Ecoli 1,MAHAKIL,extreme,naive_bayes,0.432,0.334,0.746
2,Ecoli 1,MAHAKIL,high,Random_forest,0.656,0.409,0.851
3,Ecoli 1,MAHAKIL,high,naive_bayes,0.433,0.412,0.767
4,Ecoli 1,MAHAKIL,none,Random_forest,0.806,0.404,0.900
...,...,...,...,...,...,...,...
223,Wine white - 5 class,none,extreme,naive_bayes,0.081,0.351,0.440
224,Wine white - 5 class,none,high,Random_forest,0.226,0.445,0.508
225,Wine white - 5 class,none,high,naive_bayes,0.191,0.478,0.498
226,Wine white - 5 class,none,none,Random_forest,0.540,0.470,0.689


In [16]:
test2 = test.groupby(
   ['Data','Sampling_method','Imbalance_level']
).agg(
    {  
         'F1': "max",  
         'MAUC': 'max',
         'G_mean': "max",  
    }
).round(decimals=3)
test2 = test2.reset_index()
test2

Unnamed: 0,Data,Sampling_method,Imbalance_level,F1,MAUC,G_mean
0,Ecoli 1,MAHAKIL,extreme,0.723,0.418,0.871
1,Ecoli 1,MAHAKIL,high,0.656,0.412,0.851
2,Ecoli 1,MAHAKIL,none,0.806,0.404,0.900
3,Ecoli 1,MDO,extreme,0.748,0.408,0.877
4,Ecoli 1,MDO,high,0.682,0.412,0.837
...,...,...,...,...,...,...
110,Wine white - 5 class,SWIM,high,0.304,0.495,0.624
111,Wine white - 5 class,SWIM,none,0.492,0.494,0.724
112,Wine white - 5 class,none,extreme,0.081,0.351,0.440
113,Wine white - 5 class,none,high,0.226,0.478,0.508


In [23]:
group = test2.copy(deep=True)
high = group[(group['Imbalance_level']== 'high')]
high = high.groupby(
   ['Data','Sampling_method']
).agg(
    {  
         'F1': "mean",  
         'MAUC': 'mean',
         'G_mean': 'mean'  
    }
).round(decimals=3)
high = high.reset_index()
high['Dataset']= high['Data']
high['Dataset']= high['Dataset'].replace({'Ecoli 1': 'D1', 'Glass 1': 'D2', 'Heart 1': 'D3', 'Vowel - 3 classes': 'D4', 'Wine red - 3 class':'D5', 'Wine white - 3 class':'D6', 'Wine white - 5 class':'D7'})
high

Unnamed: 0,Data,Sampling_method,F1,MAUC,G_mean,Dataset
0,Ecoli 1,MAHAKIL,0.656,0.412,0.851,D1
1,Ecoli 1,MDO,0.682,0.412,0.837,D1
2,Ecoli 1,SMOTE,0.719,0.408,0.854,D1
3,Ecoli 1,SWIM,0.736,0.408,0.871,D1
4,Ecoli 1,none,0.676,0.411,0.849,D1
5,Glass 1,MAHAKIL,0.58,0.266,0.748,D2
6,Glass 1,MDO,0.433,0.244,0.65,D2
7,Glass 1,SMOTE,0.652,0.287,0.794,D2
8,Glass 1,SWIM,0.666,0.309,0.808,D2
9,Glass 1,none,0.545,0.265,0.711,D2


In [34]:
high_csv = high[['Dataset','Sampling_method','F1','MAUC','G_mean']]
high_csv.head()
high_csv.to_csv('multi_high.csv')

In [None]:
df = pd.read_json("/content/drive/MyDrive/Comparison_datasets.json")
df = df.replace({'none': 'None', 'high': 'High', 'extreme':'Extreme', 'absolute':'Absolute'})

In [None]:
df

In [None]:
group = df.copy(deep=True)
high = group[(group['Imbalance_level']== 'High')]
high = high.groupby(
   ['Sampling_method', 'Data']
).agg(
    {  
         'F1': "mean",  
         'AUC': 'mean',
         'G_mean': "mean",  
    }
).round(decimals=3)
high = high.reset_index()
high = high.sort_values(by=['Data', 'Sampling_method'])
high

In [None]:
group = df.copy(deep=True)
extreme = group[(group['Imbalance_level']== 'High')]
extreme = extreme.groupby(
   ['Sampling_method', 'Data']
).agg(
    {  
         'F1': "mean",  
         'AUC': 'mean',
         'G_mean': "mean",  
    }
).round(decimals=3)
extreme = extreme.reset_index()
extreme = extreme.sort_values(by=['Data', 'Sampling_method'])
extreme

In [None]:
group = df.copy(deep=True)
absolute = group[(group['Imbalance_level']== 'High')]
absolute = absolute.groupby(
   ['Sampling_method', 'Data']
).agg(
    {  
         'F1': "mean",  
         'AUC': 'mean',
         'G_mean': "mean",  
    }
).round(decimals=3)
absolute = absolute.reset_index()
absolute

In [None]:
df["Imbalance_level/ Sampling method"] = df["Imbalance_level"] + "/" + df["Sampling_method"]
df["Imbalance_level/ Sampling method"].unique()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 5), sharey=True)
fig.suptitle('F1 Model Score by Data Imbalance')
high = df[(df['Imbalance_level']== 'High')]
extreme = df[(df['Imbalance_level']== 'Extreme')]
absolute = df[(df['Imbalance_level']== 'Absolute')]
sns.boxplot(ax=axes[0], x="Model", y="F1", data=high, palette="mako")
axes[0].set_title("High Imbalance")
sns.boxplot(ax=axes[1], x="Model", y="F1", data=extreme, palette="mako")
axes[1].set_title("Extreme Imbalance")
sns.boxplot(ax=axes[2], x="Model", y="F1", data= absolute, palette="mako")
axes[2].set_title("Absolute Imbalance")

In [None]:
plt.figure(figsize=(20,4))
sns.boxplot(x="Imbalance_level/ Sampling method", y="F1", data=df, palette="mako")
plt.axvline(x=3.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=7.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=11.5, ymin=0, ymax=40000, color='black')
plt.title('F1 score per sampling method - for three imbalance levels ')

In [None]:
plt.figure(figsize=(20,4))
sns.boxplot(x="Imbalance_level/ Sampling method", y="AUC", data=df, palette="mako")
plt.axvline(x=3.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=7.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=11.5, ymin=0, ymax=40000, color='black')
plt.title('AUC score per sampling method - for three imbalance levels ')

In [None]:
plt.figure(figsize=(20,4))
sns.boxplot(x="Imbalance_level/ Sampling method", y="G_mean", data=df, palette="mako")
plt.axvline(x=3.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=7.5, ymin=0, ymax=40000, color='black')
plt.axvline(x=11.5, ymin=0, ymax=40000, color='black')
plt.title('Geometric mean score per sampling method - for three imbalance levels ')

In [None]:
import multi_imbalance.resampling.mdo as sample_mdo

In [None]:
 !pip install multi_imbalance