In [2]:
# ! conda install -c conda-forge imbalanced-learn -y
# ! pip install nose
# ! pip install imbalanced-ensemble           
# ! pip install threadpoolctl

In [20]:
# https://github.com/Mimimkh/SMOTE-ENC-code/
from scipy import stats
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import check_array, _safe_indexing, sparsefuncs_fast, check_X_y, check_random_state
from numbers import Integral
from sklearn.neighbors import NearestNeighbors
from scipy import sparse



class mSMOTENC():
    """Over-sampler for mixed continuous/categorical data (SMOTE-ENC variant).

    Adapted from https://github.com/Mimimkh/SMOTE-ENC-code/.  Each level of a
    categorical column is replaced by the score ``(observed - expected) /
    expected`` computed against the minority class, the scores are scaled by
    the median standard deviation of the minority class' continuous features,
    SMOTE-style interpolation is run in that numeric space, and finally the
    categorical columns are mapped back to their original levels.

    Parameters
    ----------
    categorical_features : sequence of int or sequence of bool
        Positions (or a boolean mask) of the categorical columns of ``X``.
        May be empty, in which case only continuous interpolation happens.
    k_neighbors : int or neighbours estimator, default=5
        Number of neighbours used to build synthetic samples, or an object
        exposing ``fit``/``kneighbors`` that is cloned and used as-is.
    random_state : int, RandomState instance or None, default=42
        Passed to ``sklearn.utils.check_random_state`` when drawing samples.
        The default reproduces the seed that used to be hard-coded.
    """

    def __init__(self, categorical_features, k_neighbors=5, random_state=42):
        self.categorical_features = categorical_features
        self.k_neighbors = k_neighbors
        self.random_state = random_state

    def chk_neighbors(self, nn_object, additional_neighbor):
        """Return a nearest-neighbours estimator built from ``nn_object``.

        Parameters
        ----------
        nn_object : int or estimator
            An integer yields ``NearestNeighbors`` with
            ``nn_object + additional_neighbor`` neighbours.  Any object with
            ``fit`` and ``kneighbors`` methods is cloned and returned.
        additional_neighbor : int
            Extra neighbours to request when ``nn_object`` is an integer
            (the caller uses 1 to account for each point matching itself).

        Raises
        ------
        TypeError
            If ``nn_object`` is neither an integer nor a neighbours estimator.
        """
        if isinstance(nn_object, Integral):
            return NearestNeighbors(n_neighbors=nn_object + additional_neighbor)
        if hasattr(nn_object, 'kneighbors') and hasattr(nn_object, 'fit'):
            # Imported here because the notebook's import cell does not provide it.
            from sklearn.base import clone
            return clone(nn_object)
        raise TypeError(
            'k_neighbors has to be an int or an estimator exposing fit/kneighbors. '
            'Got {} instead.'.format(type(nn_object))
        )

    def generate_samples(self, X, nn_data, nn_num, rows, cols, steps, continuous_features_,):
        """Interpolate new rows between ``X[rows]`` and their chosen neighbours.

        Every column is first interpolated linearly.  Columns at positions
        ``continuous_features_.size`` and beyond are then overwritten with the
        most frequent value among all neighbours of the base row, so those
        columns only ever take values that already occur in ``nn_data``.

        Parameters
        ----------
        X : array or sparse matrix
            Rows to interpolate from.
        nn_data : array or sparse matrix
            Data the neighbour indices in ``nn_num`` refer to.
        nn_num : ndarray of shape (n_rows, k)
            Neighbour indices of every row of ``X``.
        rows, cols : ndarray
            Base-row index and neighbour slot of each synthetic sample.
        steps : ndarray of shape (n_samples, 1)
            Interpolation factor of each synthetic sample.
        continuous_features_ : ndarray
            Only its size is used: it marks where the trailing block of
            encoded categorical columns starts.

        Returns
        -------
        X_new : array or LIL sparse matrix with one row per synthetic sample.
        """
        diffs = nn_data[nn_num[rows, cols]] - X[rows]

        if sparse.issparse(X):
            sparse_func = type(X).__name__
            steps = getattr(sparse, sparse_func)(steps)
            X_new = X[rows] + steps.multiply(diffs)
        else:
            X_new = X[rows] + steps * diffs

        X_new = (X_new.tolil() if sparse.issparse(X_new) else X_new)
        # convert to dense array since scipy.sparse doesn't handle 3D
        nn_data = (nn_data.toarray() if sparse.issparse(nn_data) else nn_data)
        all_neighbors = nn_data[nn_num[rows]]

        for idx in range(np.size(continuous_features_), X.shape[1]):
            mode = stats.mode(all_neighbors[:, :, idx], axis=1)[0]
            X_new[:, idx] = np.ravel(mode)

        return X_new

    def make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples, continuous_features_, step_size=1.0):
        """Draw ``n_samples`` synthetic rows for one class.

        A (row, neighbour-slot) pair and an interpolation factor in
        ``[0, step_size)`` are drawn for every synthetic sample and handed to
        :meth:`generate_samples`.

        Returns
        -------
        X_new : synthetic rows.
        y_new : ndarray of length ``n_samples`` filled with ``y_type``.
        """
        # getattr keeps instances created without __init__ working with the old seed.
        random_state = check_random_state(getattr(self, 'random_state', 42))
        samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples)
        steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis]
        rows = np.floor_divide(samples_indices, nn_num.shape[1])
        cols = np.mod(samples_indices, nn_num.shape[1])

        X_new = self.generate_samples(X, nn_data, nn_num, rows, cols, steps, continuous_features_)
        y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)

        return X_new, y_new

    def cat_corr_pandas(self, X, target_df, target_column, target_value):
        """Encode every categorical column of ``X`` against a target value.

        For each level of each column the encoded value is
        ``(O - E) / E`` where ``O`` is the number of rows with that level and
        ``target == target_value`` and ``E`` is the number of rows with that
        level times the overall share of ``target_value`` rows.

        Parameters
        ----------
        X : DataFrame
            Contains only the categorical columns.
        target_df : DataFrame or Series
            The target.  A DataFrame is indexed with ``target_column``; a
            Series is used directly.  It is matched to ``X`` by position.
        target_column : label
            Column of ``target_df`` holding the target (ignored for a Series).
        target_value : scalar
            Target value the encoding is computed for.

        Returns
        -------
        encoded : DataFrame
            Same index/columns as ``X`` with float encodings; missing levels
            are filled with -1.
        encoded_dict_list : list of dict
            One independent ``{level: encoding}`` dict per column.
        nan_dict : dict
            ``{column position: row positions}`` of rows whose level was
            missing, only for columns that have such rows.

        Raises
        ------
        ValueError
            If the target length does not match ``X``, if ``X`` has no rows,
            or if no row carries ``target_value`` while columns need encoding.
        """
        categorical_columns = list(X.columns)

        target = target_df[target_column] if isinstance(target_df, pd.DataFrame) else target_df
        target_values = np.ravel(np.asarray(target))
        n_rows = len(X)
        if len(target_values) != n_rows:
            raise ValueError(
                'target has {} values but X has {} rows'.format(len(target_values), n_rows)
            )
        if n_rows == 0:
            raise ValueError('X has no rows to encode')

        is_target = np.asarray(target_values == target_value, dtype=bool)
        n_target = int(is_target.sum())
        if categorical_columns and n_target == 0:
            raise ValueError(
                'no row has target value {!r}; categorical levels cannot be encoded'.format(target_value)
            )
        imb_ratio = n_target / n_rows

        encoded_dict_list = []
        nan_dict = dict({})
        encoded_columns = []

        for position in range(len(categorical_columns)):
            # Positional access keeps this working with duplicate column labels.
            values = X.iloc[:, position]

            # A fresh dict per column: levels of different columns may share names.
            OE_dict = {}
            for level in values.unique():
                if pd.isna(level):
                    # Missing levels are not encoded; they surface as NaN after
                    # map() below and are recorded in nan_dict.
                    continue
                row_level_filter = (values == level).fillna(False).to_numpy(dtype=bool)
                rows_in_level = int(row_level_filter.sum())

                O = int((is_target & row_level_filter).sum())
                E = rows_in_level * imb_ratio
                # Encoded value = chi, i.e. (observed - expected)/expected
                OE_dict[level] = (O - E) / E
            encoded_dict_list.append(OE_dict)

            mapped = values.map(OE_dict)
            nan_idx_array = np.flatnonzero(mapped.isna().to_numpy())
            if len(nan_idx_array) > 0:
                nan_dict[position] = nan_idx_array
            encoded_columns.append(mapped.fillna(-1).astype(float).to_numpy())

        if encoded_columns:
            data = np.column_stack(encoded_columns)
        else:
            data = np.empty((n_rows, 0), dtype=float)
        encoded = pd.DataFrame(data, index=X.index, columns=X.columns)
        return encoded, encoded_dict_list, nan_dict

    def fit_resample(self, X, y):
        """Over-sample every non-majority class up to the majority class size.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_samples, n_features)
            Feature table; columns listed in ``categorical_features`` are
            treated as categorical, all others as continuous.
        y : array-like of shape (n_samples,)
            Class labels, matched to ``X`` by position.

        Returns
        -------
        X_resampled : ndarray
            Original rows followed by the synthetic rows, columns in the
            original order.  The array is float when the categorical columns
            are numeric (or absent) and object otherwise.
        y_resampled : ndarray
            Labels for ``X_resampled``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, a categorical index is out of
            range, or a class to over-sample has fewer than two rows.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        y = np.ravel(y)
        if len(y) != len(X):
            raise ValueError('X has {} rows but y has {} labels'.format(len(X), len(y)))

        n_features_ = X.shape[1]
        categorical_features = np.atleast_1d(np.asarray(self.categorical_features))
        if categorical_features.size == 0:
            # np.asarray([]) is float64 and cannot be used as a column indexer.
            categorical_features_ = np.array([], dtype=int)
        elif categorical_features.dtype.name == 'bool':
            categorical_features_ = np.flatnonzero(categorical_features)
        else:
            if any([cat not in np.arange(n_features_) for cat in categorical_features]):
                raise ValueError('Some of the categorical indices are out of range. Indices'
                            ' should be between 0 and {}'.format(n_features_ - 1))
            categorical_features_ = categorical_features.astype(int)

        continuous_features_ = np.setdiff1d(np.arange(n_features_), categorical_features_)
        n_continuous = continuous_features_.size

        unique, counts = np.unique(y, return_counts=True)
        class_counts = dict(zip(unique, counts))
        n_sample_majority = max(class_counts.values())
        class_majority = max(class_counts, key=class_counts.get)
        sampling_strategy = {key: n_sample_majority - value for (key, value) in class_counts.items() if key != class_majority}

        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        if n_continuous:
            X_continuous = check_array(X.iloc[:, continuous_features_].to_numpy())
        else:
            X_continuous = np.empty((len(X), 0), dtype=float)

        # Scale factor that puts encoded categorical values on the same footing
        # as the continuous features; fall back to 1.0 when it is undefined or
        # zero, otherwise every encoded level would collapse to the same value.
        median_std_ = 1.0
        if n_continuous:
            X_minority = X_continuous[y == class_minority]
            candidate = float(np.median(np.sqrt(X_minority.var(axis=0))))
            if np.isfinite(candidate) and candidate > 0:
                median_std_ = candidate

        X_categorical = X.iloc[:, categorical_features_].to_numpy()

        # Encode against the real labels and the minority class instead of a
        # hard-coded column name / value.
        X_cat_encoded, encoded_dict_list, nan_dict = self.cat_corr_pandas(
            X.iloc[:, categorical_features_],
            pd.Series(y, index=X.index, name='target'),
            target_column='target',
            target_value=class_minority,
        )
        X_cat_encoded = np.asarray(X_cat_encoded, dtype=float) * median_std_

        X_encoded = np.hstack((X_continuous, X_cat_encoded))
        X_resampled = X_encoded.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in sampling_strategy.items():
            if n_samples == 0:
                continue
            X_class = X_encoded[y == class_sample]

            k_neighbors = getattr(self, 'k_neighbors', 5)
            if isinstance(k_neighbors, Integral):
                if len(X_class) < 2:
                    raise ValueError(
                        'class {!r} has {} sample(s); at least 2 are needed to '
                        'interpolate'.format(class_sample, len(X_class))
                    )
                # Never ask for more neighbours than the class can provide.
                k_neighbors = min(k_neighbors, len(X_class) - 1)
            nn_k_ = self.chk_neighbors(k_neighbors, 1)
            nn_k_.fit(X_class)
            # Drop the first neighbour: it is the query point itself.
            nns = nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]

            X_new, y_new = self.make_samples(X_class, y.dtype, class_sample, X_class, nns, n_samples, continuous_features_, 1.0)
            X_resampled = np.vstack((X_resampled, X_new))
            y_resampled = np.hstack((y_resampled, y_new))

        # Decode into an object array so that non-numeric levels fit, and match
        # against a snapshot of the encoded column so an already decoded level
        # can never be mistaken for another level's encoding.
        X_decoded = X_resampled.astype(object)
        for offset, encoded_dict in enumerate(encoded_dict_list):
            col = n_continuous + offset
            encoded_col = np.round(X_resampled[:, col].astype(float), 4)
            pending = np.ones(len(encoded_col), dtype=bool)
            for key, value in encoded_dict.items():
                hit = pending & (encoded_col == np.round(value * median_std_, 4))
                X_decoded[hit, col] = key
                pending &= ~hit

        # Rows whose level was missing were encoded with a filler; restore the
        # original (missing) value for those original rows.
        for key, value in nan_dict.items():
            for item in value:
                X_decoded[item, n_continuous + key] = X_categorical[item, key]

        if categorical_features_.size == 0 or np.issubdtype(X_categorical.dtype, np.number):
            X_decoded = X_decoded.astype(float)

        # Columns are currently [continuous..., categorical...]; put them back
        # in the order they had in X.
        indices_reordered = np.argsort(np.hstack((continuous_features_, categorical_features_)))
        X_resampled = X_decoded[:, indices_reordered]
        return X_resampled, y_resampled


In [34]:
# from SMOTE_ENC import mSMOTENC

# Scratch experiment: run mSMOTENC on the first fold of one dataset, with a
# constant dummy column appended as the single categorical feature.
# Relies on `glob`, `time`, `pd`, `np`, `base_path` and `mSMOTENC` already
# being defined in the kernel.
dataset = 'yeast4-5-fold'

# Per-fold accumulators; only used by the (commented-out) evaluation below.
mcc = []
f1 = []
auc_a = []
gmean = []
times = []
y_preds = []
precisions = []
recalls = []
traFiles = sorted(glob.glob(base_path+dataset+'/*tra.xlsx'))
tstFiles = sorted(glob.glob(base_path+dataset+'/*tst.xlsx'))
for traPath, tstPath in zip(traFiles, tstFiles):
    print(traPath)

    df_train = pd.read_excel(traPath)
    df_test = pd.read_excel(tstPath)

    # First column is skipped, last column is the label.
    # .copy() so that adding 'fake_cat1' below writes to an independent frame
    # rather than to a slice of df_train (avoids SettingWithCopyWarning).
    x_train = df_train.iloc[:, 1:-1].copy()
    y_train = df_train.iloc[:, -1]
    x_test= df_test.iloc[:, 1:-1]
    y_test = df_test.iloc[:, -1]

    #####
    # for some dataset get error Unknown label type: 'unknown'
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')

    x_train['fake_cat1'] = 1
    print(type(x_train), x_train.shape[1], x_train)
    
    st = time.time()
    
    # The dummy column is the last one, so it is the only categorical index.
    msmotenc = mSMOTENC(categorical_features=[x_train.shape[1]-1])
    X_resampled, y_resampled = msmotenc.fit_resample(x_train, y_train)
    print(X_resampled.shape, [np.unique(X_resampled[:,i]) for i in range(x_train.shape[1]) ])
    # Only the first fold is inspected.
    break
    
    
#     clf = classifiers[classifier]
        
#     clf.fit(x_train, y_train)

#     y_pred = clf.predict(x_test) 
#     et = time.time()
#     y_preds.append(y_pred)
#     # compute error
#     mcc.append(matthews_corrcoef(y_test, y_pred))
#     #--------------------------------
#     fpr, tpr, thresholds = roc_curve(y_test, y_pred)
#     auc_a.append(auc(fpr, tpr))
#     #--------------------------------
#     f1.append(f1_score(y_test, y_pred))
#     #--------------------------------
#     gmean.append(geometric_mean_score(y_test, y_pred, labels=[1, -1]))

#     #time of train and test
#     times.append(et - st)

#     precisions.append(precision_score(y_test, y_pred))
#     recalls.append(precision_score(y_test, y_pred))

#     {"precision": precisions, "recall": recalls, "mcc": mcc, "auc": auc_a, "f1": f1, "gmean": gmean, "exe_time": times, "y_pred": y_pred}


Datasets/yeast4-5-fold/yeast4-5-1tra.xlsx
<class 'pandas.core.frame.DataFrame'> 8        Gvh   Alm   Mit  Erl  Pox   Vac   Nuc  fake_cat1
0     0.61  0.47  0.13  0.5  0.0  0.48  0.22          1
1     0.67  0.48  0.27  0.5  0.0  0.53  0.22          1
2     0.62  0.49  0.15  0.5  0.0  0.53  0.22          1
3     0.44  0.48  0.54  0.5  0.0  0.48  0.22          1
4     0.54  0.48  0.65  0.5  0.0  0.53  0.22          1
...    ...   ...   ...  ...  ...   ...   ...        ...
1182  0.40  0.66  0.35  0.5  0.0  0.43  0.11          1
1183  0.62  0.43  0.17  0.5  0.0  0.53  0.22          1
1184  0.43  0.61  0.40  0.5  0.0  0.48  0.47          1
1185  0.40  0.60  0.16  0.5  0.0  0.53  0.39          1
1186  0.54  0.54  0.13  0.5  0.0  0.53  0.22          1

[1187 rows x 8 columns]
(2294, 8) [array([0.13, 0.14, 0.16, ..., 0.9 , 0.92, 0.94]), array([0.21, 0.22, 0.24, ..., 0.75, 0.79, 1.  ]), array([0.        , 0.04      , 0.04309743, ..., 0.86      , 0.87      ,
       1.        ]), array([0.5       

In [1]:
import re

def read_dot_dat_file(path):
    """Load a two-class KEEL-style ``.dat`` file into a DataFrame.

    The header is expected to start with ``@relation`` and to end with the
    class ``@attribute`` line, ``@inputs``, ``@outputs`` and ``@data``, in
    that order.  Column names are the whitespace-separated tokens of the
    ``@inputs`` line followed by the ``@outputs`` name.  The first class
    label listed in the header is encoded as 1 and the second as -1; any
    value that is neither label is left untouched.

    Parameters
    ----------
    path : str
        Location of the ``.dat`` file.

    Returns
    -------
    pandas.DataFrame
        One row per data line.  The output column has an integer dtype when
        every value matched one of the two declared labels.

    Raises
    ------
    ValueError
        If ``@data`` is missing, the header layout is not the expected one,
        or the class attribute does not declare exactly two labels.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(path) as handle:
        datContent = [line.strip().split() for line in handle]

    _at_data = datContent.index(['@data'])
    if _at_data < 4:
        raise ValueError('header of {!r} is too short to be a KEEL .dat file'.format(path))
    if datContent[0][:1] != ['@relation']:
        raise ValueError('{!r} does not start with @relation'.format(path))
    if datContent[_at_data-1][:1] != ['@outputs']:
        raise ValueError('{!r}: expected @outputs right before @data'.format(path))
    if datContent[_at_data-2][:1] != ['@inputs']:
        raise ValueError('{!r}: expected @inputs right before @outputs'.format(path))

    # The class attribute is the line just before @inputs.  Joining its tokens
    # first makes "{a, b}" and "{a,b}" parse the same way.
    label_spec = ' '.join(datContent[_at_data-3][2:]).strip().strip('{}')
    labels = [label.strip() for label in label_spec.split(',') if label.strip()]
    if len(labels) != 2:   # Two Class
        raise ValueError(
            '{!r}: expected exactly two class labels, found {}'.format(path, labels)
        )
    class1, class2 = labels

    # NOTE(review): the @inputs tokens are used verbatim, so names keep the
    # trailing comma from the header (e.g. 'Sex,'); kept for compatibility.
    col_names = datContent[_at_data-2][1:]
    target_name = datContent[_at_data-1][1]
    col_names.append(target_name)

    df = pd.read_csv(path, skiprows=_at_data+1, names=col_names, sep=r'\s*,\s*', engine='python')

    # map() instead of replace(): the result is numeric rather than an object
    # column holding ints, which downstream estimators reject as label type.
    raw_labels = df[target_name].astype(str).str.strip()
    encoded = raw_labels.map({class1: 1, class2: -1})
    if encoded.notna().all():
        df[target_name] = encoded.astype(int)
    else:
        # Leave unexpected values as they were instead of dropping them.
        df[target_name] = encoded.where(encoded.notna(), df[target_name])

    return df



# temp

In [4]:
# import pandas as pd 
# import numpy as np
# import os

# base_path = 'Datasets/'
# # need_to_convert = ['wisconsin-5-fold','new-thyroid2-5-fold','new-thyroid1-5-fold']
# # need_to_convert = ['yeast-1_vs_7-5-fold', 'led7digit-0-2-4-5-6-7-8-9_vs_1-5-fold', 'ecoli-0-3-4-6_vs_5-5-fold', 'abalone19-5-', 'abalone19-5-fold']
# # need_to_convert = ['ecoli-0-1_vs_5-5-fold','ecoli-0-1-4-7_vs_5-6-5-fold','ecoli-0-3-4-7_vs_5-6-5-fold','glass-0-1-4-6_vs_2-5-fold', 'ecoli-0-4-6_vs_5-5-fold',
# #                   'yeast-0-3-5-9_vs_7-8-5-fold','yeast-1-2-8-9_vs_7-5-fold']
# need_to_convert = ['yeast-1-2-8-9_vs_7-5-fold']
# for needed in need_to_convert:
#     for datFile in glob.glob(base_path+needed+'/*.dat'):
#         print(datFile)
#         df = read_dot_dat_file(datFile)
#         df.to_excel(base_path+needed+'/'+datFile.split('/')[-1].split('.')[0]+'.xlsx')


In [2]:
import time
import os
import glob
import numpy as np
import random  
import pandas as pd 
import random

from sklearn.tree import DecisionTreeClassifier 

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc,f1_score, matthews_corrcoef, precision_score, recall_score
from imblearn.metrics import geometric_mean_score


from imblearn.ensemble import RUSBoostClassifier
from imbalanced_ensemble.ensemble import SMOTEBoostClassifier
from maatpy.classifiers import AdaCost


In [2]:
# ## Apply the random-forest classifier along with the NEW SMOTE method
# from sklearn.model_selection import GridSearchCV 
# from sklearn.metrics import classification_report
# from sklearn.model_selection import StratifiedKFold
# from imblearn.pipeline import Pipeline, make_pipeline
# from sklearn.preprocessing import LabelEncoder




# # samp_pipeline = make_pipeline(mSMOTENC(categorical_features = []), 
# #                               DecisionTreeClassifier(max_depth=10))
# # # check model performance on different values of hyper-parameters.
# # grid_search = GridSearchCV(samp_pipeline, param_grid=param_grid, cv=kfold, scoring='balanced_accuracy',
# #                         return_train_score=True, n_jobs = -1, verbose = 2)
# # grid_search.fit(X_train, y_train)
# # best_grid = grid_search.best_estimator_
# # best_grid

# mSMOTENC

In [3]:

# Root folder holding one sub-folder per dataset; every entry found there is
# treated as a dataset name.
base_path = 'Datasets/'
datasets  = os.listdir(base_path)

# Registry of the boosting classifiers under comparison, keyed by the short
# name used for result folders.  All of them boost depth-10 decision trees.
classifiers = {
    "RUS": RUSBoostClassifier(
        random_state=0,
        algorithm='SAMME',
        base_estimator=DecisionTreeClassifier(max_depth=10),
    ),
    "SMOTE": SMOTEBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=10),
        n_estimators = 100,
        algorithm='SAMME',
        random_state=0,
    ),
    "Ada1": AdaCost(
        base_estimator=DecisionTreeClassifier(max_depth=10),
        n_estimators=100,
        algorithm='adac1',
        random_state=0,
    ),
    "AdaCost": AdaCost(
        base_estimator=DecisionTreeClassifier(max_depth=10),
        n_estimators=100,
        algorithm='adacost',
        random_state=0,
    ),
}
            

def clasify(dataset, classifier):
    """Train and evaluate one registered classifier on every fold of a dataset.

    Fold files are discovered as ``<base_path><dataset>/*tra.xlsx`` and
    ``*tst.xlsx`` and paired in sorted order.  In each file the first column
    is skipped and the last column is used as the label.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder under ``base_path``.
    classifier : str
        Key into the module-level ``classifiers`` registry.

    Returns
    -------
    dict
        Per-fold lists under ``"precision"``, ``"recall"``, ``"mcc"``,
        ``"auc"``, ``"f1"``, ``"gmean"`` and ``"exe_time"`` (fit + predict
        seconds); ``"y_pred"`` holds the predictions of the last fold and
        ``"y_preds"`` the predictions of every fold.

    Raises
    ------
    FileNotFoundError
        If no fold files are found for ``dataset``.
    ValueError
        If the number of train and test files differ.
    """
    mcc = []
    f1 = []
    auc_a = []
    gmean = []
    times = []
    y_preds = []
    precisions = []
    recalls = []
    traFiles = sorted(glob.glob(base_path+dataset+'/*tra.xlsx'))
    tstFiles = sorted(glob.glob(base_path+dataset+'/*tst.xlsx'))
    if not traFiles:
        raise FileNotFoundError('no *tra.xlsx fold files found under ' + base_path + dataset)
    if len(traFiles) != len(tstFiles):
        # zip() would silently drop folds and could pair unrelated files.
        raise ValueError(
            'found {} train but {} test fold files under {}'.format(
                len(traFiles), len(tstFiles), base_path + dataset)
        )

    for traPath, tstPath in zip(traFiles, tstFiles):
        print(traPath)

        df_train = pd.read_excel(traPath)
        df_test = pd.read_excel(tstPath)

        x_train= df_train.iloc[:, 1:-1]
        y_train = df_train.iloc[:, -1]
        x_test= df_test.iloc[:, 1:-1]
        y_test = df_test.iloc[:, -1]

        #####
        # for some dataset get error Unknown label type: 'unknown'
        y_train = y_train.astype('int')
        y_test = y_test.astype('int')

        st = time.time()
        # The registry instance is reused; fit() below retrains it for this fold.
        clf = classifiers[classifier]

        clf.fit(x_train, y_train)

        y_pred = clf.predict(x_test)
        et = time.time()
        y_preds.append(y_pred)
        # compute error
        mcc.append(matthews_corrcoef(y_test, y_pred))
        #--------------------------------
        # ROC is computed from hard predictions, not scores.
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        auc_a.append(auc(fpr, tpr))
        #--------------------------------
        f1.append(f1_score(y_test, y_pred))
        #--------------------------------
        gmean.append(geometric_mean_score(y_test, y_pred, labels=[1, -1]))

        #time of train and test
        times.append(et - st)

        precisions.append(precision_score(y_test, y_pred))
        # Was precision_score, which made "recall" a copy of "precision".
        recalls.append(recall_score(y_test, y_pred))

    return {"precision": precisions, "recall": recalls, "mcc": mcc, "auc": auc_a, "f1": f1, "gmean": gmean,
            "exe_time": times, "y_pred": y_pred, "y_preds": y_preds}

# aaaaaa =  clasify(datasets[0], 'Ada1')
# aaaaaa =  clasify('yeast-1-2-8-9_vs_7-5-fold', 'RUS')
# aaaaaa
# clasify('yeast4-5-fold', 'OUBoost')

In [8]:

# Scratch experiment: resample the first fold of one dataset with mSMOTENC
# configured without any categorical feature, and print the result.
dataset = 'yeast4-5-fold'

# Per-fold accumulators (not filled in by this cell).
mcc, f1, auc_a, gmean = [], [], [], []
times, y_preds, precisions, recalls = [], [], [], []

traFiles = sorted(glob.glob(f'{base_path}{dataset}/*tra.xlsx'))
tstFiles = sorted(glob.glob(f'{base_path}{dataset}/*tst.xlsx'))

for traPath, tstPath in zip(traFiles, tstFiles):
    print(traPath)

    df_train = pd.read_excel(traPath)
    df_test = pd.read_excel(tstPath)

    # Skip the first column; the last column is the label.
    # Labels are cast to int because some datasets otherwise fail with
    # "Unknown label type: 'unknown'".
    x_train = df_train.iloc[:, 1:-1]
    y_train = df_train.iloc[:, -1].astype('int')
    x_test = df_test.iloc[:, 1:-1]
    y_test = df_test.iloc[:, -1].astype('int')

    st = time.time()
    msmotenc = mSMOTENC([])
    print(msmotenc.fit_resample(x_train, y_train))

    # Stop after the first fold.
    break
    
    
#     clf = classifiers[classifier]
        
#     clf.fit(x_train, y_train)

#     y_pred = clf.predict(x_test) 
#     et = time.time()
#     y_preds.append(y_pred)
#     # compute error
#     mcc.append(matthews_corrcoef(y_test, y_pred))
#     #--------------------------------
#     fpr, tpr, thresholds = roc_curve(y_test, y_pred)
#     auc_a.append(auc(fpr, tpr))
#     #--------------------------------
#     f1.append(f1_score(y_test, y_pred))
#     #--------------------------------
#     gmean.append(geometric_mean_score(y_test, y_pred, labels=[1, -1]))

#     #time of train and test
#     times.append(et - st)

#     precisions.append(precision_score(y_test, y_pred))
#     recalls.append(precision_score(y_test, y_pred))

#     {"precision": precisions, "recall": recalls, "mcc": mcc, "auc": auc_a, "f1": f1, "gmean": gmean, "exe_time": times, "y_pred": y_pred}


Datasets/yeast4-5-fold/yeast4-5-1tra.xlsx


NameError: name 'check_array' is not defined

In [89]:
# # #temp 
# for i in range(len(datasets)):
#     if not os.path.isdir('./Results/'+datasets[i]):
#         os.mkdir('./Results/'+datasets[i])
#     for classifier in classifiers:
#         if not os.path.isdir('./Results/'+datasets[i]+'/'+classifier):
#             os.mkdir('./Results/'+datasets[i]+'/'+classifier)
            


In [5]:
def res_to_files(dataset, classifier, dict_res):
    """Persist the metrics of one (dataset, classifier) run as ``.npy`` files.

    Writes ``mcc``, ``auc``, ``f1``, ``gmean``, ``exe_time`` and ``y_pred``
    from ``dict_res`` to ``./Results/<dataset>/<classifier>/<key>.npy``,
    creating the folder when it does not exist yet.

    Parameters
    ----------
    dataset : str
        Dataset name, used as the first folder level.
    classifier : str
        Classifier name, used as the second folder level.
    dict_res : dict
        Must contain the six keys listed above.

    Raises
    ------
    KeyError
        If any of the six keys is missing; nothing is written in that case.
    """
    saved_keys = ('mcc', 'auc', 'f1', 'gmean', 'exe_time', 'y_pred')
    missing = [key for key in saved_keys if key not in dict_res]
    if missing:
        raise KeyError('dict_res is missing: ' + ', '.join(missing))

    out_dir = './Results/'+dataset+'/'+classifier
    # np.save does not create parent folders.
    os.makedirs(out_dir, exist_ok=True)
    for key in saved_keys:
        np.save(out_dir+'/'+key+'.npy', dict_res[key])
    

In [37]:
# datasets

In [1]:
# # for classifier in classifiers:
# #     for dataset in datasets:
# #         if not os.path.exists('./Results/'+dataset+'/'+classifier+'/y_pred.npy'):
# #             print(classifier," => ", dataset, "=>", end=' ')
# #             # tmp_res = clasify(dataset, classifier)
# #             # print(tmp_res['auc'])
# #             try:
# #                 tmp_res = clasify(dataset, classifier)
# #                 res_to_files(dataset, classifier, tmp_res)
# #             except:
# #                 print(dataset, classifier)

# classifier = 'Ada1'
# for dataset in datasets:
#     if not os.path.exists('./Results/'+dataset+'/'+classifier+'/y_pred.npy'):
#         print(classifier," => ", dataset, "=>", end=' ')
#         # try:
#         tmp_res = clasify(dataset, classifier)
#         res_to_files(dataset, classifier, tmp_res)
#         # except:
#             # print(dataset, classifier)

# # classifier = 'AdaCost'
# # dataset = 'abalone19-5-'
# # if not os.path.exists('./Results/'+dataset+'/'+classifier+'/y_pred.npy'):
# #     print(classifier," => ", dataset, "=>", end=' ')
# #     tmp_res = clasify(dataset, classifier)
# #     res_to_files(dataset, classifier, tmp_res)
# #     print(tmp_res['auc'])

        

In [None]:
# Collect every (dataset, classifier) pair whose result folder does not hold
# exactly six .npy files, i.e. runs that are missing or incomplete.
troubs = {}
for classifier in classifiers:
    for dataset in datasets:
        files = glob.glob(f'./Results/{dataset}/{classifier}/*.npy')
        if len(files) == 6:
            continue
        troubs.setdefault(dataset, []).append(classifier)
        print(dataset, classifier)
        
# print([dataset for dataset in troubs if len(troubs[dataset])==4])

In [12]:
# # convert categorical variables into numerical
# from sklearn.preprocessing import LabelEncoder
 
# le = LabelEncoder()
 
# # Using .fit_transform function to fit label
# # encoder and return encoded label

# base_path = 'Datasets/'
# need_to_convert = ['abalone19-5-', 'abalone19-5-fold']
# for needed in need_to_convert[1:]:
#     for datFile in glob.glob(base_path+needed+'/*.dat'):
#         print(datFile)
#         df = read_dot_dat_file(datFile)
#         label = le.fit_transform(df['Sex,'])        
#         df['Sex,'] = label
#         # df.to_excel(base_path+needed+'/'+datFile.split('/')[-1].split('.')[0]+'.xlsx')
#         break
        

In [11]:
# df = read_dot_dat_file(glob.glob(base_path+datasets[0]+'/*.dat')[0])
# df.to_excel('temp.xlsx')

In [10]:
# pd.read_excel('temp.xlsx').iloc[:, 0:-1]

In [None]:
# yeast-1-2-8-9_vs_7-5-fold RUS
# shuttle-c2-vs-c4-5-fold SMOTE
# ecoli-0-1-3-7_vs_2-6-5-fold SMOTE
# yeast4-5-fold Ada1
# abalone19-5- Ada1
# abalone19-5-fold Ada1
# abalone19-5- AdaCost
# abalone19-5-fold AdaCost