In [1]:
from os import listdir
from os.path import isfile, join
from pandas.core.common import SettingWithCopyWarning
import pandas as pd
import numpy as np
import sys
from scipy.io import arff
import models
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
 
path = '/home/zeyno/Desktop/Research_Topics/Datasets'

In [2]:
def preprocess_dataset(df):
    #num of columns with majority NaN values
    counter = 0
    for column in list(df.columns):
        #binarize the columns
        if str(df[column][1]).startswith('b'):
            unique = df[column].unique()
            df[column] = df[column].map({unique[0]: 0, unique[1]: 1})
        
        # eliminate columns with too many NaN values (optional)
        # if df[column].isna().sum()/len(df) > 0.85:
        #     print("% NaN values:" + str(df[column].isna().sum()/len(df)))
        #     df = df.drop([column])
    # df = df.dropna()
    return df

In [3]:
def target_type(df):
    df = df.dropna()
    last = df.columns[-1]
    target = df[last]
    diff = target.unique()
    if len(diff) <= 2:
        return "Binary"
    elif len(diff) > 2:
        return "Multinomial"
    return 0

In [4]:
def load_datasets(path):
    metadata_df = pd.DataFrame()
    num_cols = []
    num_rows = []
    datasets = {}
    names = [] #only the names of the datasets that are included 
    t_type = []
    dataset_names = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.arff')]
    for i in range(len(dataset_names)):
        try:
            data = arff.loadarff(path+'/'+dataset_names[i])
            df = pd.DataFrame(data[0])
            df = preprocess_dataset(df)
            name_df = dataset_names[i].replace(".arff", "")
            names.append(name_df)
            datasets[name_df] = df
            num_cols.append(df.shape[1])
            num_rows.append(df.shape[0])
            t_type.append(target_type(df))
        except:
            pass
    metadata_df["name_dataset"] = names
    metadata_df["num_columns"] = num_cols
    metadata_df["num_rows"] = num_rows
    metadata_df["target_type"] = t_type
    return datasets, metadata_df

In [5]:
datasets, metadata_df = load_datasets(path)
metadata_df

Unnamed: 0,name_dataset,num_columns,num_rows,target_type
0,dermatology,35,366,Binary
1,tic-tac-toe,10,958,Binary
2,iris,5,150,Binary
3,hayes-roth_test,5,28,Binary
4,contact-lenses,5,24,Binary
5,adult,15,48842,Binary
6,wisconsin,10,699,Binary
7,cmc,10,1473,Binary
8,hayes-roth_train,5,132,Binary
9,glass,10,214,Binary


In [6]:
datasets

{'dermatology':      erythema  scaling  definite_borders  itching  koebner_phenomenon  \
 0         0.0      0.0               0.0      0.0                 0.0   
 1         1.0      1.0               1.0      1.0                 1.0   
 2         0.0      NaN               NaN      0.0                 1.0   
 3         0.0      0.0               NaN      NaN                 0.0   
 4         0.0      1.0               NaN      1.0                 NaN   
 ..        ...      ...               ...      ...                 ...   
 361       0.0      NaN               NaN      NaN                 1.0   
 362       1.0      0.0               NaN      NaN                 1.0   
 363       1.0      0.0               NaN      1.0                 NaN   
 364       0.0      NaN               1.0      NaN                 NaN   
 365       1.0      0.0               NaN      NaN                 0.0   
 
      polygonal_papules  follicular_papules  oral_mucosal_involvement  \
 0                  0.

In [11]:
def run_algorithms(df, name):
    df = df.dropna()
    target = df.columns[-1]
    results = {}
    
    print("__________________________________________")
    print("START Dataset: ", name)
    print("__________________________________________")
    
    # # run sd
    # sd = models.sd(df, target)
    # print(sd.values())
    # print("__________________________________________")

    #cn2_sd
    print("CN2_SD")
    cn2_sd = models.cn2_sd(df, target).values()
    print("__________________________________________")
    results["cn2_sd"] = list(cn2_sd)

    #run sd_map
    print("SD_MAP")
    sd_map = models.sd_map(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["sd_map"] = list(sd_map)

    #run dssd
    print("DSSD")
    dssd = models.dssd(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["dssd"] = list(dssd)

    #run nmeef || havent added the evaluation metrics yet
    # print("NMEEF-SD")
    # nmeef = models.nmeef_sd(df, target)
    # print(nmeef)
    # print("__________________________________________")

    #run apriori
    print("Apriori-SD")
    a = models.apriori_sd(df, target, min_threshold = 0.1).values()
    results["a"] = list(a)
    print("__________________________________________")
    print("END Dataset: ", name)
    print("__________________________________________")

    return results


In [12]:
meta_2 = {}
for key in datasets:
    if key in ["dermatology", "labor", "ionosphere"]:
        continue
    print(key)
    results = run_algorithms(datasets[key], key)
    meta_2[key] = results

tic-tac-toe
__________________________________________
START Dataset:  tic-tac-toe
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
END Dataset:  tic-tac-toe
__________________________________________
iris
__________________________________________
START Dataset:  iris
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  iris
__________________________________________
hayes-roth_test
__________________________________________
START Dataset:  hayes-roth_test
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  hayes-roth_test
__________________________________________
contact-lenses
__________________________________________
START Dataset:  contact-lenses
______________

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  hayes-roth_train
__________________________________________
glass
__________________________________________
START Dataset:  glass
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  glass
__________________________________________
car
__________________________________________
START Dataset:  car
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
END Dataset:  car
__________________________________________
pima-indians
__________________________________________
START Dataset:  pima-indians
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  pima-indians
__________________________________________
abalone
__________________________________________
START Dataset:  abalone
__________________________________________
CN2_SD


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  abalone
__________________________________________
balance-scale
__________________________________________
START Dataset:  balance-scale
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
END Dataset:  balance-scale
__________________________________________
haberman
__________________________________________
START Dataset:  haberman
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  haberman
__________________________________________
zoo
__________________________________________
START Dataset:  zoo
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


__________________________________________
END Dataset:  zoo
__________________________________________
credit-a
__________________________________________
START Dataset:  credit-a
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  credit-a
__________________________________________


In [13]:
# Dictionary that contains dictionaries per dataset, and the scores per algorithm.
# For each algorithm result, the list represents the scores in these evaluation metrics:
# Quality, Coverage, Support, number of subgroups, average length of subgroups (i.e. hte number of rules used to represent a subgroup, on average)
meta_2

{'tic-tac-toe': {'cn2_sd': [-0.9204798598230077, 0.8632478632478633, 3, 1.0],
  'sd_map': [-0.01905640798829651,
   8.263157894736842,
   0.21187584345479082,
   19,
   0],
  'dssd': [-0.028846153846153806,
   5.230769230769231,
   0.1341222879684418,
   13,
   0],
  'a': [0, nan, nan, 0, nan]},
 'iris': {'cn2_sd': [-0.25, 0.7375, 4, 2.0],
  'sd_map': [0.35259375545089827,
   43.77777777777778,
   0.43777777777777777,
   9,
   0],
  'dssd': [0.4450549450549451, 42.142857142857146, 0.4214285714285714, 7, 0],
  'a': [0, 0.9854227405247814, 0.4657142857142857, 7, 1.7142857142857142]},
 'hayes-roth_test': {'cn2_sd': [-0.9586530524674854, 1.0, 2, 1.0],
  'sd_map': [-0.09259259259259257, 6.0, 0.22222222222222224, 6, 0],
  'dssd': [-0.09259259259259257, 6.0, 0.22222222222222224, 6, 0],
  'a': [0, 0.4444444444444444, 0.14814814814814814, 3, 1.0]},
 'contact-lenses': {'cn2_sd': [-0.8904916402194913, 1.0, 1, 1.0],
  'sd_map': [0.09230769230769231, 5.0, 0.38461538461538464, 2, 0],
  'dssd': [0.09