In [1]:
from os import listdir
from os.path import isfile, join
from pandas.core.common import SettingWithCopyWarning
import pandas as pd
import numpy as np
import sys
from scipy.io import arff
import models
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

path = '/home/zeyno/Desktop/Research_Topics/Datasets'

In [2]:
def preprocess_dataset(df):
    #num of columns with majority NaN values
    counter = 0
    for column in list(df.columns):
        #binarize the columns
        if str(df[column][1]).startswith('b'):
            unique = df[column].unique()
            df[column] = df[column].map({unique[0]: 0, unique[1]: 1})

        # eliminate columns with too many NaN values (optional)
        # if df[column].isna().sum()/len(df) > 0.85:
        #     print("% NaN values:" + str(df[column].isna().sum()/len(df)))
        #     df = df.drop([column])
    # df = df.dropna()
    return df

In [3]:
def target_type(df):
    df = df.dropna()
    last = df.columns[-1]
    target = df[last]
    diff = target.unique()
    if len(diff) <= 2:
        return "Binary"
    elif len(diff) > 2:
        return "Multinomial"
    return 0

In [4]:
def load_datasets(path):
    metadata_df = pd.DataFrame()
    num_cols = []
    num_rows = []
    datasets = {}
    names = [] #only the names of the datasets that are included 
    t_type = []
    dataset_names = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.arff')]
    for i in range(len(dataset_names)):
        try:
            data = arff.loadarff(path+'/'+dataset_names[i])
            df = pd.DataFrame(data[0])
            df = preprocess_dataset(df)
            name_df = dataset_names[i].replace(".arff", "")
            names.append(name_df)
            datasets[name_df] = df
            num_cols.append(df.shape[1])
            num_rows.append(df.shape[0])
            t_type.append(target_type(df))
        except:
            pass
    metadata_df["name_dataset"] = names
    metadata_df["num_columns"] = num_cols
    metadata_df["num_rows"] = num_rows
    metadata_df["target_type"] = t_type
    return datasets, metadata_df

In [5]:
datasets, metadata_df = load_datasets(path)
# metadata_df

In [8]:
models.nmeef_sd(datasets["tic-tac-toe"], datasets["tic-tac-toe"].columns[-1])

{'Average Quality': -0.3465553235908145,
 'Average Coverage': 0.017620041753653445,
 'Average Support': 0.0,
 'WRAcc': -0.006106319271621029,
 'Significance': 0.09156774731314314,
 'Confidence': 0.0,
 'Number of Subgroups': 50,
 'Average Length of Subgroups': 1.4}

In [6]:
def run_algorithms(df, name):
    target = df.columns[-1]
    results = {}
    
    print("__________________________________________")
    print("START Dataset: ", name)
    print("__________________________________________")
    
    # # run sd
    # sd = models.sd(df, target)
    # print(sd.values())
    # print("__________________________________________")

    #cn2_sd
    print("CN2_SD")
    cn2_sd = models.cn2_sd(df, target).values()
    print("__________________________________________")
    results["cn2_sd"] = list(cn2_sd)

    #run sd_map
    print("SD_MAP")
    sd_map = models.sd_map(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["sd_map"] = list(sd_map)

    #run dssd
    print("DSSD")
    dssd = models.dssd(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["dssd"] = list(dssd)

    #run nmeef
    print("NMEEF-SD")
    nmeef = models.nmeef_sd(df, target).values()
    print("__________________________________________")
    results["nnmeef"] = list(nmeef)

    #run apriori
    print("Apriori-SD")
    a = models.apriori_sd(df, target, min_threshold = 0.1).values()
    results["a"] = list(a)
    print("__________________________________________")
    print("END Dataset: ", name)
    print("__________________________________________")

    return results


In [7]:
meta_2 = {}
for key in datasets:
    if key in ["dermatology", "labor", "ionosphere", "adult"]:
        continue
    print(key)
    results = run_algorithms(datasets[key], key)
    meta_2[key] = results

tic-tac-toe
__________________________________________
START Dataset:  tic-tac-toe
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
NMEEF-SD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  tic-tac-toe
__________________________________________
iris
__________________________________________
START Dataset:  iris
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
NMEEF-SD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  iris
__________________________________________
hayes-roth_test
__________________________________________
START Dataset:  hayes-roth_test
________________________________

In [8]:
# Dictionary that contains dictionaries per dataset, and the scores per algorithm.
# For each algorithm result, the list represents the scores in these evaluation metrics:
# Quality, Coverage, Support, number of subgroups, average length of subgroups (i.e. hte number of rules used to represent a subgroup, on average)
meta_2

# Convert the nested dictionary into a DataFrame
df = pd.DataFrame.from_dict(meta_2, orient='index')
df = df.stack().apply(pd.Series).reset_index()

# Rename columns for clarity and reordering
df.columns = ['Dataset', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']

# Display the resulting DataFrame
print(df[['Dataset', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']])

        Dataset Algorithm       Quality    Coverage   Support  # of Subgroups  \
0   tic-tac-toe    cn2_sd -5.284570e-01  478.130435  0.499092            46.0   
1   tic-tac-toe    sd_map  8.627040e-02  199.928571  0.208694            14.0   
2   tic-tac-toe      dssd  1.134420e-01  126.000000  0.131524             9.0   
3   tic-tac-toe    nnmeef  5.334824e-07   22.040000  0.023006            50.0   
4   tic-tac-toe         a           NaN    0.383917  0.133612             5.0   
..          ...       ...           ...         ...       ...             ...   
70     credit-a    cn2_sd -2.165685e-01  387.028571  0.560911            35.0   
71     credit-a    sd_map -9.109338e-02  126.130841  0.182798           107.0   
72     credit-a      dssd -8.965963e-02   93.380282  0.135334            71.0   
73     credit-a    nnmeef  2.373177e-04    1.800000  0.002609            50.0   
74     credit-a         a           NaN         NaN       NaN             0.0   

    Length of Rules  
0    

In [9]:
df.to_csv("metadata.csv", index=False)