In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import sys
from scipy.io import arff
import models
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
 
path = '/Users/davidtigau/Documents/Courses/Research Topics in Data Mining/Assignments/Research Paper/datasets'
dataset_names = [f for f in listdir(path) if isfile(join(path, f))]

In [2]:
def preprocess_dataset(df):
    for column in list(df.columns):
        if str(df[column][1]).startswith('b'):
            unique = df[column].unique()
            df[column] = df[column].map({unique[0]: 0, unique[1]: 1})
    df = df.dropna()
    return df

In [3]:
target_columns = ["class", "Class", "class", "class", "contact-lenses", "Class", "class", "Contraceptive_method_used", "Class_Rings", "Type", "class", "class", "class", "class", "Class_Rings", "Class", "Survival_status", "type", "class"]

In [4]:
def load_datasets(path):

    datasets = []
    dataset_names = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.arff')]
    for i in range(len(dataset_names)):
        try:
            print(dataset_names[i])
            data = arff.loadarff(path+'/'+dataset_names[i])
            df = pd.DataFrame(data[0])
            df = preprocess_dataset(df)
            datasets.append(df)
        except:
            pass
    return datasets

In [5]:
datasets = load_datasets(path)

abalone.arff
credit-a.arff
soybean.arff
ionosphere.arff


In [6]:
def run_algorithms(df, i):
    df = df.dropna()
    target = df.columns[-1]
    
    print("__________________________________________")
    print("START Dataset: ", dataset_names[i])
    print("__________________________________________")
    
    #run sd
    # sd = models.sd(df, target)
    # print(sd.values())

    #cn2_sd
    print("CN2_SD")
    cn2_sd = models.cn2_sd(df, target)
    print(cn2_sd.values())
    print("__________________________________________")

    #run sd_map
    print("SD_MAP")
    sd_map = models.sd_map(df, target, min_support=0.1)
    print(sd_map)
    print("__________________________________________")

    #run dssd
    print("DSSD")
    dssd = models.dssd(df, target, min_support=0.1)
    print(dssd)
    print("__________________________________________")

    #run nmeef
    print("NMEEF-SD")
    nmeef = models.nmeef_sd(df, target)
    print(nmeef)
    print("__________________________________________")

    #run apriori
    print("Apriori-SD")
    a = models.apriori_sd(df, target)
    print(a)
    print("__________________________________________")
    print("END Dataset: ", dataset_names[i])
    print("__________________________________________")


In [7]:
for i in range(1, len(datasets)):
    run_algorithms(datasets[i], i)

__________________________________________
START Dataset:  abalone.arff
__________________________________________
CN2_SD
dict_values([-0.2137188937705454, 0.6628151260504203, 7, 3.0])
__________________________________________
SD_MAP
      support            itemsets   quality
53   0.102941           (A15, A9)  0.595588
17   0.125000            (A9, A1)  0.595588
7    0.323529                (A9)  0.550134
26   0.125000            (A9, A2)  0.536765
51   0.117647           (A9, A12)  0.533088
..        ...                 ...       ...
89   0.117647        (A8, A7, A3) -0.341912
67   0.117647       (A8, A11, A1) -0.341912
77   0.117647        (A8, A7, A2) -0.341912
113  0.117647  (A8, A15, A11, A3) -0.341912
106  0.125000      (A8, A15, A12) -0.345588

[114 rows x 3 columns]
__________________________________________
DSSD
{'Average Quality': -0.07407547571481456, 'Average Coverage': 18.666666666666668, 'Average Support': 0.13725490196078433, 'Number of Subgroups': 72, 'Average Length 

KeyboardInterrupt: 

In [None]:
target = datasets[1].columns[-1]
models.nmeef_sd(datasets[1], target)

In [None]:
# df_name = []
# lens = []
# wids = []
# meta_df = []
# for i in range(len(datasets)):
#     try:
#         data = arff.loadarff(path+'/'+dataset_names[i])
#         df = pd.DataFrame(data[0])
#         df = preprocess_dataset(df)
#         datasets.append(df)
#         df_name.append(dataset_names[i].replace(".arff", ''))
#         lens.append(df.shape[0])
#         wids.append(df.shape[1])
#         # print(df)
#         run_algorithms(df, i)
#     except:
#         problematic.append(dataset_names[i])

# feature_df = pd.DataFrame({'df_name':df_name, 'length':lens, 'width':wids, 'target_column':target_columns})