In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
# from pandas.core.common import SettingWithCopyWarning
import numpy as np
import sys
from scipy.io import arff
import models
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

path = '/Users/davidtigau/Documents/Courses/Research Topics in Data Mining/Assignments/Research Paper/datasets'

In [2]:
def preprocess_dataset(df):
    #num of columns with majority NaN values
    counter = 0
    for column in list(df.columns):
        #binarize the columns
        if str(df[column][1]).startswith('b'):
            unique = df[column].unique()
            df[column] = df[column].map({unique[0]: 0, unique[1]: 1})

        # eliminate columns with too many NaN values (optional)
        # if df[column].isna().sum()/len(df) > 0.85:
        #     print("% NaN values:" + str(df[column].isna().sum()/len(df)))
        #     df = df.drop([column])
    # df = df.dropna()
    return df

In [3]:
def target_type(df):
    df = df.dropna()
    last = df.columns[-1]
    target = df[last]
    diff = target.unique()
    if len(diff) <= 2:
        return "Binary"
    elif len(diff) > 2:
        return "Multinomial"
    return 0

In [4]:
def load_datasets(path):
    metadata_df = pd.DataFrame()
    num_cols = []
    num_rows = []
    datasets = {}
    names = [] #only the names of the datasets that are included 
    t_type = []
    dataset_names = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.arff')]
    for i in range(len(dataset_names)):
        try:
            data = arff.loadarff(path+'/'+dataset_names[i])
            df = pd.DataFrame(data[0])
            df = preprocess_dataset(df)
            name_df = dataset_names[i].replace(".arff", "")
            names.append(name_df)
            datasets[name_df] = df
            num_cols.append(df.shape[1])
            num_rows.append(df.shape[0])
            t_type.append(target_type(df))
        except:
            pass
    metadata_df["name_dataset"] = names
    metadata_df["num_columns"] = num_cols
    metadata_df["num_rows"] = num_rows
    metadata_df["target_type"] = t_type
    return datasets, metadata_df

In [5]:
datasets, metadata_df = load_datasets(path)
# metadata_df

In [6]:
models.nmeef_sd(datasets["tic-tac-toe"], datasets["tic-tac-toe"].columns[-1])

{'Average Quality': -0.34655532359081415,
 'Average Coverage': 0.0010438413361169101,
 'Average Support': 0.0,
 'WRAcc': -0.00036174877201546366,
 'Significance': 0.5483384661792094,
 'Confidence': 0.0,
 'Number of Subgroups': 50,
 'Average Length of Subgroups': 0.4}

In [7]:
def run_algorithms(df, name):
    target = df.columns[-1]
    results = {}
    
    print("__________________________________________")
    print("START Dataset: ", name)
    print("__________________________________________")
    
    # # run sd
    # sd = models.sd(df, target)
    # print(sd.values())
    # print("__________________________________________")

    #cn2_sd
    print("CN2_SD")
    cn2_sd = models.cn2_sd(df, target).values()
    print("__________________________________________")
    results["cn2_sd"] = list(cn2_sd)

    #run sd_map
    print("SD_MAP")
    sd_map = models.sd_map(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["sd_map"] = list(sd_map)

    #run dssd
    print("DSSD")
    dssd = models.dssd(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["dssd"] = list(dssd)

    #run nmeef
    print("NMEEF-SD")
    nmeef = models.nmeef_sd(df, target).values()
    print("__________________________________________")
    results["nnmeef"] = list(nmeef)

    #run apriori
    print("Apriori-SD")
    a = models.apriori_sd(df, target, min_threshold = 0.1).values()
    results["a"] = list(a)
    print("__________________________________________")
    print("END Dataset: ", name)
    print("__________________________________________")

    return results


In [8]:
meta_2 = {}
for key in datasets:
    if key in ["dermatology", "labor", "ionosphere", "adult"]:
        continue
    print(key)
    results = run_algorithms(datasets[key], key)
    meta_2[key] = results

hayes-roth_test
__________________________________________
START Dataset:  hayes-roth_test
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
NMEEF-SD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  hayes-roth_test
__________________________________________
abalone
__________________________________________
START Dataset:  abalone
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
NMEEF-SD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  abalone
__________________________________________
haberman
__________________________________________
START Dataset:  haberman
_________________________

In [9]:
# Dictionary that contains dictionaries per dataset, and the scores per algorithm.
# For each algorithm result, the list represents the scores in these evaluation metrics:
# Quality, Coverage, Support, number of subgroups, average length of subgroups (i.e. hte number of rules used to represent a subgroup, on average)
meta_2

# Convert the nested dictionary into a DataFrame
df = pd.DataFrame.from_dict(meta_2, orient='index')
df = df.stack().apply(pd.Series).reset_index()

# Rename columns for clarity and reordering
df.columns = ['Dataset', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']

# Display the resulting DataFrame
print(df[['Dataset', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']])

             Dataset Algorithm   Quality  Coverage   Support     WRAcc  \
0    hayes-roth_test    cn2_sd -0.958653  0.660714  0.250000 -0.056760   
1    hayes-roth_test    sd_map -0.092593  0.250000  0.089286 -0.010204   
2    hayes-roth_test      dssd -0.092593  0.250000  0.089286 -0.010204   
3    hayes-roth_test    nnmeef -0.236481  0.141429  0.035000 -0.030332   
4    hayes-roth_test         a       NaN  0.400000  0.142857       NaN   
..               ...       ...       ...       ...       ...       ...   
70  hayes-roth_train    cn2_sd -0.652623  0.257576  0.101010  0.001492   
71  hayes-roth_train    sd_map  0.000000  0.265152  0.077652  0.017648   
72  hayes-roth_train      dssd  0.000000  0.265152  0.077652  0.017648   
73  hayes-roth_train    nnmeef -0.157143  0.041667  0.009091 -0.000861   
74  hayes-roth_train         a       NaN  0.386364  0.128788       NaN   

    Significance  Confidence  # of Subgroups  Length of Rules  
0       0.000213    0.296296             2.0   

In [10]:
df.to_csv("metadata.csv", index=False)

In [19]:


# --- DATA LOADING AND PREPARATION ---

# Load the metadata
metadata_df = pd.read_csv("metadata.csv")

# Convert continuous columns 'Quality' and 'WRAcc' into discrete bins
def bin_columns(df):
    columns_to_bin = {
        'Quality': 'Binned_Quality',
        'WRAcc': 'Binned_WRAcc'
    }
    for col, binned_col in columns_to_bin.items():
        bins = [-float('inf'), df[col].quantile(0.33), df[col].quantile(0.67), float('inf')]
        labels = ['Low', 'Medium', 'High']
        df[binned_col] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    return df

metadata_df = bin_columns(metadata_df)

# --- TRANSACTIONAL FORMAT CONVERSION ---

# Convert dataset to a transactional format suitable for Apriori
def convert_to_transactional_format(df):
    transactions = []
    for _, row in df.iterrows():
        transaction = [f"{col}={value}" for col, value in row.items() if not pd.isna(value)]
        transactions.append(transaction)
    return transactions

selected_columns = ['Dataset', 'Algorithm', 'Binned_Quality', 'Binned_WRAcc']
transactional_data = convert_to_transactional_format(metadata_df[selected_columns].dropna())

# --- APRIORI FOR FREQUENT ITEMSETS ---

# Generate frequent itemsets using the Apriori algorithm
def apriori(transactions, min_support=0.1):
    # Calculate item frequencies
    item_freq = {}
    for transaction in transactions:
        for item in transaction:
            item_freq[item] = item_freq.get(item, 0) + 1

    # Filter items by minimum support
    items = [item for item, freq in item_freq.items() if freq / len(transactions) >= min_support]

    # Generate candidate itemsets
    def get_candidates(itemset, length):
        return set([i.union(j) for i in itemset for j in itemset if len(i.union(j)) == length])

    # Get frequent itemsets
    current_set = [frozenset([item]) for item in items]
    frequent_itemsets = []
    k = 2
    while current_set:
        valid_sets = []
        for itemset in current_set:
            count = sum(1 for transaction in transactions if itemset.issubset(transaction))
            if count / len(transactions) >= min_support:
                valid_sets.append(itemset)
                frequent_itemsets.append(itemset)
        current_set = get_candidates(valid_sets, k)
        k += 1

    return frequent_itemsets

frequent_itemsets = apriori(transactional_data, min_support=0.1)

# --- SUBGROUP DISCOVERY ---

# Identify and rank subgroups associated with a target value (e.g., 'High Quality')
def get_subgroups_for_target(target_item, frequent_itemsets, transactions):
    subgroups = [itemset for itemset in frequent_itemsets if target_item in itemset]
    subgroup_supports = [
        (itemset, sum(1 for transaction in transactions if itemset.issubset(transaction)) / len(transactions))
        for itemset in subgroups
    ]
    ranked_subgroups = sorted(subgroup_supports, key=lambda x: x[1], reverse=True)
    return ranked_subgroups

target_item = 'Binned_Quality=High'
high_quality_subgroups = get_subgroups_for_target(target_item, frequent_itemsets, transactional_data)


# --- PRINT RESULTS ---

print("Ranked Subgroups Associated with 'High Quality':\n")
for rank, (itemset, support) in enumerate(high_quality_subgroups, start=1):
    print(f"Rank {rank}:")
    print("Subgroup:", ', '.join(itemset))
    print(f"Support: {support:.2%}\n")


Ranked Subgroups Associated with 'High Quality':

Rank 1:
Subgroup: Binned_Quality=High
Support: 32.76%

Rank 2:
Subgroup: Binned_Quality=High, Binned_WRAcc=High
Support: 27.59%

Rank 3:
Subgroup: Binned_Quality=High, Algorithm=dssd
Support: 17.24%

Rank 4:
Subgroup: Algorithm=sd_map, Binned_Quality=High
Support: 15.52%

Rank 5:
Subgroup: Algorithm=sd_map, Binned_Quality=High, Binned_WRAcc=High
Support: 13.79%

Rank 6:
Subgroup: Binned_Quality=High, Binned_WRAcc=High, Algorithm=dssd
Support: 13.79%
