In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
# from pandas.core.common import SettingWithCopyWarning
import numpy as np
import sys
from scipy.io import arff
import models
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

path = '/home/zeyno/Desktop/Research_Topics/Datasets'

In [2]:
def preprocess_dataset(df):
    #num of columns with majority NaN values
    counter = 0
    for column in list(df.columns):
        #binarize the columns
        if str(df[column][1]).startswith('b'):
            unique = df[column].unique()
            df[column] = df[column].map({unique[0]: 0, unique[1]: 1})

        # eliminate columns with too many NaN values (optional)
        if df[column].isna().sum()/len(df) > 0.80:
            print("% NaN values:" + str(df[column].isna().sum()/len(df)))
            df = df.drop([column])
    df = df.dropna()
    return df

In [3]:
def target_type(df):
    df = df.dropna()
    last = df.columns[-1]
    target = df[last]
    diff = target.unique()
    if len(diff) <= 2:
        return "Binary"
    elif len(diff) > 2:
        return "Multinomial"
    return 0

In [6]:
def load_datasets(path):
    metadata_df = pd.DataFrame()
    num_cols = []
    num_rows = []
    datasets = {}
    names = [] #only the names of the datasets that are included 
    t_type = []
    dataset_names = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.arff')]
    for i in range(len(dataset_names)):
        try:
            data = arff.loadarff(path+'/'+dataset_names[i])
            df = pd.DataFrame(data[0])
            df = preprocess_dataset(df)
            name_df = dataset_names[i].replace(".arff", "")
            names.append(name_df)
            datasets[name_df] = df
            num_cols.append(df.shape[1])
            num_rows.append(df.shape[0])
            t_type.append(target_type(df))
        except:
            pass
    metadata_df["Datasets"] = names
    metadata_df["num_columns"] = num_cols
    metadata_df["num_rows"] = num_rows
    metadata_df["target_type"] = t_type
    return datasets, metadata_df

In [7]:
datasets, metadata_df = load_datasets(path)
metadata_df

% NaN values:0.9192516001969473
% NaN values:0.880369354244298
% NaN values:0.8421052631578947
% NaN values:0.8235294117647058
% NaN values:0.9801980198019802


Unnamed: 0,Datasets,num_columns,num_rows,target_type
0,dermatology,35,3,Binary
1,tic-tac-toe,10,39,Binary
2,iris,5,100,Binary
3,hayes-roth_test,5,27,Binary
4,contact-lenses,5,13,Binary
5,wisconsin,10,683,Binary
6,cmc,10,101,Binary
7,hayes-roth_train,5,102,Binary
8,glass,10,87,Binary
9,ionosphere,35,351,Binary


In [22]:
models.sd(datasets["tic-tac-toe"], datasets["tic-tac-toe"].columns[-1])

middle-middle-square <= 0.50
middle-right-square <= 0.50
len sub: 0
middle-middle-square > 0.50
middle-right-square > 0.50
len sub: 10
middle-middle-square <= 0.50
middle-right-square > 0.50
len sub: 0
middle-middle-square > 0.50
middle-right-square <= 0.50
len sub: 0
{'Average Quality': -0.5384615384615384, 'Average Coverage': 0.0641025641025641, 'Average Support': 0.0, 'Average WRAcc': -0.03451676528599605, 'Average Significance': nan, 'Average Confidence': 0.0, 'Number of Subgroups': 4, 'Average Length of Subgroups': 2.0}


{'Average Quality': -0.5384615384615384,
 'Average Coverage': 0.0641025641025641,
 'Average Support': 0.0,
 'Average WRAcc': -0.03451676528599605,
 'Average Significance': nan,
 'Average Confidence': 0.0,
 'Number of Subgroups': 4,
 'Average Length of Subgroups': 2.0}

In [8]:
def run_algorithms(df, name):
    target = df.columns[-1]
    results = {}
    
    print("__________________________________________")
    print("START Dataset: ", name)
    print("__________________________________________")
    
    # run sd
    sd = models.sd(df, target)
    sd = sd.values()
    print("__________________________________________")
    results["sd"] = list(sd)

    #cn2_sd
    print("CN2_SD")
    cn2_sd = models.cn2_sd(df, target).values()
    print("__________________________________________")
    results["cn2_sd"] = list(cn2_sd)

    #run sd_map
    print("SD_MAP")
    sd_map = models.sd_map(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["sd_map"] = list(sd_map)

    #run dssd
    print("DSSD")
    dssd = models.dssd(df, target, min_support=0.1).values()
    print("__________________________________________")
    results["dssd"] = list(dssd)

    #run nmeef
    print("NMEEF-SD")
    nmeef = models.nmeef_sd(df, target).values()
    print("__________________________________________")
    results["nnmeef"] = list(nmeef)

    #run apriori
    print("Apriori-SD")
    a = models.apriori_sd(df, target, min_threshold = 0.1).values()
    results["a"] = list(a)
    print("__________________________________________")
    print("END Dataset: ", name)
    print("__________________________________________")

    return results


In [9]:
meta_2 = {}
for key in datasets:
    if key in ["dermatology", "labor", "ionosphere", "adult"]:
        continue
    print(key)
    results = run_algorithms(datasets[key], key)
    meta_2[key] = results

tic-tac-toe
__________________________________________
START Dataset:  tic-tac-toe
__________________________________________
middle-middle-square <= 0.50
middle-right-square <= 0.50
len sub: 0
middle-middle-square > 0.50
middle-right-square > 0.50
len sub: 10
middle-middle-square <= 0.50
middle-right-square > 0.50
len sub: 0
middle-middle-square > 0.50
middle-right-square <= 0.50
len sub: 0
{'Average Quality': -0.5384615384615384, 'Average Coverage': 0.0641025641025641, 'Average Support': 0.0, 'Average WRAcc': -0.03451676528599605, 'Average Significance': nan, 'Average Confidence': 0.0, 'Number of Subgroups': 4, 'Average Length of Subgroups': 2.0}
__________________________________________
CN2_SD
__________________________________________
SD_MAP
__________________________________________
DSSD
__________________________________________
NMEEF-SD
__________________________________________
Apriori-SD
__________________________________________
END Dataset:  tic-tac-toe
____________________

In [11]:
# Dictionary that contains dictionaries per dataset, and the scores per algorithm.
# For each algorithm result, the list represents the scores in these evaluation metrics:
# Quality, Coverage, Support, number of subgroups, average length of subgroups (i.e. hte number of rules used to represent a subgroup, on average)
meta_2

# Convert the nested dictionary into a DataFrame
df = pd.DataFrame.from_dict(meta_2, orient='index')
df = df.stack().apply(pd.Series).reset_index()

# Rename columns for clarity and reordering
df.columns = ['Datasets', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']

# Display the resulting DataFrame
print(df[['Datasets', 'Algorithm', 'Quality', 'Coverage', 'Support', 'WRAcc', "Significance", "Confidence", '# of Subgroups', 'Length of Rules']])

       Datasets Algorithm   Quality  Coverage   Support     WRAcc  \
0   tic-tac-toe        sd -0.538462  0.064103  0.000000 -0.034517   
1   tic-tac-toe    cn2_sd -0.920480  0.760684  0.384615 -0.024984   
2   tic-tac-toe    sd_map -0.019056  0.211876  0.112011 -0.002076   
3   tic-tac-toe      dssd -0.028846  0.134122  0.067061 -0.005159   
4   tic-tac-toe    nnmeef -0.538462  0.032821  0.000000 -0.017673   
..          ...       ...       ...       ...       ...       ...   
73     credit-a    cn2_sd -0.213719  0.199580  0.066176 -0.014536   
74     credit-a    sd_map -0.086630  0.190789  0.062113 -0.015045   
75     credit-a      dssd -0.074075  0.137255  0.046773 -0.008735   
76     credit-a    nnmeef  0.595588  0.014706  0.014706  0.008759   
77     credit-a         a       NaN  0.642863  0.161765       NaN   

    Significance  Confidence  # of Subgroups  Length of Rules  
0            NaN    0.000000             4.0         2.000000  
1   2.474737e-01    0.495107             3.

In [13]:
temp = pd.merge(metadata_df, df, on=["Datasets"])
print(temp.head())
df.to_csv("metadata.csv", index=False)

      Datasets  num_columns  num_rows target_type Algorithm   Quality  \
0  tic-tac-toe           10        39      Binary        sd -0.538462   
1  tic-tac-toe           10        39      Binary    cn2_sd -0.920480   
2  tic-tac-toe           10        39      Binary    sd_map -0.019056   
3  tic-tac-toe           10        39      Binary      dssd -0.028846   
4  tic-tac-toe           10        39      Binary    nnmeef -0.538462   

   Coverage   Support     WRAcc  Significance  Confidence  # of Subgroups  \
0  0.064103  0.000000 -0.034517           NaN    0.000000             4.0   
1  0.760684  0.384615 -0.024984      0.247474    0.495107             3.0   
2  0.211876  0.112011 -0.002076      0.473678    0.519405            19.0   
3  0.134122  0.067061 -0.005159      0.518557    0.509615            13.0   
4  0.032821  0.000000 -0.017673      0.303342    0.000000            50.0   

   Length of Rules  
0         2.000000  
1         1.000000  
2         1.684211  
3         1.68

In [15]:
# --- DATA LOADING AND PREPARATION ---

# Load the metadata
metadata_df = pd.read_csv("metadata.csv")

# Convert continuous columns 'Quality' and 'WRAcc' into discrete bins
def bin_columns(df):
    columns_to_bin = {
        'Quality': 'Binned_Quality',
        'WRAcc': 'Binned_WRAcc',
        'Coverage': 'Binned Coverage',
        'Support': 'Binned Support',
        'Significance': 'Binned Significance',
        'Confidence': 'Binned Confidence',
    }
    for col, binned_col in columns_to_bin.items():
        bins = [-float('inf'), df[col].quantile(0.33), df[col].quantile(0.67), float('inf')]
        labels = ['Low', 'Medium', 'High']
        df[binned_col] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    return df

metadata_df = bin_columns(metadata_df)

# --- TRANSACTIONAL FORMAT CONVERSION ---

# Convert dataset to a transactional format suitable for Apriori
def convert_to_transactional_format(df):
    transactions = []
    for _, row in df.iterrows():
        transaction = [f"{col}={value}" for col, value in row.items() if not pd.isna(value)]
        transactions.append(transaction)
    return transactions

selected_columns = ['Datasets', 'Algorithm', 'Binned_Quality', 'Binned Coverage', 'Binned_WRAcc', 'Binned Support', 'Binned Significance', 'Binned Confidence',]
transactional_data = convert_to_transactional_format(metadata_df[selected_columns].dropna())

# --- APRIORI FOR FREQUENT ITEMSETS ---

# Generate frequent itemsets using the Apriori algorithm
def apriori(transactions, min_support=0.1):
    # Calculate item frequencies
    item_freq = {}
    for transaction in transactions:
        for item in transaction:
            item_freq[item] = item_freq.get(item, 0) + 1

    # Filter items by minimum support
    items = [item for item, freq in item_freq.items() if freq / len(transactions) >= min_support]

    # Generate candidate itemsets
    def get_candidates(itemset, length):
        return set([i.union(j) for i in itemset for j in itemset if len(i.union(j)) == length])

    # Get frequent itemsets
    current_set = [frozenset([item]) for item in items]
    frequent_itemsets = []
    k = 2
    while current_set:
        valid_sets = []
        for itemset in current_set:
            count = sum(1 for transaction in transactions if itemset.issubset(transaction))
            if count / len(transactions) >= min_support:
                valid_sets.append(itemset)
                frequent_itemsets.append(itemset)
        current_set = get_candidates(valid_sets, k)
        k += 1

    return frequent_itemsets

frequent_itemsets = apriori(transactional_data, min_support=0.1)

# --- SUBGROUP DISCOVERY ---

# Identify and rank subgroups associated with a target value (e.g., 'High Quality')
def get_subgroups_for_target(target_item, frequent_itemsets, transactions):
    subgroups = [itemset for itemset in frequent_itemsets if target_item in itemset]
    subgroup_supports = [
        (itemset, sum(1 for transaction in transactions if itemset.issubset(transaction)) / len(transactions))
        for itemset in subgroups
    ]
    ranked_subgroups = sorted(subgroup_supports, key=lambda x: x[1], reverse=True)
    return ranked_subgroups

target_item = 'Algorithm=cn2_sd'
high_quality_subgroups = get_subgroups_for_target(target_item, frequent_itemsets, transactional_data)


# --- PRINT RESULTS ---

print("Ranked Subgroups Associated with "+ target_item + "\n")
for rank, (itemset, support) in enumerate(high_quality_subgroups, start=1):
    print(f"Rank {rank}:")
    print("Subgroup:", ', '.join(itemset))
    print(f"Support: {support:.2%}\n")


Ranked Subgroups Associated with Algorithm=cn2_sd

Rank 1:
Subgroup: Algorithm=cn2_sd
Support: 22.00%

Rank 2:
Subgroup: Binned Significance=Low, Algorithm=cn2_sd
Support: 16.00%

Rank 3:
Subgroup: Binned_WRAcc=Low, Algorithm=cn2_sd
Support: 12.00%

Rank 4:
Subgroup: Algorithm=cn2_sd, Binned Confidence=Medium
Support: 12.00%

Rank 5:
Subgroup: Algorithm=cn2_sd, Binned Coverage=High
Support: 12.00%

Rank 6:
Subgroup: Algorithm=cn2_sd, Binned_Quality=Low
Support: 12.00%

Rank 7:
Subgroup: Algorithm=cn2_sd, Binned Support=Medium
Support: 10.00%

Rank 8:
Subgroup: Binned_WRAcc=Low, Binned Significance=Low, Algorithm=cn2_sd
Support: 10.00%

Rank 9:
Subgroup: Binned Significance=Low, Algorithm=cn2_sd, Binned Confidence=Medium
Support: 10.00%



In [26]:
metadata_df = bin_columns(metadata_df)
selected_columns = ['Datasets', 'Algorithm', 'Binned_Quality', 'Binned Coverage', 'Binned_WRAcc', 'Binned Support', 'Binned Significance', 'Binned Confidence',]
transactional_data = convert_to_transactional_format(metadata_df[selected_columns].dropna())
frequent_itemsets = apriori(transactional_data, min_support=0.1)

algorithms = list(metadata_df["Algorithm"].unique())

def meta_subgroup():
    results = []
    for algorithm in algorithms:
        target_item = 'Algorithm=' + algorithm
        high_quality_subgroups = get_subgroups_for_target(target_item, frequent_itemsets, transactional_data)

        print("Ranked Subgroups Associated with "+ target_item + "\n")
        for rank, (itemset, support) in enumerate(high_quality_subgroups, start=1):
            results.append((target_item, rank, itemset, support))
            print(f"Rank {rank}:")
            print("Subgroup:", ', '.join(itemset))
            print(f"Support: {support:.2%}\n")
    
    meta_results = pd.DataFrame.from_records(results, columns=["Algorithm", "Rank", "Subgroup", "Support"])
    return meta_results


In [29]:
results = meta_subgroup()
results.to_csv("meta_results.csv", index=False)
results

Ranked Subgroups Associated with Algorithm=sd

Ranked Subgroups Associated with Algorithm=cn2_sd

Rank 1:
Subgroup: Algorithm=cn2_sd
Support: 22.00%

Rank 2:
Subgroup: Binned Significance=Low, Algorithm=cn2_sd
Support: 16.00%

Rank 3:
Subgroup: Binned_WRAcc=Low, Algorithm=cn2_sd
Support: 12.00%

Rank 4:
Subgroup: Algorithm=cn2_sd, Binned Confidence=Medium
Support: 12.00%

Rank 5:
Subgroup: Algorithm=cn2_sd, Binned Coverage=High
Support: 12.00%

Rank 6:
Subgroup: Algorithm=cn2_sd, Binned_Quality=Low
Support: 12.00%

Rank 7:
Subgroup: Algorithm=cn2_sd, Binned Support=Medium
Support: 10.00%

Rank 8:
Subgroup: Binned_WRAcc=Low, Binned Significance=Low, Algorithm=cn2_sd
Support: 10.00%

Rank 9:
Subgroup: Binned Significance=Low, Algorithm=cn2_sd, Binned Confidence=Medium
Support: 10.00%

Ranked Subgroups Associated with Algorithm=sd_map

Rank 1:
Subgroup: Algorithm=sd_map
Support: 26.00%

Rank 2:
Subgroup: Algorithm=sd_map, Binned Coverage=Medium
Support: 18.00%

Rank 3:
Subgroup: Algorithm

Unnamed: 0,Algorithm,Rank,Subgroup,Support
0,Algorithm=cn2_sd,1,(Algorithm=cn2_sd),0.22
1,Algorithm=cn2_sd,2,"(Binned Significance=Low, Algorithm=cn2_sd)",0.16
2,Algorithm=cn2_sd,3,"(Binned_WRAcc=Low, Algorithm=cn2_sd)",0.12
3,Algorithm=cn2_sd,4,"(Algorithm=cn2_sd, Binned Confidence=Medium)",0.12
4,Algorithm=cn2_sd,5,"(Algorithm=cn2_sd, Binned Coverage=High)",0.12
...,...,...,...,...
78,Algorithm=nnmeef,28,"(Binned Support=Low, Binned_Quality=Low, Algor...",0.10
79,Algorithm=nnmeef,29,"(Binned Support=Low, Binned Confidence=Low, Al...",0.10
80,Algorithm=nnmeef,30,"(Binned Support=Low, Binned_WRAcc=Medium, Algo...",0.10
81,Algorithm=nnmeef,31,"(Binned Confidence=Low, Binned_WRAcc=Medium, A...",0.10
