In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import os

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import KFold
from functools import cmp_to_key
import sys

INPUT = "datasets"
OUTPUT = "part1_results"
if not os.path.exists(OUTPUT):
    os.makedirs(OUTPUT)

SEED = 42

# Import datasets

## Preprocessing Methods

In [2]:
# preprocessing method

# Change categorical data to numerical data
def cat_2_num(df:pd.DataFrame):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].astype('category')
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

# Use KMeans to get discretized data
def get_discretization_data(df:pd.DataFrame):
    for col_name in df.columns:
        if(len(pd.unique(df[col_name])) <= 5):
            return df
        k = 5
        k_model = KMeans(n_clusters=k)
        k_model.fit(df[col_name].values.reshape(len(df[col_name]), 1))
        c = pd.DataFrame(k_model.cluster_centers_, columns = list("a")).sort_values(by = "a")
        w = c.rolling(2).mean().iloc[1:]
        w = np.asarray(w.values)
        w = [i[0] for i in w]
        w = [0] + w + [df[col_name].max()]
        df[col_name] = pd.cut(df[col_name], w, labels = range(k))
    return df

## UCI datasets

In [3]:
#German dataset
def German():
    df = pd.read_table(os.path.join(INPUT,"german.data-numeric"),delim_whitespace = True, header = None)
    df = get_discretization_data(df)
    df[24] = df[24]-1    # change label from 1,2 to 0,1
    return df

In [4]:
def Australian():
    df = pd.read_table(os.path.join(INPUT,"australian.dat"),delim_whitespace = True, header = None)
    df = get_discretization_data(df)
    return df

In [5]:
def Crx():
    df = pd.read_csv(os.path.join(INPUT,"crx.data"), header = None)
    # drop entries with ?
    df = df.replace("?", np.nan).dropna()
    # convert category data to numerical data
    df = cat_2_num(df)
    df = get_discretization_data(df)
    return df

In [6]:
def Hepatitis():
    df = pd.read_csv(os.path.join(INPUT,"hepatitis.data"), header = None)
    df = cat_2_num(df)
    df = get_discretization_data(df)
    df[19] = df[19]-1 # change to 0 or 1
    return df

In [7]:
def Ionosphere():
    df = pd.read_csv(os.path.join(INPUT, "ionosphere.data"), header=None)
    df = cat_2_num(df)
    df = get_discretization_data(df)
    return df

## Additional Kaggle datasets

In [8]:
def Pumpkin():
    df = pd.read_excel(os.path.join("datasets",'Pumpkin_Seeds_Dataset.xlsx'), sheet_name='Pumpkin_Seeds_Dataset',engine='openpyxl')
    df = cat_2_num(df)
    df = get_discretization_data(df)
    return df

In [9]:
# 5644 samples, relatively large dataset
def Mushroom():
    df = pd.read_csv(os.path.join(INPUT,'mushrooms.csv'))
    df = df.replace("?", np.nan).dropna()
    df = cat_2_num(df)
    df = get_discretization_data(df)
    order = list(df)
    order = order[1:] + order[:1]
    df = df[order]
    return df

In [10]:
def Diabetes():
    df = pd.read_csv(os.path.join(INPUT,'diabetes_data.csv'), sep=';')
    df = cat_2_num(df)
    df = get_discretization_data(df)
    return df

# Classification

In [11]:
class RuleItem:
    """
    cond_set: a dict with following fashion:
            {item name: value, item name: value, ...}
        e.g.
            {A: 1, B: 1} (A, B are name of columns, here called "item", and in our code should be numerical index
                          but not string)
    class_label: just to identify the class it belongs to.
    dataset: a list returned by read method. (see read.py)
    cond_sup_count, rule_sup_count, support and confidence are number.
    """
    def __init__(self, cond_set, class_label, X_train, y_train):
        self.cond_set = cond_set
        self.class_label = class_label
        self.cond_sup_count, self.rule_sup_count = self._get_sup_count(X_train, y_train)
        self.support = self._get_support(len(X_train))
        self.confidence = self._get_confidence()
        self.is_pruned = False

    # calculate condsupCount and rulesupCount
    def _get_sup_count(self, X_train, y_train):
        cond_sup_count = 0
        rule_sup_count = 0
        for i in range(len(X_train)):
            is_contained = True
            for index in self.cond_set:
                if self.cond_set[index] != X_train[i][index]:
                    is_contained = False
                    break
            if is_contained:
                cond_sup_count += 1
                if self.class_label == y_train[i]:
                    rule_sup_count += 1
        return cond_sup_count, rule_sup_count

    # calculate support count
    def _get_support(self, dataset_size):
        return self.rule_sup_count / dataset_size

    # calculate confidence
    def _get_confidence(self):
        if self.cond_sup_count != 0:
            return self.rule_sup_count / self.cond_sup_count
        else:
            return 0

    # print out the ruleitem
    def print(self):
        cond_set_output = ''
        for item in self.cond_set:
            cond_set_output += '(' + str(item) + ', ' + str(self.cond_set[item]) + '), '
        cond_set_output = cond_set_output[:-2]
        print('<({' + cond_set_output + '}, ' + str(self.cond_sup_count) + '), (' +
              '(class, ' + str(self.class_label) + '), ' + str(self.rule_sup_count) + ')>' + " cond_sup=" + str(self.cond_sup_count) + " rule_sup=" + str(self.rule_sup_count))

    # print out rule
    def print_rule(self):
        cond_set_output = ''
        for item in self.cond_set:
            cond_set_output += '(' + str(item) + ', ' + str(self.cond_set[item]) + '), '
        cond_set_output = '{' + cond_set_output[:-2] + '}'
        print(cond_set_output + ' -> (class, ' + str(self.class_label) + ')')
    
    # prune
    def prune(self, rulelist):
        for rule in rulelist:
            if(self.confidence < rule.confidence):
                self.is_pruned = True

In [12]:
class FrequentRuleitems:
    """
    A set of frequent k-ruleitems, just using set.
    """
    def __init__(self, class_label):
        self.labels = class_label
        self.frequent_ruleitems_set = {}
        for label in class_label:
            self.frequent_ruleitems_set[label] = set()

    # get size of set
    def get_size(self):
        res = 0
        for label in self.labels:
            res += len(self.frequent_ruleitems_set[label])
        return res

    # add a new ruleitem into set
    def add(self, rule_item):
        is_existed = False
        for item in self.frequent_ruleitems_set[rule_item.class_label]:
            if item.cond_set == rule_item.cond_set:
                is_existed = True
                break
        if not is_existed:
            self.frequent_ruleitems_set[rule_item.class_label].add(rule_item)
            
    # print out all frequent ruleitems
    def print(self):
        for label in self.labels:
            for item in self.frequent_ruleitems_set[label]:
                item.print()


class Car:
    """
    Class Association Rules (Car). If some ruleitems has the same condset, the ruleitem with the highest confidence is
    chosen as the Possible Rule (PR). If there're more than one ruleitem with the same highest confidence, we randomly
    select one ruleitem.
    """
    def __init__(self):
        self.rules = set()
        self.pruned_rules = set()

    # print out all rules
    def print_rule(self):
        for item in self.rules:
            item.print_rule()

    # union new car into rules list
    def append(self, car, minsup, minconf):
        for item in car.rules:
            self._add(item, minsup, minconf)

    # add a new rule (frequent & accurate), save the ruleitem with the highest confidence when having the same condset
    def _add(self, rule_item, minsup, minconf):
        if rule_item.is_pruned == True:
            return
        if rule_item.support >= minsup and rule_item.confidence >= minconf:
            if rule_item in self.rules:
                return
            for item in self.rules:
                if item.cond_set == rule_item.cond_set and item.confidence < rule_item.confidence:
                    self.rules.remove(item)
                    self.rules.add(rule_item)
                    return
                elif item.cond_set == rule_item.cond_set and item.confidence >= rule_item.confidence:
                    return
            self.rules.add(rule_item)

    # convert frequent ruleitems into car
    def gen_rules(self, frequent_ruleitems, minsup, minconf):
        for label in frequent_ruleitems.labels:
            for item in frequent_ruleitems.frequent_ruleitems_set[label]:
                self._add(item, minsup, minconf)

# invoked by candidate_gen, join two items to generate candidate
def join(item1, item2, X_train, y_train, minsup):
    category1 = list(item1.cond_set.keys())
    category2 = list(item2.cond_set.keys())
    category1.sort()
    category2.sort()
    for i in range(len(category1) - 1):
        if category1[i] != category2[i]:
            return None
        if item1.cond_set[category1[i]] != item2.cond_set[category1[i]]:
            return None
    i = len(category1) - 1
    if category1[i] == category2[i]:
        return None
    new_cond_set = item1.cond_set.copy()
    new_cond_set[category2[i]] = item2.cond_set[category2[i]]
    new_ruleitem = RuleItem(new_cond_set, item1.class_label, X_train, y_train)
    if new_ruleitem.support < minsup:
        return None
    new_ruleitem.prune([item1, item2])
    return new_ruleitem



# similar to Apriori-gen in algorithm Apriori
def candidate_gen(frequent_ruleitems, X_train, y_train, minsup):
    returned_frequent_ruleitems = FrequentRuleitems(frequent_ruleitems.labels)
    for label in frequent_ruleitems.labels:
        for item1 in frequent_ruleitems.frequent_ruleitems_set[label]:
            for item2 in frequent_ruleitems.frequent_ruleitems_set[label]:
                new_ruleitem = join(item1, item2, X_train, y_train, minsup)
                if new_ruleitem:
                    returned_frequent_ruleitems.add(new_ruleitem)
                    if returned_frequent_ruleitems.get_size() >= 2000:      # not allow to store more than 2000 ruleitems
                        return returned_frequent_ruleitems
    return returned_frequent_ruleitems


# main method, implementation of CBA-RG algorithm
def rule_generator(X_train, y_train, minsup, minconf):

    # get large 1-ruleitems and generate rules
    class_label = set(y_train)
    frequent_ruleitems = FrequentRuleitems(class_label)
    car = Car()
    for column in range(0, len(X_train[0])):
        distinct_value = set([x[column] for x in X_train])
        for value in distinct_value:
            cond_set = {column: value}
            for classes in class_label:
                rule_item = RuleItem(cond_set, classes, X_train, y_train)
                if rule_item.support >= minsup:
                    frequent_ruleitems.add(rule_item)
    car.gen_rules(frequent_ruleitems, minsup, minconf)
    cars = car

    current_cars_number = len(cars.rules)
    while frequent_ruleitems.get_size() > 0 and current_cars_number <= 20000:
        candidate = candidate_gen(frequent_ruleitems, X_train, y_train, minsup)
        frequent_ruleitems = FrequentRuleitems(class_label)
        car = Car()
        for label in candidate.labels:
            for item in candidate.frequent_ruleitems_set[label]:
                if item.support >= minsup:
                    frequent_ruleitems.add(item)
        car.gen_rules(frequent_ruleitems, minsup, minconf)
        cars.append(car, minsup, minconf)
        current_cars_number = len(cars.rules)

    return cars

In [13]:
def is_satisfy(X_train, y_train, rule):
    for item in rule.cond_set:
        if X_train[item] != rule.cond_set[item]:
            return None
    if y_train == rule.class_label:
        return True
    else:
        return False


class Classifier:
    """
    This class is our classifier. The rule_list and default_class are useful for outer code.
    """
    def __init__(self):
        self.rule_list = list()
        self.default_class = None
        self._error_list = list()
        self._default_class_list = list()

    # insert a rule into rule_list, then choose a default class, and calculate the errors (see line 8, 10 & 11)
    def insert(self, rule, X_train, y_train):
        self.rule_list.append(rule)             # insert r at the end of C
        self._select_default_class(y_train)     # select a default class for the current C
        self._compute_error(X_train, y_train)            # compute the total number of errors of C

    # select the majority class in the remaining data
    def _select_default_class(self, y_train):
        class_column = y_train
        class_label = set(class_column)
        max = 0
        current_default_class = None
        for label in class_label:
            if class_column.count(label) >= max:
                max = class_column.count(label)
                current_default_class = label
        self._default_class_list.append(current_default_class)

    # compute the sum of errors
    def _compute_error(self, X_train, y_train):
        if len(X_train) <= 0:
            self._error_list.append(sys.maxsize)
            return

        error_number = 0

        # the number of errors that have been made by all the selected rules in C
        for i in range(len(X_train)):
            is_cover = False
            for rule in self.rule_list:
                if is_satisfy(X_train[i], y_train[i], rule):
                    is_cover = True
                    break
            if not is_cover:
                error_number += 1

        # the number of errors to be made by the default class in the training set
        class_column = y_train
        error_number += len(class_column) - class_column.count(self._default_class_list[-1])
        self._error_list.append(error_number)

    # see line 14 and 15, to get the final classifier
    def discard(self):
        # find the first rule p in C with the lowest total number of errors and drop all the rules after p in C
        index = self._error_list.index(min(self._error_list))
        self.rule_list = self.rule_list[:(index+1)]
        self._error_list = None

        # assign the default class associated with p to default_class
        self.default_class = self._default_class_list[index]
        self._default_class_list = None

    # just print out all selected rules and default class in our classifier
    def print(self):
        for rule in self.rule_list:
            rule.print()
        print("default_class:", self.default_class)


# sort the set of generated rules car according to the relation ">", return the sorted rule list
def sort(car):
    def cmp_method(a, b):
        if a.confidence < b.confidence:     # 1. the confidence of ri > rj
            return 1
        elif a.confidence == b.confidence:
            if a.support < b.support:       # 2. their confidences are the same, but support of ri > rj
                return 1
            elif a.support == b.support:
                if len(a.cond_set) < len(b.cond_set):   # 3. both confidence & support are the same, ri earlier than rj
                    return -1
                elif len(a.cond_set) == len(b.cond_set):
                    return 0
                else:
                    return 1
            else:
                return -1
        else:
            return -1

    rule_list = list(car.rules)
    rule_list.sort(key=cmp_to_key(cmp_method))
    return rule_list


# main method of CBA-CB: M1
def classifier_builder_m1(cars, X_train, y_train):
    classifier = Classifier()
    cars_list = sort(cars)
    for rule in cars_list:
        temp = []
        mark = False
        for i in range(len(X_train)):
            is_satisfy_value = is_satisfy(X_train[i], y_train[i], rule)
            if is_satisfy_value is not None:
                temp.append(i)
                if is_satisfy_value:
                    mark = True
        if mark:
            temp.sort(reverse=True)
            for i in temp:
                X_train.pop(i)
                y_train.pop(i)
            classifier.insert(rule, X_train, y_train)
    classifier.discard()
    return classifier

# calculate the error rate of the classifier on the dataset
def predict(classifier, X_test):
    pred = []
    for case in X_test:
        is_satisfy_value = False
        for rule in classifier.rule_list:
            match = True
            for item in rule.cond_set:
                if case[item] != rule.cond_set[item]:
                    match = False
                    break
            if match == True:
                pred.append(rule.class_label)
                is_satisfy_value = True
                break
        if is_satisfy_value == False:
            pred.append(classifier.default_class)
    return pred

In [14]:
# 10-fold cross-validations on CBA (M1)
def cross_validate_m1(dataset, minsup=0.01, minconf=0.8, quiet=False):

    kf = KFold(n_splits=10, shuffle=True)
    cba_rg_total_runtime = 0
    cba_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    accs = []
    fs = []
    k = 0

    X = [x[:-1] for x in dataset]
    y = [x[-1] for x in dataset]

    for train_index, test_index in kf.split(X):
        if not quiet:
            print("\nRound %d:" % k)

        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

        start_time = time.time()
        cars = rule_generator(X_train, y_train, minsup, minconf)
        end_time = time.time()
        cba_rg_runtime = end_time - start_time
        cba_rg_total_runtime += cba_rg_runtime


        start_time = time.time()
        classifier_m1 = classifier_builder_m1(cars, X_train, y_train)
        end_time = time.time()
        cba_cb_runtime = end_time - start_time
        cba_cb_total_runtime += cba_cb_runtime

        pred = predict(classifier_m1, X_test)
        
        acc = metrics.accuracy_score(y_test, pred)
        f = metrics.f1_score(y_test, pred, pos_label = 1)
        # ctx = metrics.confusion_matrix(y_test, pred)
        # sns.heatmap(ctx, cmap='Oranges', annot=True, fmt='g')
        # plt.show()
        accs.append(acc)
        fs.append(f)

        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier_m1.rule_list)

        if not quiet:
            print(f"\nCBA's accuracy with pruning: {(acc):.2}")
            print(f"CBA's f1-score with pruning: {(f):.2}")
            print("No. of CARs with pruning: %d" % len(cars.rules))
            print("CBA-RG's run time with pruning: %.2lf s" % cba_rg_runtime)
            print("CBA-CB M1's run time with pruning: %.2lf s" % cba_cb_runtime)
            print("No. of rules in classifier of CBA-CB M1 with pruning: %d" % len(classifier_m1.rule_list))

        k += 1

    if not quiet:    
        print(f"\nAverage CBA's accuracy with pruning: {(sum(accs)/len(accs)):.2}")
        print(f"Average CBA's f1-score with pruning: {(sum(fs)/len(accs)):.2}")
        print(f"Average No. of CARs with pruning: {int(total_car_number / 10)}")
        print(f"Average CBA-RG's run time with pruning: {(cba_rg_total_runtime / 10):.3f} s")
        print(f"Average CBA-CB M1's run time with pruning: {(cba_cb_total_runtime / 10):.3f} s")
        print(f"Average No. of rules in classifier of CBA-CB M1 with pruning: {int(total_classifier_rule_num / 10)}")
    return sum(accs)/len(accs), sum(fs)/len(fs), int(total_car_number / 10), (cba_rg_total_runtime / 10), (cba_cb_total_runtime / 10), int(total_classifier_rule_num / 10)

In [15]:
dataset_funcs = [German, Australian, Crx, Hepatitis, Ionosphere, Pumpkin, Mushroom, Diabetes]
stats_list = {'dataset':[], 'accuracy':[], 'f1_score':[], 'CBA_count':[], 'CBA_RG_runtime':[], 'CBA_CB_runtime':[], 'rule_count':[] }
for dataset_getter in dataset_funcs:
    print(f"Classifying {dataset_getter.__name__} dataset")
    avg_acc, avg_f1, avg_car_cnt, avg_rg_runtime, avg_cb_runtime, avg_rule_cnt = cross_validate_m1(dataset_getter().values.tolist(), quiet=False)
    stats_list['dataset'].append(dataset_getter.__name__)
    stats_list['accuracy'].append(avg_acc)
    stats_list['f1_score'].append(avg_f1)
    stats_list['CBA_count'].append(avg_car_cnt)
    stats_list['CBA_RG_runtime'].append(avg_rg_runtime)
    stats_list['CBA_CB_runtime'].append(avg_cb_runtime)
    stats_list['rule_count'].append(avg_rule_cnt)
pd.DataFrame(stats_list).to_csv(os.path.join(OUTPUT, 'CBA_results.csv'))

Classifying German dataset

Round 0:

CBA's accuracy with pruning: 0.74
CBA's f1-score with pruning: 0.35
No. of CARs with pruning: 1656
CBA-RG's run time with pruning: 21.53 s
CBA-CB M1's run time with pruning: 0.93 s
No. of rules in classifier of CBA-CB M1 with pruning: 154

Round 1:

CBA's accuracy with pruning: 0.73
CBA's f1-score with pruning: 0.34
No. of CARs with pruning: 1234
CBA-RG's run time with pruning: 31.37 s
CBA-CB M1's run time with pruning: 0.98 s
No. of rules in classifier of CBA-CB M1 with pruning: 157

Round 2:

CBA's accuracy with pruning: 0.76
CBA's f1-score with pruning: 0.2
No. of CARs with pruning: 1341
CBA-RG's run time with pruning: 30.94 s
CBA-CB M1's run time with pruning: 0.92 s
No. of rules in classifier of CBA-CB M1 with pruning: 150

Round 3:

CBA's accuracy with pruning: 0.7
CBA's f1-score with pruning: 0.32
No. of CARs with pruning: 7098
CBA-RG's run time with pruning: 36.86 s
CBA-CB M1's run time with pruning: 1.73 s
No. of rules in classifier of CBA