In [25]:
import numpy as np
from math import log2
import itertools

In [112]:
# Very simple CN2 learner that takes a single attribute per rule
# TBD - generalise to a learner that takes a conjunction of attributes per rule upto some depth

class CN2:
    
    class Rule:
        def __init__(self, attribute, operator, threshold, classes, default_class):
            self.attribute = attribute
            self.operator = operator
            self.threshold = threshold
            self.classes = classes
            self.default_class = default_class
            self.class_counts = np.zeros(len(classes))
            self.strength = 0
            self.class_label = default_class

        def applies(self, example):
            value = example[self.attribute]
            if self.operator == "==":
                return value == self.threshold
            elif self.operator == ">":
                return value > self.threshold
            elif self.operator == "<":
                return value < self.threshold
            elif self.operator == ">=":
                return value >= self.threshold
            elif self.operator == "<=":
                return value <= self.threshold
            else:
                raise ValueError("Invalid operator: {}".format(self.operator))

        def update_counts(self, example):
            class_label = example[-1]
            index = np.where(self.classes == class_label)
            self.class_counts[index] += 1

    def __init__(self,significance=0.5,min_coverage=2,max_depth=2):
        self.significance = significance
        self.min_coverage = min_coverage
        self.rules = []
        self.max_depth=max_depth

    def learn(self, data, target):
        self.rules = []
        self.classes = np.unique(target)
        self.attributes = list(range(data.shape[1]))
        self.examples = np.hstack((data, target.reshape(-1, 1)))
        self.default_class = np.argmax(np.bincount(target))

        while len(self.examples) > 0:
            rule = self.get_best_rule(self.examples)
            if rule is None:
                break
            self.rules.append(rule)
            covered_examples = self.apply_rule(rule, self.examples)
            self.examples = np.delete(self.examples, covered_examples, axis=0)

    def gen_candidates(self,data,target):
        self.attributes = list(range(data.shape[1]))
        self.examples = np.hstack((data, target.reshape(-1, 1)))
        examples=self.examples
        conditions=[]
        for attribute in self.attributes:
            attribute_values = np.unique(examples[:, attribute])
            for value in attribute_values:
                for operator in ["==", ">", "<", ">=", "<="]:
                    conditions+=[(attribute,operator,value)]
        candidates=[[c] for c in conditions]
        for d in range(self.max_depth):
            new_candidates=[]
            for c in conditions: 
                new_candidates+=[cd+[c] for cd in candidates if c not in cd]
            candidates+=new_candidates
            print(d,len(candidates),len(new_candidates))
        self.candidates=candidates
        return 

    # def get_best_rule(self, examples):
    #     best_rule = None
    #     best_score = -1
    #     rule = DeepRule([], self.classes, self.default_class)
    #     conditions=[]
    #     candidates=[]
    #     for attribute in self.attributes:
    #         attribute_values = np.unique(examples[:, attribute])
    #         for value in attribute_values:
    #             for operator in ["==", ">", "<", ">=", "<="]:
    #                 conditions+=[(attribute,operator,value)]
    #     for d in range(self.max_depth):
    #         for c in conditions: 
    #             new_candidates=[cd+c for c in conditions if c not in cd]
    #             candidates+=new_candidates
    #                 pre_conditions=rule.conditions
    #                 if len(pre_conditions)<self.max_depth
    #                 rule.conditions += [(attribute,operator,value)]
    #                 covered_examples = self.apply_rule(rule, examples)
    #                 print(rule.conditions,len(covered_examples))
    #                 if len(covered_examples) < self.min_coverage:
    #                     rule.conditions=pre_conditions
    #                     continue
    #                 score = self.calculate_score(examples,covered_examples,rule)
    #                 print(score)
    #                 if score > best_score:
    #                     best_score = score
    #     if best_score < self.significance:
    #         return None
    #     return best_rule

    def apply_rule(self, rule, examples):
        mask = np.array([rule.applies(example) for example in examples])
        return np.where(mask)[0]

    def calculate_score(self, examples, covered_examples, rule):
        covered_targets=examples[covered_examples,-1]
        class_counts=np.bincount([int(c) for c in list(covered_targets)])
        rule.class_label=np.argmax(class_counts)
        index = np.where(self.classes == rule.class_label)
        for ce in examples[covered_examples]:
            rule.update_counts(ce)
        rule.strength = np.sum(rule.class_counts[index])/np.sum(rule.class_counts)
        return rule.strength
    
    def predict(self,example):
        for rule in self.rules:
            if rule.applies(example):
                return rule.class_label
                break
            else:
                return self.default_class


In [113]:
### To be completed and tested in a proper CN2 implementetion modifying above
class DeepRule:
    def __init__(self, conditions, classes, default_class):
        self.conditions = conditions
        self.classes = classes
        self.default_class = default_class
        self.class_counts = np.zeros(len(classes))
        self.strength = 0
        self.class_label = default_class

    def applies(self, example):
        for attr, op, threshold in self.conditions:
            if op == '<=':
                if example[attr] > threshold:
                    return False
            elif op == '>=':
                if example[attr] < threshold:
                    return False
            elif op == '<':
                if example[attr] >= threshold:
                    return False
            elif op == '>':
                if example[attr] <= threshold:
                    return False
        return True

    def update_counts(self, example):
        class_label = example[-1]
        index = np.where(self.classes == class_label)
        self.class_counts[index] += 1

    def get_score(self, class_freqs, target_class_freqs):
        score = 0
        for class_label, freq in class_freqs.items():
            if freq > 0:
                p = freq / len(self.examples)
                e = target_class_freqs[class_label] / len(self.examples)
                score += freq * np.log2(p / e)
        return score

    def is_valid(self, attr, op, threshold, class_freqs, target_class_freqs, min_coverage):
        new_conditions = self.conditions + [(attr, op, threshold)]
        new_examples = [e for e in self.examples if self.applies(e[0])]
        if len(new_examples) < min_coverage:
            return False
        new_class_freqs = defaultdict(int)
        for _, y in new_examples:
            new_class_freqs[y] += 1
        if len(new_class_freqs) < len(class_freqs):
            return False
        new_score = self.get_score(new_class_freqs, target_class_freqs)
        return new_score > self.score

    def __repr__(self):
        return ' AND '.join([f'{attr} {op} {threshold}' for attr, op, threshold in self.conditions]) + f' => {self.class_label}'


In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
data = iris.data
target = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Initialize the CN2 algorithm
cn2 = CN2(significance=0.9, min_coverage=30)

# Learn the rules from the training data
# cn2.learn(X_train, y_train)

NameError: name 'CN2' is not defined

In [4]:
X_test

array([[6.1, 2.8, 4.7, 1.2],
       [5.7, 3.8, 1.7, 0.3],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.9, 4.5, 1.5],
       [6.8, 2.8, 4.8, 1.4],
       [5.4, 3.4, 1.5, 0.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.9, 3.1, 5.1, 2.3],
       [6.2, 2.2, 4.5, 1.5],
       [5.8, 2.7, 3.9, 1.2],
       [6.5, 3.2, 5.1, 2. ],
       [4.8, 3. , 1.4, 0.1],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.1, 3.8, 1.5, 0.3],
       [6.3, 3.3, 4.7, 1.6],
       [6.5, 3. , 5.8, 2.2],
       [5.6, 2.5, 3.9, 1.1],
       [5.7, 2.8, 4.5, 1.3],
       [6.4, 2.8, 5.6, 2.2],
       [4.7, 3.2, 1.6, 0.2],
       [6.1, 3. , 4.9, 1.8],
       [5. , 3.4, 1.6, 0.4],
       [6.4, 2.8, 5.6, 2.1],
       [7.9, 3.8, 6.4, 2. ],
       [6.7, 3. , 5.2, 2.3],
       [6.7, 2.5, 5.8, 1.8],
       [6.8, 3.2, 5.9, 2.3],
       [4.8, 3. , 1.4, 0.3],
       [4.8, 3.1, 1.6, 0.2]])

In [None]:
cn2.gen_candidates(X_train,y_train)

0 354025 353430


In [None]:
print(cn2.candidates[0])

NameError: name 'X_test' is not defined