In [76]:
import numpy as np
import pandas as pd
import collections
import time
import pickle
from sklearn.metrics import accuracy_score

In [77]:
class CN2:
    
    _E = []
    _selectors = []
    #******************************************************************************************#
                                           # INIT CN2 #
    #*****************************************************************************************#
    def __init__(self, star_max_size=2, min_significance=1.5):
        self.data = None
        self.star_max_size = star_max_size
        self.min_significance = min_significance
    #******************************************************************************************#
                                           # The MODEL#
    #*****************************************************************************************#
    def fit(self, file_name):
        """
        This function is used to learn the rule-based classification model with the CN2 algorithm.
        :param file_name: the name of the training file in CSV format.
        The file must be located in the '.' folder.
        """
        self.data = pd.read_csv(file_name)
        self._E = self.data.copy()
        self.compute_selectors()
        # This list will contain the complex-class pairs that will represent the rules found by the CN2 algorithm.
        rule_list = []
        classes = self.data.loc[:, [list(self.data)[-1]]]
        classes_count = classes.iloc[:,0].value_counts() # the number of each element in the class 
        total = 0
        while len(self._E) > 0:
            best_cpx = self.find_best_complex()
            print("****************************************************")
            print(best_cpx)
            print("****************************************************")
            if best_cpx is not None:
                # covered_examples - the intersection  table between _E and best_cpx
                covered_examples = self.get_covered_examples(self._E, best_cpx)
                # most_common_class -> number of classes   
                most_common_class, count = self.get_most_common_class(covered_examples)
                self._E = self.remove_examples(self._E, covered_examples)
                print("*************************************************")
                print(self._E)
                print("*************************************************")
                total = 0
                if most_common_class in classes_count.keys():
                    total = classes_count[most_common_class]
                coverage = count / total
                # Precision: how many covered examples belong to the most common class
                precision = count / len(covered_examples)

                rule_list.append((best_cpx, most_common_class, coverage, precision))
            else:
                break

        most_common_class, count = self.get_most_common_class(self.data.index)
        total = classes_count[most_common_class]
        coverage = count / total
        precision = count / len(self.data)
        rule_list.append((None, most_common_class, coverage, precision))

        return rule_list
    
    #*******************************************************************************************#
                                            # Get most Commen Class #
    #*****************************************************************************************#
    def get_most_common_class(self, covered_examples):
        '''
        Returns the most common class among the examples received as parameter. It assumes that the class is the last
        attribute of the examples.
        :param covered_examples: Pandas DataFrame containing the examples from which we want to find the most common
        class.
        :return: label of the most common class.
        '''
        classes = self.data.loc[covered_examples, [list(self.data)[-1]]]
        most_common_class = classes.iloc[:,0].value_counts().index[0]
        count = classes.iloc[:,0].value_counts()[0]
        return most_common_class, count
    
    #********************************************************************************************
                                            # remove_examples #
    #*****************************************************************************************#
    def remove_examples(self, all_examples, indexes):
        '''
        Removes from the dataframe of the remaining examples, the covered examples with the indexes received as parameter.
        :param all_examples: the dataframe from which we want to remove the examples.
        :param indexes: list of index labels that identify the instances to remove.
        :return: the remaining examples after removing the required examples.
        '''
        remaining_examples = all_examples.drop(indexes)
        return remaining_examples

    #******************************************************************************************#
                                           # Search Algo Beam Search #
    #*****************************************************************************************#
    
    # before we need to get all the selectors 
    #--------------------------------------------------------------------------------
    def compute_selectors(self):
        """
        This function computes the selectors from the input data, which are
        the pairs attribute-value, excluding the class attribute.
        Assumption: the class attribute is the last attribute of the dataset.
        """
        attributes = list(self.data)

        # removing the class attribute
        del attributes[-1]

        for attribute in attributes:
            possible_values = set(self.data[attribute])
            for value in possible_values:
                self._selectors.append((attribute, value))
     #-----------------------------------------------------------------------------------
     # also we need to specialize the star
    def specialize_star(self, star, selectors):
        '''
        This function creates a new_star list by combining the complexes in star with the selectors, 
        and removing the non-valid complexes created.
        :param star: the list of complexes to be specialized
        :param selectors: the list of selector with which to specialize star
        :return: the new_star list with the specialized complexes
        '''
        new_star = []
        if len(star) > 0:
            for complex in star:
                for selector in selectors:
                    new_complex = complex.copy()
                    new_complex.append(selector)

                    # Add the new complex only if they are valid
                    count = collections.Counter([x[0] for x in new_complex])
                    duplicate = False
                    for c in count.values():
                        if c > 1:
                            duplicate = True
                            break
                    if not duplicate:
                        new_star.append(new_complex)
        else:
            for selector in selectors:
                new_star.append([selector])
        return new_star
    
    #------------------------------------------------------------------------------------------
    # define the covered examples
    def get_covered_examples(self, all_examples, best_cpx):
        '''
        Returns the indexes of the examples from the list of all examples that are covered by the complex.
        :param all_examples: the dataframe from which we want to find the covered examples.
        :param best_cpx: list of attribute-value tuples.
        :return: the indexes of the covered examples.
        '''
        # Creating a dictionary with the attributes of the best complex as key, and the values of that attribute as a
        # list of values. Then, add all the possible values for the attributes that are not part of the rules of the
        # best complex.
        values = dict()
        [values[t[0]].append(t[1]) if t[0] in list(values.keys()) else values.update({t[0]: [t[1]]}) for t in best_cpx]
        for attribute in list(self.data):
            if attribute not in values:
                values[attribute] = set(self.data[attribute])

        # Getting the indexes of the covered examples
        covered_examples = all_examples[all_examples.isin(values).all(axis=1)]
        return covered_examples.index
    
    # also we need a function to calculate the significance
    def significance(self, tested_complex):
        '''
        This function computes the significance of a complex
        :param tested_complex: the complex for which we want to compute the significance.
        :return: the entropy of the significance.
        '''
        covered_examples = self.get_covered_examples(self._E, tested_complex)
        
        classes = self.data.loc[covered_examples, [list(self.data)[-1]]]
        
        covered_num_instances = len(classes)
        covered_counts = classes.iloc[:,0].value_counts()
        covered_probs = covered_counts.divide(covered_num_instances)

        train_classes = self.data.iloc[:,-1]
        
        
        train_num_instances = len(train_classes)
        train_counts = train_classes.value_counts()
        train_probs = train_counts.divide(train_num_instances)

        significance = covered_probs.multiply(np.log(covered_probs.divide(train_probs))).sum() * 2
        print('classes',classes)
        print('covered_counts',covered_counts)
        print('significance',significance)
        return significance
    #------------------------------------------------------------------------------------------------
    # function to calculate the entropy of the significant complex
    def entropy(self, tested_complex):
        '''
        This function computes the entropy of a complex
        :param tested_complex: the complex for which we want to compute the entropy.
        :return: the entropy of the complex.
        '''
        covered_examples = self.get_covered_examples(self._E, tested_complex)
        classes = self.data.loc[covered_examples, [list(self.data)[-1]]]
        num_instances = len(classes)
        class_counts = classes.iloc[:,0].value_counts()
        class_probabilities = class_counts.divide(num_instances)
        log2 = np.log2(class_probabilities)
        plog2p = class_probabilities.multiply(log2)
        entropy = plog2p.sum() * -1

        return entropy
    
    def find_best_complex(self):
        '''
        This function finds the best complex by continuously specializing the list of 
        the best complex found so far and
        updating the best complex if the new complex found has a lower entropy than the previous one.
        The function keeps searching until the best complex has an accepted significance level.
        :return: the best complex found.
        '''
       
        
        best_complex = None
        best_complex_entropy = float('inf')
        best_complex_significance = 0
        star = []
        
        while True:
            
            entropy_measures = {
                
            }
            
            print('star',star)
            new_star = self.specialize_star(star, self._selectors)
            print("**************************************************************")
            print('****************The new_star is specilized at each level*********************')
            print('new_star',new_star)
            print("**************************************************************")
            for idx in range(len(new_star)):
                tested_complex = new_star[idx]
                significance = self.significance(tested_complex)
                if significance > self.min_significance:
                    entropy = self.entropy(tested_complex)
                    entropy_measures[idx] = entropy
                    
                    if entropy < best_complex_entropy:
                        best_complex = tested_complex.copy()
                        best_complex_entropy = entropy
                        best_complex_significance = significance
            top_complexes = sorted(entropy_measures.items(), key=lambda x: x[1], reverse=False)[:self.star_max_size]
            
            star = [new_star[x[0]] for x in top_complexes]
            
            if len(star) == 0 or best_complex_significance < self.min_significance:
                break
        
        print('************************************Report********************')        
     
        print("best_complex",best_complex)
        print('************************************Report-best-complex********************')
        print("best_complex:,",best_complex)
        print("best_complex_entropy:,",best_complex_entropy)
        print("best_complex_significance:,",best_complex_significance)
        print('*************************************End report ********************')
        return best_complex  
        
    #*********************************************************************************************#
    def print_rules(self, rules):
        '''
        This function prints the rules received as parameter in an understandable way.
        It also prints the coverage and precision of each rule.
        :param rules: the rules that have to be printed.
        '''
        rule_string = ''
        for rule in rules:
            complex = rule[0]
            complex_class = rule[1]
            coverage = rule[2]
            precision = rule[3]

            if complex is not None:
                for idx in range(len(complex)):
                    if idx == 0:
                        rule_string += 'If '
                    rule_string += str(complex[idx][0]) + '=' + str(complex[idx][1])
                    if idx < len(complex)-1:
                        rule_string += ' and '
                rule_string += ', then class=' + complex_class + ' [covered examples = ' + str(coverage) + ', precision = ' \
                               + str(precision) + ']'
            else:
                rule_string += 'Default: class=' + complex_class + ' [covered examples = ' + str(coverage) + ', precision = ' \
                               + str(precision) + ']'
            print(rule_string)
            rule_string = ''
        
        
        
        
        

In [78]:
if __name__ == "__main__":

    # Loin DataSet 
    print('------------------------------')
    print('------------------------------')
    print('Loan-Creadit')
    print('------------------------------')
    cn2 = CN2(2,1)
    train_start = time.time()
    rules = cn2.fit('loan-creadit.csv')
    train_end = time.time()
    print('Training time: ', train_end-train_start, ' s')
    print('Rules:')
    cn2.print_rules(rules)

    with open('Result CN2', 'wb') as f:
        pickle.dump(rules, f)

    # These two lines can be used to load a previously computed set of rules.
    # with open('../Data/output/zoo_rules', 'rb') as f:
    #     rules = pickle.load(f)

#     rules_performance, accuracy = cn2.predict('zoo_test.csv', rules)
#     print('Accuracy: ', accuracy)
#     print('Testing performance:')
#     keys = []
#     vals = []
#     for data in rules_performance:
#         val = []
#         for k, v in data.items():
#             keys.append(k)
#             val.append(v)
#         vals.append(val)

#     table = pd.DataFrame([v for v in vals], columns=list(dict.fromkeys(keys)))
#     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#         print(table)
#     table.to_csv('../Data/output/zoo_performance.csv')

------------------------------
------------------------------
Loan-Creadit
------------------------------
star []
**************************************************************
****************The new_star is specilized at each level*********************
new_star [[(' credit histroy ', 'unknown')], [(' credit histroy ', 'bad')], [(' credit histroy ', 'good')], [(' debt level ', 'low')], [(' debt level ', 'high')], [(' collateral ', 'none')], [(' collateral ', 'adequate')], [(' Income Level ', 'low')], [(' Income Level ', 'medium')], [(' Income Level ', 'high')]]
**************************************************************
classes    Risk Level
1        high
2    moderate
3        high
4         low
5         low
covered_counts high        2
low         2
moderate    1
Name:  Risk Level, dtype: int64
significance 0.007871502461261039
classes     Risk Level
0         high
6         high
7     moderate
13        high
covered_counts high        3
moderate    1
Name:  Risk Level, dtype: i

classes     Risk Level
8          low
11    moderate
12         low
covered_counts low         2
moderate    1
Name:  Risk Level, dtype: int64
significance 1.1267609136166845
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Level
11    moderate
covered_counts moderate    1
Name:  Risk Level, dtype: int64
significance 3.080890081894298
classes     Risk Level
8          low
9          low
12         low
covered_counts low    3
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
star [[(' Income Level ', 'high'), (' credit histroy ', 'unknown')], [(' Income Level ', 'high'), (' credit histroy ', 'bad')]]
**************************************************************
****************The new_star is specilized at each level*********************
new_star [[('

classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes    Risk Level
7    moderate
covered_counts moderate    1
Name:  Risk Level, dtype: int64
significance 3.080890081894298
classes     Risk Level
8          low
9          low
12         low
covered_counts low    3
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes    Risk Level
7    moderate
8         low
covered_counts moderate    1
low         1
Name:  Risk Level, dtype: int64
significance 1.1837700970084166
classes     Risk Level
9          low
12         low
covered_counts low    2
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes     Risk Level
8          low
12         low
covered_counts low    2
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes    Risk Level
7    moderate
9         low
covered_counts moderate    1
low         1
Name:  Risk Level, dtype: int64
significance 1.1837

classes    Risk Level
2    moderate
covered_counts moderate    1
Name:  Risk Level, dtype: int64
significance 3.080890081894298
classes    Risk Level
7    moderate
covered_counts moderate    1
Name:  Risk Level, dtype: int64
significance 3.080890081894298
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Level
9          low
11    moderate
12         low
covered_counts low         2
moderate    1
Name:  Risk Level, dtype: int64
significance 1.1267609136166845
classes     Risk Level
11    moderate
12         low
covered_counts moderate    1
low         1
Name:  Risk Level, dtype: int64
significance 1.1837700970084166
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Le

classes     Risk Level
9          low
11    moderate
12         low
13        high
covered_counts low         2
moderate    1
high        1
Name:  Risk Level, dtype: int64
significance 0.14404932616849864
classes     Risk Level
11    moderate
12         low
13        high
covered_counts moderate    1
low         1
high        1
Name:  Risk Level, dtype: int64
significance 0.08101696834078773
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Level
11    moderate
13        high
covered_counts moderate    1
high        1
Name:  Risk Level, dtype: int64
significance 1.001448540214462
classes     Risk Level
9          low
12         low
covered_counts low    2
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
star [[(' credit histroy ', 'bad')], [(' collate

covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes    Risk Level
9         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
star [[(' collateral

classes     Risk Level
12         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
star [[(' Income Level ', 'medium'), (' credit histroy ', 'good')], [(' Income Level ', 'medium'), (' debt level ', 'high')]]
**************************************************************
****************The new_star is specilized at each level*********************
new_star [[(' Income Level ', 'medium'), (' credit histroy ', 'good'), (' debt level ', 'low')], [(' Income Level ', 'medium'), (' credit histroy ', 'good'), (' debt level ', 'high')], [(' Income Level ', 'medium'), (' credit histroy ', 'good'), (' collateral ', 'none')], [(' Income Level ', 'medium'), (' credit histroy ', 'good'), (' collateral ', 'adequate')], [(' Income Level ', 'medium'), (' debt level ', 'high'), (' credit histroy ', 'unknown')], [(' Income Level '

classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Level
12         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes Empty DataFrame
Columns: [ Risk Level]
Index: []
covered_counts Series([], Name:  Risk Level, dtype: int64)
significance 0.0
classes     Risk Level
12         low
covered_counts low    1
Name:  Risk Level, dtype: int64
significance 2.0592388343623163
classes     Risk Level
12         low
covered_counts low    1
Name:  Risk Level, dt