In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import KBinsDiscretizer


In [33]:
# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(2)

# Select the dataset to run on by position in the list of available sizes.
dataset_size = ["small", "medium", "large"][2]

# Per-dataset settings: CSV base name, target column and columns to discard.
dataset_info = {
    "small": {"dataset_name": "wine", "class_name": "Class", "drop_fields": []},
    "medium": {
        "dataset_name": "breast-cancer-wisconsin",
        "class_name": "Class",
        "drop_fields": ["Sample code number"],
    },
    "large": {"dataset_name": "abalone", "class_name": "Rings", "drop_fields": []},
}

dataset_name, class_name, drop_fields = (
    dataset_info[dataset_size][key]
    for key in ("dataset_name", "class_name", "drop_fields")
)

# Load, discard the unwanted columns, then shuffle the rows.
df = pd.read_csv(f"../data/{dataset_name}.csv").drop(columns=drop_fields)
df = df.iloc[np.random.permutation(len(df))]

# First 80% of the shuffled rows for training, the remainder for testing.
n_cut = int(0.8 * len(df))
df_trn, df_tst = df.iloc[:n_cut], df.iloc[n_cut:]

# Separate the features from the target column in each split.
X_trn, y_trn = df_trn.drop(columns=class_name), df_trn[class_name]
X_tst, y_tst = df_tst.drop(columns=class_name), df_tst[class_name]


## Clean data

In [34]:
if dataset_name == "wine":
    # Print the per-column NaN count of each split, each followed by a blank gap.
    for heading, features in (
        ("NaN values (Train):", X_trn),
        ("NaN values (Test):", X_tst),
    ):
        print(heading)
        print(np.isnan(features).sum())
        print('\n')

In [35]:
if dataset_name == "breast-cancer-wisconsin":
    # "?" entries in "Bare Nuclei" are replaced by the most frequent value
    # observed in the training split; the same value is reused for the test split.
    aux_col = X_trn["Bare Nuclei"]
    values, counts = np.unique(aux_col[aux_col != "?"].astype(int), return_counts=True)
    most_frequent_value = values[np.argmax(counts)]

    replacement = {"?": str(most_frequent_value)}

    aux_col = aux_col.replace(replacement)
    X_trn["Bare Nuclei"] = aux_col.to_numpy().astype(int)

    X_tst["Bare Nuclei"] = X_tst["Bare Nuclei"].replace(replacement).astype(int)
    

In [36]:
if dataset_name == "abalone":
    # For each split: NaN counts of the non-'Sex' columns, then the distinct
    # values taken by the 'Sex' column.
    for nan_heading, unique_heading, features in (
        ("NaN values (Train):", "Unique values of categorical variables (Train):", X_trn),
        ("NaN values (Test):", "Unique values of categorical variables (Test):", X_tst),
    ):
        print(nan_heading)
        print(np.isnan(features.drop(columns=['Sex'])).sum())
        print(unique_heading)
        print(features["Sex"].unique())
        print('\n')


NaN values (Train):
Length            0
Diameter          0
Height            0
Whole height      0
Shucked height    0
Viscera height    0
Shell weight      0
dtype: int64
Unique values of categorical variables (Train):
['F' 'M' 'I']


NaN values (Test):
Length            0
Diameter          0
Height            0
Whole height      0
Shucked height    0
Viscera height    0
Shell weight      0
dtype: int64
Unique values of categorical variables (Test):
['I' 'F' 'M']




In [37]:
# Summarise the training features: column names, non-null counts and dtypes.
X_trn.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 265 to 409
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             3341 non-null   object 
 1   Length          3341 non-null   float64
 2   Diameter        3341 non-null   float64
 3   Height          3341 non-null   float64
 4   Whole height    3341 non-null   float64
 5   Shucked height  3341 non-null   float64
 6   Viscera height  3341 non-null   float64
 7   Shell weight    3341 non-null   float64
dtypes: float64(7), object(1)
memory usage: 234.9+ KB


In [38]:
def discretize_df(X_trn, X_tst, n_bins=10):
    """Replace integer/float feature columns with uniform-width bin labels.

    The bin edges are fitted on ``X_trn`` only and then applied to ``X_tst``.
    Bin indices are stored as strings (``'0'`` .. ``str(n_bins - 1)``) so that
    discretized columns look like any other categorical column downstream.

    Parameters
    ----------
    X_trn, X_tst : pandas.DataFrame
        Training and test features sharing the same columns.
    n_bins : int, default 10
        Number of bins per column passed to ``KBinsDiscretizer``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Discretized copies of ``X_trn`` and ``X_tst``; the inputs are left
        untouched. Columns that are neither integer nor float are returned
        unchanged.
    """
    # Work on copies so re-running the calling cell never sees half-converted inputs.
    X_trn, X_tst = X_trn.copy(), X_tst.copy()

    # Dtype-family checks instead of `== float` / `== int`, which only match the
    # platform default widths and silently skip e.g. int32 or float32 columns.
    cols_to_discretize = [
        col for col in X_trn.columns
        if pd.api.types.is_integer_dtype(X_trn[col]) or pd.api.types.is_float_dtype(X_trn[col])
    ]

    if not cols_to_discretize:
        return X_trn, X_tst

    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Selecting with a list of columns always yields a 2-D array, even for a
    # single column, so no reshaping is required before fitting.
    trn_bins = est.fit_transform(X_trn[cols_to_discretize].to_numpy())
    tst_bins = est.transform(X_tst[cols_to_discretize].to_numpy())

    X_trn[cols_to_discretize] = trn_bins.astype(int).astype(str)
    X_tst[cols_to_discretize] = tst_bins.astype(int).astype(str)

    return X_trn, X_tst
    

In [39]:
# Discretize the numeric feature columns of both splits (default number of bins).
X_trn, X_tst = discretize_df(X_trn, X_tst)


In [40]:
# Preview the first rows of the training features after discretization.
X_trn.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole height,Shucked height,Viscera height,Shell weight
265,F,5,5,1,2,2,2,1
642,F,7,6,1,3,2,2,3
3267,F,4,4,1,1,1,1,1
2843,M,7,6,1,4,3,2,3
521,F,3,3,0,0,0,0,0


## Algorithm

In [41]:
def compute_p_delta_over_alpha(X, y, values_x, value_y):
    """Estimate how often rows matching ``values_x`` carry the label ``value_y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Labels, positionally aligned with ``X``.
    values_x : dict
        ``{column: value}`` equality conditions that a row must all satisfy.
    value_y : object
        Label of interest.

    Returns
    -------
    (float, int)
        The fraction of matching rows whose label equals ``value_y`` and the
        number of such rows. When no row matches the conditions the result is
        ``(nan, 0)``.
    """
    matches_conditions = np.ones(len(X), dtype=bool)
    for column, wanted in values_x.items():
        matches_conditions &= (X[column] == wanted).to_numpy()

    matches_conditions_and_label = (y == value_y).to_numpy() & matches_conditions

    n_covered = matches_conditions.sum()
    n_covered_with_label = matches_conditions_and_label.sum()

    if not n_covered > 0:
        return np.nan, 0

    return n_covered_with_label / n_covered, n_covered_with_label
    

In [42]:
def filter_dfs(X, y, rule, keep=True):
    """Select the rows of ``X`` and ``y`` that do (or do not) satisfy ``rule``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Labels, positionally aligned with ``X``.
    rule : dict
        ``{column: value}`` equality conditions; a row satisfies the rule when
        every condition holds.
    keep : bool, default True
        If true, return the rows that satisfy the rule; otherwise return the
        rows that do not.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The selected rows of ``X`` and the corresponding entries of ``y``.
    """
    satisfies_rule = np.ones(len(X), dtype=bool)
    for column, wanted in rule.items():
        satisfies_rule &= (X[column] == wanted).to_numpy()

    selected = satisfies_rule if keep else ~satisfies_rule
    return X[selected], y[selected]


In [43]:
# Distinct values observed in the training split for every feature column
# (a column named like the class is skipped).
attributes_unique_values = {}
for col_name in X_trn.columns:
    if col_name != class_name:
        attributes_unique_values[col_name] = X_trn[col_name].unique()

class_unique_values = y_trn.unique()

# Every (attribute, value) pair that can appear as a rule condition.
set_attribute_values = {
    (attribute_name, attribute_value)
    for attribute_name, unique_values in attributes_unique_values.items()
    for attribute_value in unique_values
}

print(attributes_unique_values, set_attribute_values, class_unique_values, sep='\n\n')

{'Sex': array(['F', 'M', 'I'], dtype=object), 'Length': array(['5', '7', '4', '3', '6', '8', '2', '9', '1', '0'], dtype=object), 'Diameter': array(['5', '6', '4', '3', '8', '2', '7', '9', '1', '0'], dtype=object), 'Height': array(['1', '0', '2', '9', '4'], dtype=object), 'Whole height': array(['2', '3', '1', '4', '0', '6', '5', '7', '8', '9'], dtype=object), 'Shucked height': array(['2', '1', '3', '0', '5', '4', '8', '6', '7', '9'], dtype=object), 'Viscera height': array(['2', '1', '0', '3', '4', '5', '7', '6', '9', '8'], dtype=object), 'Shell weight': array(['1', '3', '0', '7', '2', '5', '4', '6', '8', '9'], dtype=object)}

{('Shell weight', '1'), ('Length', '4'), ('Viscera height', '3'), ('Height', '4'), ('Shucked height', '7'), ('Shell weight', '8'), ('Shucked height', '6'), ('Shucked height', '9'), ('Shell weight', '0'), ('Diameter', '5'), ('Shucked height', '1'), ('Viscera height', '7'), ('Length', '3'), ('Viscera height', '6'), ('Whole height', '5'), ('Viscera height', '9'), ('Sh

In [44]:
# Training label distribution as (label, count) pairs, ordered by label.
values, counts = np.unique(y_trn, return_counts=True)

print(list(zip(values, counts)))

[(1, 1), (2, 1), (3, 13), (4, 46), (5, 93), (6, 215), (7, 311), (8, 455), (9, 547), (10, 499), (11, 390), (12, 207), (13, 171), (14, 98), (15, 80), (16, 58), (17, 49), (18, 31), (19, 26), (20, 22), (21, 9), (22, 5), (23, 8), (24, 2), (26, 1), (27, 2), (29, 1)]


In [45]:
# Induce rules one class at a time. For a given class, a rule is grown by
# repeatedly adding the (attribute, value) condition with the highest fraction
# of covered rows belonging to the class, until the covered rows are pure or
# no condition can be added. The rows covered by the finished rule are then
# removed and the process repeats while rows of the class remain.
#
# Result: all_Rs, a list of (conditions_dict, class_value, n_instances) tuples.
all_Rs = []

for class_value in class_unique_values:
    print(f'Using class value {class_value}')

    all_Rs_class = []

    # Rows not yet covered by a rule of this class. The helpers return new
    # objects and never mutate their inputs, so no defensive copies are needed.
    X_, y_ = X_trn, y_trn

    still_instances_delta_n = True
    while still_instances_delta_n:

        # Every new rule starts from all still-uncovered rows.
        X_rule, y_rule = X_, y_
        Rule = {}
        set_attributes_not_used = set_attribute_values.copy()

        creating_rule = True
        while creating_rule:
            # Score each unused condition on the rows the partial rule covers;
            # conditions covering no row (NaN score) are not candidates.
            all_ps = []
            for (attribute_name, attribute_value) in set_attributes_not_used:
                values_x = {attribute_name: attribute_value}
                p_delta_alpha, n_delta_alpha = compute_p_delta_over_alpha(X_rule, y_rule, values_x, class_value)
                if not np.isnan(p_delta_alpha):
                    all_ps.append((attribute_name, attribute_value, (p_delta_alpha, n_delta_alpha)))

            if all_ps:
                # Best = highest (p, n). Remaining ties are resolved by the
                # attribute name/value text so the outcome does not depend on
                # set iteration order, which changes between kernel sessions.
                rule_attribute_name, rule_attribute_value, _ = max(
                    all_ps, key=lambda tup: (tup[-1], str(tup[0]), str(tup[1]))
                )

                Rule[rule_attribute_name] = rule_attribute_value
                set_attributes_not_used.remove((rule_attribute_name, rule_attribute_value))

                X_rule, y_rule = filter_dfs(X_rule, y_rule, {rule_attribute_name: rule_attribute_value}, keep=True)

                cond_y_rule = (y_rule == class_value).to_numpy()
                if cond_y_rule.all():
                    # Every covered row belongs to the class: the rule is complete.
                    creating_rule = False
                    all_Rs_class.append((Rule, class_value, len(cond_y_rule)))
                    X_, y_ = filter_dfs(X_, y_, Rule, keep=False)

            else:
                # No condition can refine the rule any further. Close it even
                # though it is impure, recording only the rows of the class.
                # Stopping whenever the candidates run out (rather than only
                # when every column has been used) guarantees termination.
                creating_rule = False
                n_aux = (y_rule == class_value).to_numpy().sum()
                all_Rs_class.append((Rule, class_value, n_aux))
                X_, y_ = filter_dfs(X_, y_, Rule, keep=False)

        # Stop once no uncovered row of this class is left.
        if not (y_ == class_value).to_numpy().any():
            all_Rs.extend(all_Rs_class)
            still_instances_delta_n = False

Using class value 11
Using class value 19
Using class value 10
Using class value 5
Using class value 18
Using class value 9
Using class value 7
Using class value 6
Using class value 4
Using class value 22
Using class value 8
Using class value 12
Using class value 16
Using class value 13
Using class value 23
Using class value 14
Using class value 15
Using class value 17
Using class value 20
Using class value 3
Using class value 1
Using class value 27
Using class value 26
Using class value 29
Using class value 21
Using class value 2
Using class value 24
[({'Whole height': '9', 'Sex': 'F'}, 11, 2), ({'Shucked height': '8', 'Whole height': '7'}, 11, 1), ({'Shucked height': '8', 'Whole height': '9'}, 11, 1), ({'Shucked height': '5', 'Length': '7', 'Diameter': '8'}, 11, 4), ({'Shucked height': '5', 'Viscera height': '6', 'Whole height': '6'}, 11, 4), ({'Shucked height': '5', 'Shell weight': '3', 'Whole height': '7'}, 11, 1), ({'Shucked height': '5', 'Shell weight': '3', 'Whole height': '6'},

In [46]:
# Print every induced rule as "attr == value && ... --> class (n_instances = k)".
for rule in all_Rs:
    rule_attribute_value, class_value, rule_n = rule
    # f-strings format any value type, so non-string attribute values no longer
    # raise TypeError; join() avoids trimming a trailing separator by hand.
    conditions = ' && '.join(
        f'{attribute_name} == {attribute_value}'
        for attribute_name, attribute_value in rule_attribute_value.items()
    )
    print(f'{conditions} --> {class_value} (n_instances = {rule_n})')

Whole height == 9 && Sex == F --> 11 (n_instances = 2)
Shucked height == 8 && Whole height == 7 --> 11 (n_instances = 1)
Shucked height == 8 && Whole height == 9 --> 11 (n_instances = 1)
Shucked height == 5 && Length == 7 && Diameter == 8 --> 11 (n_instances = 4)
Shucked height == 5 && Viscera height == 6 && Whole height == 6 --> 11 (n_instances = 4)
Shucked height == 5 && Shell weight == 3 && Whole height == 7 --> 11 (n_instances = 1)
Shucked height == 5 && Shell weight == 3 && Whole height == 6 --> 11 (n_instances = 1)
Shucked height == 5 && Shell weight == 3 && Sex == M && Viscera height == 4 && Diameter == 8 && Height == 1 && Length == 8 && Whole height == 5 --> 11 (n_instances = 4)
Shucked height == 5 && Whole height == 6 && Diameter == 9 --> 11 (n_instances = 2)
Shucked height == 5 && Viscera height == 4 && Length == 9 --> 11 (n_instances = 1)
Whole height == 5 && Viscera height == 6 --> 11 (n_instances = 3)
Viscera height == 5 && Diameter == 6 && Length == 7 --> 11 (n_instances 

In [47]:
def predict(X, rules, y_trn, dtype=str):
    """Predict a class for every row of ``X`` from a list of rules.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to classify.
    rules : iterable of (dict, object, int)
        ``(conditions, class_value, n_instances)`` tuples. A rule applies to a
        row when the row equals ``value`` in ``column`` for every
        ``column: value`` pair of ``conditions``.
    y_trn : pandas.Series
        Training labels; their most frequent value is predicted for rows that
        no rule applies to.
    dtype : type or numpy dtype, default str
        Dtype of the returned Series. ``str`` yields an object Series holding
        the class labels unchanged (they are not converted to text).

    Returns
    -------
    pandas.Series
        Predictions indexed like ``X``. Among the applicable rules the one with
        the largest ``n_instances`` wins; on equal counts the earliest rule in
        ``rules`` is kept.
    """
    # Series.mode() returns a Series (several entries when frequencies tie),
    # so reduce it to a single scalar once instead of assigning it per row.
    modes = y_trn.mode()
    default_class = modes.iloc[0] if len(modes) else np.nan

    predicted = []
    for _, row in X.iterrows():
        best_n = -1
        best_class = default_class
        for conditions, class_value, rule_n in rules:
            applies = all(row[name] == value for name, value in conditions.items())
            if applies and rule_n > best_n:
                best_n = rule_n
                best_class = class_value
        predicted.append(best_class)

    # Building the Series in one go avoids writing NaN placeholders into an
    # integer-typed Series, which relied on silent upcasting.
    series_dtype = object if dtype is str else dtype
    return pd.Series(predicted, index=X.index, dtype=series_dtype)
    

In [48]:
# Classify the test split with the induced rules, requesting the dtype of the true labels.
y_tst_hat = predict(X_tst, all_Rs, y_trn, dtype=y_tst.dtype)


In [59]:
# Show the distinct true labels of the test split, then the distinct predicted labels.
print(y_tst.unique())
y_tst_hat.astype(int).unique()


[ 9 10 11 18  7  8 14 15  5 17 13  6 19 21 20 16 12  4 23  3 25 22]


array([ 5,  9, 10, 11,  8, 13, 12,  6, 18,  7, 14, 19, 16,  4, 22, 15, 17,
       24,  3])

In [54]:
# Cast the predictions once so that all four metrics are computed on exactly
# the same integer labels.
y_tst_pred = y_tst_hat.astype(int)

# Some labels are never predicted (or never occur in the test split), which
# makes their per-class precision/recall undefined. zero_division=0 scores
# those classes as 0 explicitly instead of emitting a warning.
acc = accuracy_score(y_tst, y_tst_pred)
precision = precision_score(y_tst, y_tst_pred, average='weighted', zero_division=0)
recall = recall_score(y_tst, y_tst_pred, average='weighted', zero_division=0)
f1 = f1_score(y_tst, y_tst_pred, average='weighted', zero_division=0)

print(f'Accuracy: {acc}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Accuracy: 0.22607655502392343
Precision: 0.21151488061117318
Recall: 0.22607655502392343
F1: 0.2091336764051758


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
