<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>Rule Learning</strong></h1>

### <b>Author: Nguyen Dang Huynh Chau</b>

# ✴️ Importing Necessary Libraries and datasets
****

In [1]:
# import libraries which are pandas and numpy
import pandas as pd
import numpy as np

#for plots
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]= 15,10

#Libraries for plotting
# Modules for data visualization
import seaborn as sns
sns.set_theme(style="ticks", color_codes=True) #set theme in seaborn
# scatter matrix library
from pandas.plotting import scatter_matrix

#Libraries for feature scaling
from sklearn.preprocessing import StandardScaler

#Libraries for Validation
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 📲 Data Retrieving
***

> **Sailing Dataset**

In [2]:
sailing = pd.read_csv("Data/sailing-custom-python.csv", delimiter='\s', skipinitialspace = True)

sailing.columns = sailing.columns.str.replace(' ', '') #strip the extra-whitespaces out

print("The shape of the ORGINAL data is (row, column):", str(sailing.shape))

# drop Unnamed, it is just a number given to identify each house
sailing.head(3)

The shape of the ORGINAL data is (row, column): (17, 4)


  return func(*args, **kwargs)


Unnamed: 0,Outlook,Company,Sailboat,Sail
0,rainy,big,big,yes
1,rainy,big,small,yes
2,rainy,med,big,no


> **Zoo Dataset**

In [3]:
zoo = pd.read_csv("Data/zoo-python.csv", delimiter='\s', skipinitialspace = True)

zoo.columns = zoo.columns.str.replace(' ', '') #strip the extra-whitespaces out

print("The shape of the ORGINAL data is (row, column):", str(zoo.shape))

# drop Unnamed, it is just a number given to identify each house
zoo.head(3)

The shape of the ORGINAL data is (row, column): (101, 18)


  return func(*args, **kwargs)


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type,name
0,Yes,No,No,Yes,No,No,Yes,Yes,Yes,Yes,No,No,4.0,No,No,Yes,mammal,aardvark
1,Yes,No,No,Yes,No,No,No,Yes,Yes,Yes,No,No,4.0,Yes,No,Yes,mammal,antelope
2,No,No,Yes,No,No,Yes,Yes,Yes,Yes,No,No,Yes,0.0,Yes,No,No,fish,bass


# 🔈 Data Information
****

> **Sailing Dataset**

In [4]:
print ("The shape of the train data is (row, column):"+ str(sailing.shape))
print (sailing.info())

The shape of the train data is (row, column):(17, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Outlook   17 non-null     object
 1   Company   17 non-null     object
 2   Sailboat  17 non-null     object
 3   Sail      17 non-null     object
dtypes: object(4)
memory usage: 672.0+ bytes
None


> **Zoo Dataset**

In [5]:
print ("The shape of the train data is (row, column):"+ str(zoo.shape))
print (zoo.info())

The shape of the train data is (row, column):(101, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   hair      101 non-null    object 
 1   feathers  101 non-null    object 
 2   eggs      101 non-null    object 
 3   milk      101 non-null    object 
 4   airborne  101 non-null    object 
 5   aquatic   101 non-null    object 
 6   predator  101 non-null    object 
 7   toothed   101 non-null    object 
 8   backbone  101 non-null    object 
 9   breathes  101 non-null    object 
 10  venomous  101 non-null    object 
 11  fins      101 non-null    object 
 12  legs      101 non-null    float64
 13  tail      101 non-null    object 
 14  domestic  101 non-null    object 
 15  catsize   101 non-null    object 
 16  type      101 non-null    object 
 17  name      101 non-null    object 
dtypes: float64(1), object(17)
memory usage: 14.3+ KB
None


# Entropy Function:
****

In [6]:
import math

def entropy(data, target):
    vCounts = pd.value_counts(data[target])
    dataSize = data.shape[0] # or data[targetClass].size
    entropy_sum = 0
    for value in vCounts:
        proportion = value / dataSize
        entropy_sum = entropy_sum - (proportion * math.log(proportion, 2))
    return entropy_sum


In [7]:
entropy(sailing, 'Sail')

0.9975025463691153

In [8]:
entropy(zoo, 'type')

2.390559682294039

# Majority Class:
****

In [9]:
def majority_class_1(data, target):
    majority = 0
    cl = ''
    vCounts = pd.value_counts(data[target])
    for value in vCounts.axes[0]:
        count = vCounts[value]
        if count > majority:
            majority = count
            cl = value
    return cl
        
def majority_class_2(data, target):
    counts = pd.value_counts(data[target])
    max_name = counts.idxmax()
    return max_name


In [10]:
majority_class_1(zoo, 'type')

'mammal'

In [11]:
majority_class_2(zoo, 'type')

'mammal'

# Rule Learner:
****

In [15]:
def simpler_rule_learner(data, target):
    all_attributes = set(data.columns)
    all_attributes = all_attributes.difference([target,])
   
    while data.shape[0] > 0:
        if entropy(data, target) == 0:
            print("otherwise =>", majority_class(data,target))
            data = data.iloc[0:0]
        else:
            best_entropy = entropy(data, target)
            best_attribute = ''
            best_value = ''
            best_data = data
      
            for attribute in all_attributes:
                vCounts = pd.value_counts(data[attribute])
                for value in vCounts.axes[0]:
                    data2 = data.loc[data[attribute] == value]
                    if entropy(data2, target) < best_entropy:
                        best_entropy = entropy(data2, target)
                        best_attribute = attribute
                        best_value = value
                        best_data = data2
       
      
    print(best_attribute, '=', best_value, '=>', majority_class(best_data,target))
    data = data.loc[data[best_attribute] != best_value] 

In [None]:
simpler_rule_learner(zoo, 'type')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/huynhchau/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/l5/0ygc5m0x66xc7d4v2qzjjv0h0000gn/T/ipykernel_13396/127033035.py", line 1, in <module>
    simpler_rule_learner(zoo, 'type')
  File "/var/folders/l5/0ygc5m0x66xc7d4v2qzjjv0h0000gn/T/ipykernel_13396/1097770065.py", line 18, in simpler_rule_learner
    data2 = data.loc[data[attribute] == value]
  File "/Users/huynhchau/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 931, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/Users/huynhchau/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 1144, in _getitem_axis
    return self._getbool_axis(key, axis=axis)
  File "/Users/huynhchau/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 948, in _getbool_axis
    key =

TypeError: object of type 'NoneType' has no len()

In [None]:
simpler_rule_learner(sailing, 'Sail')