In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
df = pd.read_csv('./car_evaluation.csv')
df.columns = ['buying','maint','doors','persons','lug_boot','safety','class']
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [31]:
for col in df.columns:
    print(col, set(df[col].tolist()))

buying {'low', 'vhigh', 'med', 'high'}
maint {'low', 'vhigh', 'med', 'high'}
doors {'2', '3', '4', '5more'}
persons {'2', '4', 'more'}
lug_boot {'small', 'big', 'med'}
safety {'low', 'med', 'high'}
class {'vgood', 'acc', 'good', 'unacc'}


In [32]:
df['class'] = df['class'].replace(['acc', 'vgood', 'good'], 1)

In [33]:
df['class'] = df['class'].replace('unacc', 0)

  df['class'] = df['class'].replace('unacc', 0)


In [34]:
for col in df.columns:
    print(col, set(df[col].tolist()))

buying {'low', 'vhigh', 'med', 'high'}
maint {'low', 'vhigh', 'med', 'high'}
doors {'2', '3', '4', '5more'}
persons {'2', '4', 'more'}
lug_boot {'small', 'big', 'med'}
safety {'low', 'med', 'high'}
class {0, 1}


In [35]:
def candidate_elimination(df):
    features = df.columns[:-1]
    target_col = 'class'

    def consistent(hypothesis, example_attributes, label):
        for i in range(len(hypothesis)):
            if hypothesis[i] == 'Ø':
                return label == 0
            if hypothesis[i] != '?' and hypothesis[i] != example_attributes[i]:
                return label == 0
        return label == 1

    def more_general(h1, h2):
        for x, y in zip(h1, h2):
            if x != '?' and x != y:
                return False
        return True

    def less_general(h1, h2):
        for x, y in zip(h1, h2):
            if y == '?':
                continue
            if x != y:
                return False
        return True

    def minimal_generalizations(s, example):
        temp = s[:]
        for i in range(len(s)):
            if s[i] == 'Ø':
                temp[i] = example[i]
            elif s[i] != example[i]:
                temp[i] = '?'
        return [temp]

    def minimal_specializations(g, example, feature_values):
        specializations = []
        for i in range(len(g)):
            if g[i] == '?':
                # adds any other feature value, except the current feature of the example
                for val in feature_values[i]:
                    if val != example[i]:
                        temp = g[:]
                        temp[i] = val
                        specializations.append(temp)
            elif g[i] != example[i]:
                # directly adds null
                temp = g[:]
                temp[i] = 'Ø'
                specializations.append(temp)
        return specializations

    feature_values = [df[feature].unique().tolist() for feature in features]

    S = [['Ø'] * len(features)]
    G = [['?'] * len(features)]

    for _, row in df.iterrows():
        example = row[features].tolist()
        label = row[target_col]

        if label == 1:
            G = [g for g in G if consistent(g, example, 1)]

            S_new = []
            for s in S:
                if consistent(s, example, 1):
                    S_new.append(s)
                else:
                    for h in minimal_generalizations(s, example):
                        if consistent(h, example, 1) and any(more_general(g, h) for g in G):
                            S_new.append(h)
                            break

            S_minimal = []
            for h in S_new:
                remove = False
        
                if any(h != h2 and more_general(h, h2) for h2 in S_new):
                    remove = True 

                if not remove:
                    S_minimal.append(h)
                    
            S = S_minimal

        else:
            # Negative example
            S = [s for s in S if consistent(s, example, 0)]

            G_new = []
            for g in G:
                if consistent(g, example, 0):
                    G_new.append(g)
                else:
                    for h in minimal_specializations(g, example, feature_values):
                        if consistent(h, example, 0) and any(less_general(s, h) for s in S):
                            G_new.append(h)

            G_maximal = []
            for h in G_new:
                remove = False
                if any(h != h2 and less_general(h2, h) for h2 in G_new):
                    remove = True

                if not remove:
                    G_maximal.append(h)
 
            G = G_maximal

    return S, G


In [36]:
len(df)

1727

In [50]:
size=20
sample_df = df.sample(n=size, random_state=2825)
S, G = candidate_elimination(sample_df)
print("Most Specific Boundary:", S)
print("Most General Boundary:", G)

Most Specific Boundary: [['?', '?', '3', '4', 'big', '?']]
Most General Boundary: [['?', '?', '3', '4', 'big', '?']]


In [48]:
size = 30
sample_df = df.sample(n=size, random_state=582)
S, G = candidate_elimination(sample_df)
print("Most Specific Boundary:", S)
print("Most General Boundary:", G)

Most Specific Boundary: [['low', 'med', '4', 'more', 'med', 'high']]
Most General Boundary: [['low', '?', '4', 'more', '?', 'high'], ['?', 'med', '4', 'more', '?', 'high'], ['?', '?', '4', 'more', 'med', 'high'], ['low', 'med', '?', 'more', '?', '?'], ['low', '?', '?', 'more', 'med', '?']]


In [39]:
size = 50
sample_df = df.sample(n=size)
S, G = candidate_elimination(sample_df)
print("Most Specific Boundary:", S)
print("Most General Boundary:", G)

Most Specific Boundary: []
Most General Boundary: []
