# OneR Classification
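- predict whether a car is desirable to purchase with 71.1% test accuracy (note: the fitted rule predicts `unacc` for every car, so this figure equals the majority-class rate, 307 of 432 test samples)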

data source: <a href='car.data'>archive.ics.uci.edu/ml/datasets/</a>

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import defaultdict
from operator import itemgetter
import pandas as pd
import numpy as np

In [2]:
# Load the dataset description file, then display it once the handle is closed.
with open('car.names') as fhand:
    doc = fhand.read()
print(doc)

1. Title: Car Evaluation Database

2. Sources:
   (a) Creator: Marko Bohanec
   (b) Donors: Marko Bohanec   (marko.bohanec@ijs.si)
               Blaz Zupan      (blaz.zupan@ijs.si)
   (c) Date: June, 1997

3. Past Usage:

   The hierarchical decision model, from which this dataset is
   derived, was first presented in 

   M. Bohanec and V. Rajkovic: Knowledge acquisition and explanation for
   multi-attribute decision making. In 8th Intl Workshop on Expert
   Systems and their Applications, Avignon, France. pages 59-78, 1988.

   Within machine-learning, this dataset was used for the evaluation
   of HINT (Hierarchy INduction Tool), which was proved to be able to
   completely reconstruct the original hierarchical model. This,
   together with a comparison with C4.5, is presented in

   B. Zupan, M. Bohanec, I. Bratko, J. Demsar: Machine learning by
   function decomposition. ICML-97, Nashville, TN. 1997 (to appear)

4. Relevant Information Paragraph:

   Car Evaluation Database was 

# DataFrame View
- first 5 rows

In [3]:
# The data file is read with header=None, so the column labels are supplied here.
dfile = 'car.data'
names = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]
df = pd.read_csv(dfile, names=names, header=None)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Discretization

In [4]:
# Explicit ordinal code tables for the categorical attributes.
# They are written as literals (rather than seeded from the column contents) so
# that a value missing from a table is an error, never a silent None.
buy = {"vhigh": 0, "high": 1, "med": 2, "low": 3}
maint = buy  # the maintenance column is encoded with the same table as buying
boot = {"small": 0, "med": 1, "big": 2}
safe = {"low": 0, "med": 1, "high": 2}


def encode_ordinal(column, codes):
    """Return `column` with every value replaced by its integer code.

    Parameters
    ----------
    column : pandas.Series
        Categorical column to encode.
    codes : dict
        Mapping from category value to integer code.

    Raises
    ------
    ValueError
        If `column` holds a value that is not a key of `codes`. This also
        triggers when the cell is re-run on an already-encoded frame, so the
        data is never overwritten with None.
    """
    unknown = set(column.unique()) - set(codes)
    if unknown:
        raise ValueError(
            "unexpected values in column {!r}: {}".format(
                column.name, sorted(unknown, key=str)
            )
        )
    return column.apply(lambda k: codes[k])


def predicate(n):
    """Parse a count string; any non-numeric token is collapsed to 5.

    NOTE(review): presumably the non-numeric tokens are the dataset's
    open-ended "or more" categories -- confirm against car.names.
    """
    return int(n) if n.isnumeric() else 5


df["buying"] = encode_ordinal(df["buying"], buy)
df["maint"] = encode_ordinal(df["maint"], maint)

df["doors"] = df["doors"].apply(predicate)
df["persons"] = df["persons"].apply(predicate)

df["lug_boot"] = encode_ordinal(df["lug_boot"], boot)
df["safety"] = encode_ordinal(df["safety"], safe)

# Spot-check one encoded row.
df.iloc[1]

buying          0
maint           0
doors           2
persons         2
lug_boot        0
safety          1
class       unacc
Name: 1, dtype: object

In [5]:
# Feature matrix: every column except the label. Label vector: the 'class' column.
X = df.drop(columns=["class"]).values
y = df["class"].values

# OneR Algorithm

In [6]:
def train_feature_value(X, y_true, feature_index, value):
    """Find the majority class among samples whose feature equals `value`.

    Parameters
    ----------
    X : sequence of indexable samples
    y_true : sequence of class labels, aligned with `X`
    feature_index : position of the feature inside each sample
    value : feature value that selects the samples to inspect

    Returns
    -------
    (most_frequent_class, error)
        The most common label among the selected samples and the number of
        selected samples that carry any other label.
    """
    # Tally the labels of the samples that carry the requested value.
    tally = defaultdict(int)
    for row, label in zip(X, y_true):
        if row[feature_index] == value:
            tally[label] += 1

    # Descending by count; the sort is stable, so on a tie the label that was
    # encountered first stays in front.
    ranked = sorted(tally.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = ranked[0][0]

    # Every selected sample outside the winning class would be mispredicted.
    error = sum(
        count for label, count in tally.items()
        if label != most_frequent_class
    )
    return most_frequent_class, error

def train_on_feature(X, y_true, feature_index):
    """Build the one-feature rule table for the column `feature_index`.

    For each distinct value found in that column of `X`, the majority class is
    obtained from `train_feature_value`.

    Returns
    -------
    (predictors, total_error)
        `predictors` maps each feature value to its predicted class;
        `total_error` is the sum of the per-value error counts.
    """
    predictors = {}
    total_error = 0
    for current_value in set(X[:, feature_index]):
        predictors[current_value], value_error = train_feature_value(
            X, y_true, feature_index, current_value
        )
        total_error += value_error
    return predictors, total_error

# Train and Test

In [7]:
# Hold out part of the data for evaluation. test_size is not passed, so
# scikit-learn's default split size applies; the fixed random_state keeps the
# shuffle (and therefore every number reported below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [8]:
# Fit one rule table per feature column and record its training error.
all_predictors, errors = {}, {}

for feature_index in range(X_train.shape[1]):
    all_predictors[feature_index], errors[feature_index] = train_on_feature(
        X_train, y_train, feature_index
    )

# Ascending by error; the sort is stable, so the lowest feature index wins ties.
ranked_features = sorted(errors.items(), key=itemgetter(1))
best_feature, best_error = ranked_features[0]

print('The best model is base on feature {0} and has {1:.2f} errors'.format(
        best_feature, best_error))


The best model is base on feature 0 and has 393.00 errors


In [9]:
# Bundle the winning feature index with its value -> class lookup table.
model = dict(feature=best_feature, predictor=all_predictors[best_feature])
print(model)

{'feature': 0, 'predictor': {0: 'unacc', 1: 'unacc', 2: 'unacc', 3: 'unacc'}}


In [10]:
def predict(X_test, model, default=None):
    """Predict a class for every sample using a single-feature rule table.

    Parameters
    ----------
    X_test : iterable of indexable samples
        Samples to classify.
    model : dict
        Must contain 'feature' (index of the feature to read from each sample)
        and 'predictor' (mapping from feature value to predicted class).
    default : optional
        Class returned for a feature value that is missing from the predictor
        table (i.e. not seen while training). When left as None, such a value
        raises KeyError, exactly as before this parameter existed.

    Returns
    -------
    numpy.ndarray
        One predicted class per sample, in input order.

    Raises
    ------
    KeyError
        If a sample's feature value is not in the table and no `default` is set.
    """
    var = model['feature']
    predictor = model['predictor']

    def lookup(sample):
        # The table is queried with a plain int, so feature values are expected
        # to be integer codes.
        key = int(sample[var])
        if default is None:
            return predictor[key]
        return predictor.get(key, default)

    return np.array([lookup(sample) for sample in X_test])

In [11]:
# Apply the single-feature rule table in `model` to the held-out samples.
predicted = predict(X_test, model)

In [12]:
# Share of held-out samples whose predicted label equals the true label, in percent.
matches = predicted == y_test
accuracy = np.mean(matches) * 100
print('The test accuracy is {:.1f}%'.format(accuracy))

The test accuracy is 71.1%


In [14]:
# Per-class precision / recall / F1 on the held-out set; this shows how each
# class is handled, which the single accuracy figure does not.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         acc       0.00      0.00      0.00        84
        good       0.00      0.00      0.00        19
       unacc       0.71      1.00      0.83       307
       vgood       0.00      0.00      0.00        22

   micro avg       0.71      0.71      0.71       432
   macro avg       0.18      0.25      0.21       432
weighted avg       0.51      0.71      0.59       432

