In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Attribute Information: (name of attribute and type of value domain)

    animal_name: Unique for each instance
    hair Boolean
    feathers Boolean
    eggs Boolean
    milk Boolean
    airborne Boolean
    aquatic Boolean
    predator Boolean
    toothed Boolean
    backbone Boolean
    breathes Boolean
    venomous Boolean
    fins Boolean
    legs Numeric (set of values: {0,2,4,5,6,8})
    tail Boolean
    domestic Boolean
    catsize Boolean
    class_type Numeric (integer values in range [1,7])


In [47]:
# Load the zoo dataset from disk and preview its first rows
data = pd.read_csv(filepath_or_buffer='data/zoo.csv')
data.head(n=5)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


The only non-boolean feature is `legs`. We'll convert it to a boolean: 0 if the animal has no legs and 1 if it has at least one leg.

In [48]:
# Binarize the leg count: 1 when the animal has at least one leg, 0 otherwise.
# Re-running this cell is harmless, since a value of 1 still maps to 1.
data["legs"] = data["legs"].gt(0).astype(int)
data["legs"].head()

0    1
1    1
2    0
3    1
4    1
Name: legs, dtype: int64

In [49]:
# Prepare the inputs for a model that predicts whether an animal has legs

from sklearn import svm
from sklearn.model_selection import train_test_split

# Features: every column except the animal identifier, the class label and the target itself
X = data.drop(columns=['animal_name', 'class_type', 'legs'])
# Target: the binarized `legs` column
y = data['legs']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [50]:
from sklearn.metrics.pairwise import manhattan_distances

# Define a custom metric function (Hamming distance)
def hamming_distance(y_true, y_pred):
    """Count the positions at which two label sequences differ.

    For 0/1 integer labels this equals the sum of absolute differences, but
    counting mismatches directly also works for boolean arrays (where numpy
    does not support the `-` operator), plain Python lists, unsigned integers
    (where subtraction would wrap around) and labels with more than two values.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Labels to compare against `y_true`, position by position.

    Returns
    -------
    integer
        Number of positions where the two inputs are not equal.
    """
    # Convert to plain arrays so the comparison is purely positional
    # (any pandas index on the inputs is ignored).
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    return np.sum(true_labels != pred_labels)


# Custom SVM kernel: negated Manhattan (L1) distance between samples
def hamming_kernel(X1, X2):
    """Return the negated pairwise Manhattan distances between rows of X1 and X2.

    For 0/1 features the Manhattan distance between two rows is the number of
    features on which they differ (their Hamming distance), so identical rows
    score 0 and the score decreases as rows differ on more features.

    Parameters
    ----------
    X1, X2 : array-like
        Sample matrices passed straight through to `manhattan_distances`.

    Returns
    -------
    The pairwise distance matrix produced by `manhattan_distances`, negated.
    """
    pairwise_l1 = manhattan_distances(X1, X2)
    return -pairwise_l1

# Fit on the training split only: fitting on the full X, y would let the model
# see the rows in X_test, and the score computed on X_test afterwards would
# then overstate how well the model generalizes.
clf = svm.SVC(kernel=hamming_kernel)
clf.fit(X_train, y_train)

# Evaluate the model on the test split.
# Accuracy is the fraction of correct predictions. For 0/1 labels, the Hamming distance
# between the true and the predicted labels is the number of wrong predictions, so
#   error rate = distance / number of test samples
#   accuracy   = 1 - error rate
# This is why the Hamming distance works as a performance metric for boolean classification.

1 - hamming_distance(y_true=y_test, y_pred=clf.predict(X_test)) / len(y_test)

In [52]:
# Just for fun: predict whether a human has legs (1 = has legs, 0 = no legs).
# The values are assumed to follow the column order of X, i.e. the CSV column order
# from the data preview minus animal_name, legs and class_type; verify against X.columns.
human_features = [
    1, 0, 0, 1, 0,  # hair, feathers, eggs, milk, airborne
    0, 1, 1, 1, 1,  # aquatic, predator, toothed, backbone, breathes
    0, 0, 0, 1, 0,  # venomous, fins, tail, domestic, catsize
]
clf.predict([human_features])

array([1])