# Machine Learning and Data Mining - Lab 2 - Davide Gallitelli

## Task 1 - Build a majority class classifier

In [1]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class MajorityClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that always predicts the most frequent training label.

    Attributes set by ``fit``:
        classes_: ordered array of the distinct labels found in ``y``.
        X_, y_: the validated training data. ``predict`` does not read them;
            they are kept so existing code that inspects them still works.
        majVote_: the label with the highest count in ``y``. On a tie the
            smallest label (in sorted order) is chosen.
    """

    def __init__(self):
        # NOTE(review): this print fires on every construction of the
        # estimator. It is kept because the notebook's recorded output shows
        # it; consider dropping it.
        print ("init Classifier")

    def fit(self, X, y):
        """Record the most frequent label of ``y``.

        Parameters
        ----------
        X : array-like, validated by ``check_X_y``
            Training samples. Only stored, never used for the decision.
        y : array-like, validated by ``check_X_y``
            Training labels.

        Returns
        -------
        self : MajorityClassifier
            The fitted classifier.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Use unique_labels to return an ordered array of the labels found
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        # Count label occurrences with np.unique rather than np.bincount:
        # bincount only accepts non-negative integers, so it fails on negative,
        # float or string labels. np.unique returns the labels sorted, and
        # argmax picks the first maximum, so ties resolve to the smallest
        # label exactly as bincount did.
        labels, counts = np.unique(self.y_, return_counts=True)
        self.majVote_ = labels[counts.argmax()]
        # Return the classifier
        return self

    def predict(self, X):
        """Predict the majority training label for every row of ``X``.

        Parameters
        ----------
        X : array-like, validated by ``check_array``
            Samples to label.

        Returns
        -------
        numpy.ndarray
            One entry per row of ``X``, each equal to ``majVote_``.
        """
        # Check that fit has been called (majVote_ is what predict relies on)
        check_is_fitted(self, ['majVote_'])

        # Input validation
        X = check_array(X)

        # Predict according to majority class, one entry per input row
        return np.full(X.shape[0], self.majVote_, dtype=self.classes_.dtype)

## Task 2 - Test the majority vote classifier and justify the evaluation result

In [2]:
# Import the necessary classes
import numpy as np
from sklearn import datasets

# Load the breast-cancer dataset and pull out features and target labels
bc = datasets.load_breast_cancer()
bc_X = bc.data
bc_Y = bc.target

# Number of samples held out as the test set
N_TEST = 10

# Split bc data in train and test data.
# Shuffle the sample indices with a fixed seed so the split is reproducible.
np.random.seed(0)
indices = np.random.permutation(len(bc_X))
# Everything except the last N_TEST shuffled indices trains; the rest tests
train_idx, test_idx = indices[:-N_TEST], indices[-N_TEST:]

bc_X_train, bc_Y_train = bc_X[train_idx], bc_Y[train_idx]
bc_X_test, bc_Y_test = bc_X[test_idx], bc_Y[test_idx]

# Display the held-out labels
bc_Y_test

array([1, 1, 1, 1, 0, 0, 0, 1, 1, 1])

In [3]:
# Fit the majority vote classifier on the training split (fit returns the
# classifier itself), then label the held-out samples
mjclass = MajorityClassifier().fit(bc_X_train, bc_Y_train)
prediction = mjclass.predict(bc_X_test)
prediction

init Classifier


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [4]:
# Score the majority vote predictions against the true test labels
from sklearn.metrics import accuracy_score

accuracy_score(y_true=bc_Y_test, y_pred=prediction)

0.69999999999999996

Because the majority of the training labels are '1', the majority vote classifier always predicts '1'.
On this 10-sample test set it is correct for the 7 samples whose true label is '1', giving an accuracy of 70% and an error rate of 30%.

## Task 3 - A better classifier

In [5]:
from sklearn.metrics import euclidean_distances

class MyClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that gives each sample the label of its closest training point.

    Closeness is measured with ``euclidean_distances`` between the query
    samples and the training samples stored by ``fit``.
    """

    def __init__(self):
        # Nothing to configure
        pass

    def fit(self, X, y):
        """Validate and memorise the training data.

        Stores the validated samples in ``X_``, their labels in ``y_`` and the
        ordered distinct labels in ``classes_``. Returns ``self``.
        """
        # Validate the inputs before storing them
        X, y = check_X_y(X, y)
        self.X_, self.y_ = X, y
        self.classes_ = unique_labels(y)
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the label of the nearest stored sample."""
        # Refuse to predict before fit has stored the training data
        check_is_fitted(self, ['X_', 'y_'])
        # Validate the query samples
        X = check_array(X)
        # distances[i, j] is the distance from query row i to training row j
        distances = euclidean_distances(X, self.X_)
        # Index of the closest training row for each query row
        nearest_idx = np.argmin(distances, axis=1)
        return self.y_[nearest_idx]

In [6]:
# Fit MyClassifier on the training split (fit returns the classifier itself),
# then label the held-out samples
myclass = MyClassifier().fit(bc_X_train, bc_Y_train)
myprediction = myclass.predict(bc_X_test)
myprediction

array([1, 1, 1, 1, 0, 0, 0, 1, 1, 1])

In [7]:
# Score the MyClassifier predictions against the true test labels
accuracy_score(y_true=bc_Y_test, y_pred=myprediction)

1.0