# Project: Determine Classifier strengths

### Instructions: <br>
- Look at multiple real (as in "not simulated") classification data sets and apply 2-3 classifiers. <br> 
- For each method, find at least one dataset where the chosen method is best (e.g. by performing cross-validation) <br>and the other two methods do not perform as well. <br> There is an extensive list of websites where you can find datasets on the course PM.<br>
- Explain why the respective classifier is best by inspecting the features, <br> e.g. through suitable plots.<br>
-  Make sure to properly take care of e.g. stratification if predictors or classes are unbalanced.<br>

In [None]:
"""
I will try different datasets, starting with iris and digits,
and implement 2-3 different classifiers, starting with kNN and logistic regression.
Before using the classifiers I will also inspect the datasets to see if they are unbalanced
and whether any data preprocessing needs to be done.
"""

In [1]:
import numpy as np
from sklearn import datasets, neighbors, preprocessing, utils 

In [23]:
class Dataset:
    """
    Load, preprocess and split one of scikit-learn's bundled datasets.

    Typical call order: ``import_dataset()`` -> ``standardise_data()`` ->
    ``shuffle_data()`` -> ``split_data(n_fold)``.

    Attributes:
        dataset: name of the dataset to load ('iris' or 'digits').
        X: feature matrix; ``None`` until ``import_dataset()`` has run.
        t: target labels; ``None`` until ``import_dataset()`` has run.
        X_folds: list of consecutive slices of ``X`` built by ``split_data()``.
        t_folds: list of consecutive slices of ``t`` built by ``split_data()``.
    """

    def __init__(self, dataset):
        self.X = None
        self.t = None
        self.dataset = dataset
        self.X_folds = []
        self.t_folds = []

    def import_dataset(self):
        """
        Load the dataset named by ``self.dataset`` into ``self.X`` / ``self.t``.

        Raises:
            ValueError: if ``self.dataset`` is neither 'iris' nor 'digits'.
        """
        if self.dataset == 'iris':
            data = datasets.load_iris()
        elif self.dataset == 'digits':
            data = datasets.load_digits()
        else:
            # Previously an unknown name silently left X and t as None,
            # which only failed later in an unrelated method.
            raise ValueError(
                "Unknown dataset %r; expected 'iris' or 'digits'." % (self.dataset,)
            )
        self.X = data['data']
        self.t = data['target']

    def standardise_data(self):
        """
        Replace ``self.X`` with its ``StandardScaler``-transformed version.

        NOTE(review): the scaler is fitted on the whole dataset, i.e. before
        any train/test split, so test samples influence the scaling.
        """
        standard_scaled = preprocessing.StandardScaler()
        self.X = standard_scaled.fit_transform(self.X)

    def shuffle_data(self, random_state=None):
        """
        Shuffle ``self.X`` and ``self.t`` with the same permutation.

        Args:
            random_state: optional seed forwarded to ``sklearn.utils.shuffle``
                to make the shuffle reproducible. ``None`` (the default)
                keeps the original unseeded behaviour.
        """
        self.X, self.t = utils.shuffle(self.X, self.t, random_state=random_state)

    def split_data(self, n_fold=2):
        """
        Partition the data into ``n_fold`` consecutive folds.

        Every sample ends up in exactly one fold. When the number of samples
        is not divisible by ``n_fold``, the first ``len(t) % n_fold`` folds
        receive one extra sample. Results are stored in ``self.X_folds`` and
        ``self.t_folds`` (any previous folds are discarded).

        The split is positional, not stratified: call ``shuffle_data()``
        first if the samples are ordered by class.

        Args:
            n_fold: number of folds to create (positive integer).

        Raises:
            ValueError: if no data has been loaded or ``n_fold`` is < 1.
        """
        if self.X is None or self.t is None:
            raise ValueError("No data loaded; call import_dataset() first.")
        if n_fold < 1:
            raise ValueError("n_fold must be a positive integer.")

        n_samples = len(self.t)
        # base_size samples per fold, with the first n_larger folds one bigger.
        base_size, n_larger = divmod(n_samples, n_fold)

        self.X_folds = []
        self.t_folds = []
        start = 0
        for fold in range(n_fold):
            stop = start + base_size + (1 if fold < n_larger else 0)
            self.X_folds.append(self.X[start:stop])
            self.t_folds.append(self.t[start:stop])
            # The next fold begins exactly where this one ended, so no
            # sample is skipped or duplicated.
            start = stop
        
    

In [42]:
def k_nearest_neighbours(X_train, t_train, X_test, t_test, k=5):
    """
    Fit a k-nearest-neighbours classifier and predict labels for the test folds.

    Args:
        X_train: sequence of feature arrays (folds); concatenated before fitting.
        t_train: sequence of label arrays (folds); concatenated before fitting.
        X_test: sequence of feature arrays (folds) to predict on.
        t_test: accepted for signature symmetry; not used by this function.
        k: number of neighbours passed to ``KNeighborsClassifier``.

    Returns:
        The predicted labels for the concatenated test folds.
    """
    model = neighbors.KNeighborsClassifier(k)

    # Folds arrive as lists of arrays; stack them into single arrays.
    model.fit(np.concatenate(X_train), np.concatenate(t_train))

    test_features = np.concatenate(X_test)
    return model.predict(test_features)

def logistic_regression():
    """Placeholder for the logistic-regression classifier; does nothing yet."""
    return None
    


In [68]:
# Hold-out evaluation of k-NN: split the data into folds, train on the
# leading folds and predict the trailing ones.
n_folds = 5
n_test_folds = 2  # the last two folds are held out for testing
n_train_folds = n_folds - n_test_folds

# NOTE: despite its name, `iris` holds the digits dataset here.
iris = Dataset('digits')
iris.import_dataset()
iris.standardise_data()
iris.shuffle_data()
iris.split_data(n_folds)

# Train and test slices share one split index so that every fold is used
# exactly once. (The training slice used to stop at n_folds-3, which left
# fold 2 in neither the training nor the test set.)
X_train = iris.X_folds[:n_train_folds]
t_train = iris.t_folds[:n_train_folds]
X_test = iris.X_folds[n_train_folds:]
t_test = iris.t_folds[n_train_folds:]

k = 5
classifier = neighbors.KNeighborsClassifier(k)
classifier.fit(np.concatenate(X_train), np.concatenate(t_train))

# Predicted labels for all held-out samples.
Z = classifier.predict(np.concatenate(X_test))

In [75]:
# Number of held-out samples whose predicted label equals the true label.
(Z == np.concatenate(t_test)).sum()

690

In [76]:
# Accuracy: fraction of held-out samples that were classified correctly.
np.mean(Z == np.concatenate(t_test))

0.9610027855153204

In [78]:
# Scratch array 0..9 for trying out slicing and concatenation.
a = np.arange(0, 10)

In [79]:
# A slice's stop index is exclusive, so this returns all ten elements.
a[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [82]:
# Concatenating a one-element sequence of arrays yields that array's contents.
np.concatenate((a,))

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])