In [306]:
from sklearn.neighbors import LocalOutlierFactor
import numpy as np
from collections import Counter
from scipy.spatial.distance import euclidean, cityblock, cosine, jensenshannon
from collections import defaultdict
from sklearn.model_selection import train_test_split
import auxiliarfunctions as af
from sklearn.metrics import accuracy_score


In [307]:
class BaseEstimator:
    """Common fit/predict scaffolding shared by the estimators in this file.

    Subclasses implement ``_predict``; ``fit`` validates and stores the
    training data, and ``predict`` dispatches to ``_predict`` once the
    estimator has been fitted (or immediately when ``fit_required`` is False).
    """

    # Whether fit() must be given target values.
    y_required = True
    # Whether predict() may only be called after fit().
    fit_required = True
    # Training data. Declared at class level so predict() can detect an
    # unfitted estimator and raise the intended ValueError instead of an
    # AttributeError.
    X = None
    y = None

    def _setup_input(self, X, y=None):
        """Ensure inputs to an estimator are in the expected format.

        Ensures X and y are stored as numpy ndarrays by converting from an
        array-like object if necessary. Enables estimators to define whether
        they require a set of y target values or not with y_required, e.g.
        kmeans clustering requires no target labels and is fit against only X.

        Besides storing ``X`` and ``y``, this fits a LocalOutlierFactor on
        ``X`` and stores its per-sample labels in ``self.weigth`` (the
        attribute name is kept as-is because other classes read it).

        Parameters
        ----------
        X : array-like
            Feature dataset.
        y : array-like
            Target values. By default is required, but if y_required = false
            then may be omitted.

        Raises
        ------
        ValueError
            If X is empty, or if y is required and is missing or empty.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if X.size == 0:
            raise ValueError("Got an empty matrix.")

        if X.ndim == 1:
            # A 1-D array is treated as a single sample; its length is the
            # feature count (previously the whole shape tuple was stored).
            self.n_samples, self.n_features = 1, X.shape[0]
        else:
            self.n_samples, self.n_features = X.shape[0], np.prod(X.shape[1:])

        # fit_predict yields one label per training sample; per the
        # scikit-learn docs these are +1 for inliers and -1 for outliers.
        # An earlier experiment used the min-max normalised
        # negative_outlier_factor_ scores instead of the labels.
        lof = LocalOutlierFactor(n_neighbors=20)
        self.weigth = lof.fit_predict(X)
        self.X = X

        if self.y_required:
            if y is None:
                raise ValueError("Missed required argument y")

            if not isinstance(y, np.ndarray):
                y = np.array(y)

            if y.size == 0:
                raise ValueError("The targets array must be no-empty.")

        self.y = y

    def fit(self, X, y=None):
        """Validate and store the training data (see ``_setup_input``)."""
        self._setup_input(X, y)

    def predict(self, X=None):
        """Return predictions for X by delegating to ``_predict``.

        Raises
        ------
        ValueError
            If the estimator requires fitting and ``fit`` has not been called.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if self.X is not None or not self.fit_required:
            return self._predict(X)
        else:
            raise ValueError("You must call `fit` before `predict`")

    def _predict(self, X=None):
        """Prediction hook to be implemented by subclasses."""
        raise NotImplementedError()
    

In [308]:
# coding:utf-8

class KNNBase(BaseEstimator):
    def __init__(self, k=5, distance_func=None):
        """Base class for Nearest neighbors classifier and regressor.

        Parameters
        ----------
        k : int, default 5
            The number of neighbors to take into account. If 0, all the
            training examples are used.
        distance_func : function, optional
            A distance function taking two arguments. Any function from
            scipy.spatial.distance will do. When omitted, a prediction is
            made with each metric in ``self.distance`` and the per-metric
            results are combined by majority vote.
        """

        self.k = None if k == 0 else k  # l[:None] returns the whole list
        self.distance_func = distance_func
        # Metrics used for the ensemble vote when no distance_func is given.
        self.distance = [euclidean, cityblock, cosine, jensenshannon]

    def aggregate(self, neighbors_targets, train_weigth=None):
        """Combine the k nearest ``(target, weight)`` pairs into one value.

        The signature mirrors the call made in ``_predict_x``; subclasses
        must override it.
        """
        raise NotImplementedError()

    def _predict(self, X=None):
        """Predict one value per row of X.

        A LocalOutlierFactor is fitted on the query set itself, and each
        row's label is forwarded to ``aggregate`` together with its
        neighbours.
        """
        lof = LocalOutlierFactor(n_neighbors=20, algorithm='kd_tree')
        query_weights = lof.fit_predict(X)
        print(query_weights)  # debug output kept from the original notebook

        votes_per_row = [
            self._predict_x(x, weight) for x, weight in zip(X, query_weights)
        ]
        # Each row has one prediction per metric; keep the most frequent one.
        predictions = [
            Counter(votes).most_common(1)[0][0] for votes in votes_per_row
        ]
        print(predictions)  # debug output kept from the original notebook

        return np.array(predictions)

    def _predict_x(self, x, train_weigth):
        """Predict the label of a single instance x.

        Returns a list with one prediction per distance metric in use:
        the user-supplied ``distance_func`` if given, otherwise every metric
        in ``self.distance``. Previously a custom ``distance_func`` produced
        an empty list, which made ``_predict`` fail.
        """
        if self.distance_func is None:
            metrics = self.distance
        else:
            metrics = [self.distance_func]

        prediction = []
        for metric in metrics:
            # Compute distances between x and all examples in the training set.
            distances = (metric(x, example) for example in self.X)

            # Sort all examples by their distance to x, carrying along each
            # example's target and its training-time LOF label.
            neighbors = sorted(
                zip(distances, self.y, self.weigth), key=lambda item: item[0]
            )

            # Keep (target, weight) for the k nearest and aggregate them.
            neighbors_targets = [
                (target, weight) for _, target, weight in neighbors[: self.k]
            ]
            prediction.append(self.aggregate(neighbors_targets, train_weigth))

        return prediction


class KNNClassifier(KNNBase):
    """Nearest neighbors classifier with LOF-weighted voting.

    Note: when several labels share the winning score, the one that was
    encountered first among the neighbors is returned."""

    def aggregate(self, neighbors_targets, train_weigth):
        """Pick a label from the ``(target, weight)`` pairs of the neighbors.

        Each neighbor adds its weight to its label's score, and the scores
        are divided by the number of neighbors. The highest-scoring label is
        returned, unless ``train_weigth`` equals -1, in which case the
        lowest-scoring label is returned instead.

        Parameters
        ----------
        neighbors_targets : iterable of (target, weight)
            Nearest neighbors; presumably weight is the +1/-1 LOF label
            stored at fit time -- confirm against the caller.
        train_weigth : int
            Value computed for the query sample by the caller.
        """
        # Accumulate one floating-point score per label.
        scores = defaultdict(float)
        neighbor_count = 0
        for label, lof_weight in neighbors_targets:
            scores[label] += lof_weight
            neighbor_count += 1

        print(scores.items())

        # Average the scores over the number of neighbors seen.
        averaged = {
            label: total / neighbor_count for label, total in scores.items()
        }

        select = min if train_weigth == -1 else max
        chosen_label, _chosen_score = select(
            averaged.items(), key=lambda entry: entry[1]
        )
        return chosen_label


class KNNRegressor(KNNBase):
    """Nearest neighbors regressor."""

    def aggregate(self, neighbors_targets, train_weigth=None):
        """Return the mean of all targets.

        Parameters
        ----------
        neighbors_targets : iterable
            Either plain target values or ``(target, weight)`` tuples as
            built by ``KNNBase._predict_x``; for tuples only the target is
            averaged, so the weights never leak into the mean.
        train_weigth : optional
            Accepted because ``KNNBase._predict_x`` passes it to
            ``aggregate``; it is not used for regression.

        NOTE(review): ``KNNBase._predict`` combines the per-metric results
        with a majority vote, which may not be what is wanted for continuous
        outputs -- confirm before relying on this class.
        """
        targets = [
            item[0] if isinstance(item, tuple) else item
            for item in neighbors_targets
        ]
        return np.mean(targets)


In [309]:
# Dataset ids to evaluate. Ids tried in earlier runs: 37, 163, 40498, 187,
# 41, 1527 (1527 -> Anomaly Detection Meta-Analysis Benchmarks) and 1459.
ids = [41]

# Fetch and prepare every requested dataset through the project helper.
datasets = [af.fetch_and_prepare_dataset(dataset_id) for dataset_id in ids]

# Split each dataset (element 0 and element 1 of the fetched result) into
# train and test parts, holding out 20% for testing.
datatraintest = [
    train_test_split(datasets[position][0], datasets[position][1], test_size=0.2)
    for position, _ in enumerate(ids)
]

In [310]:
# Classifiers under evaluation, declared as (display name, instance) pairs
# so that the two parallel lists below stay aligned.
_named_classifiers = [('KNN Modified', KNNClassifier(k=5))]
classifiers = [instance for _, instance in _named_classifiers]
classifiers_names = [label for label, _ in _named_classifiers]

In [311]:

def classify_and_evaluate(classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Returns the value of ``accuracy_score`` computed between ``y_test`` and
    the classifier's predictions for ``X_test``.
    """
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    score = accuracy_score(y_test, predicted)
    return score

In [312]:
# Evaluate every classifier on every dataset and report its accuracy.
for i, _dataset in enumerate(datasets):
    X_train, X_test, y_train, y_test = datatraintest[i]
    for classifier_name, classifier in zip(classifiers_names, classifiers):
        accuracy = classify_and_evaluate(
            classifier, X_train, X_test, y_train, y_test
        )
        print(f'Accuracy for dataset {i} with classifier {classifier_name}: {accuracy}')
        # Scratch figures kept from earlier runs (their meaning is not
        # recorded here):
        #   4.75
        #   3 -> 1.88 -> 0.39
        #   1 -> 0.95 -> 0.2
        #   5 -> 1.92 ->
    # Show the true test labels for comparison with the printed predictions.
    print(np.array(y_test))

[-1 -1  1  1 -1  1 -1  1 -1  1 -1 -1  1  1 -1  1  1  1  1  1 -1  1  1  1
 -1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1 -1]
dict_items([(0, 1.0), (1, 2.0)])
dict_items([(0, 1.0), (1, 0.0)])
dict_items([(0, 1.0), (1, 0.0)])
dict_items([(0, 2.0), (1, 1.0)])
dict_items([(0, 3.0)])
dict_items([(0, 2.0), (1, 1.0)])
dict_items([(2, 1.0), (0, 2.0)])
dict_items([(0, 2.0), (1, 1.0)])
dict_items([(2, 1.0), (0, 2.0), (1, 2.0)])
dict_items([(2, 1.0), (1, 2.0), (0, 2.0)])
dict_items([(0, 3.0), (2, 1.0), (1, 1.0)])
dict_items([(0, 2.0), (1, 3.0)])
dict_items([(0, 3.0), (2, 1.0), (1, 1.0)])
dict_items([(2, 1.0), (0, 3.0), (1, 1.0)])
dict_items([(0, 4.0), (2, 1.0)])
dict_items([(0, 4.0), (2, 1.0)])
dict_items([(0, 5.0)])
dict_items([(0, 5.0)])
dict_items([(0, 5.0)])
dict_items([(0, 4.0), (2, 1.0)])
dict_items([(0, 2.0), (1, 3.0)])
dict_items([(1, 3.0), (0, 2.0)])
dict_items([(0, 2.0), (1, 3.0)])
dict_items([(0, 4.0), (1, 1.0)])
dict_items([(1, 2.0), (3, 1.0)])
dict_items([(1, 0.0), (3, 1.0)])
