## Actividad KNN

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import math

In [2]:
from scipy import stats

In [12]:
def euclidean_distance(a, b):
    """Distancia euclideana entre dos arrays.

    Parametros
    ----------
    a: numpy array
    b: numpy array

    Returns
    -------
    distancia: float
    """
    return (math.dist(a, b)) #ejemplo utilizo return np.linalg.norm(a-b)


def cosine_distance(a, b):
    """Similitud coseno entre dos arrays.

    Parametros
    ----------
    a: numpy array
    b: numpy array

    Returns
    -------
    distancia: float
    """
    return (a@b)/(np.linalg.norm(a)*np.linalg.norm(b))


def manhattan_distance(a, b):
    """Distancia Manhattan entre dos arrays.

    Parametros
    ----------
    a: numpy array
    b: numpy array

    Returns
    -------
    distancia: float
    """
    return sum(abs(e1-e2) for e1, e2 in zip(a,b))


class KNNRegressor:
    """Regresor para KNN.

    Parametros
    ----------
    k: int, opcional (default = 5)
        Vecinos a incluir en la predicción.
    distancia: function, opcional (default = euclidean)
        Métrica de distancia a utilizar.
    """

    def __init__(self, k=5, distance=euclidean_distance):
        """Inicializar el objeto KNNRegressor."""
        self.k = k
        self.distance = distance

    def fit(self, X, y):
        """Ajustar el modelo con "X" como entrenamiento e "y" como objetivo.

        De acuerdo con el algoritmo KNN, los datos de entrenamiento son almacenados.

        Parametros
        ----------
        X: numpy array, shape = (n_observaciones, n_features)
            Conjunto de entrenamiento.
        y: numpy array, shape = (n_observaciones,)
            Valores objetivo.

        Returns
        -------
        self
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        
        return

    def predict(self, X):
        """Devuelve el valor predecido para la entrada X (conjunto de prueba).

        Asume que la forma de X es [n_observaciones de prueba, n_características] donde
        n_features es la misma que las n_features de los datos de
        de entrada.

        Parametros
        ----------
        X: numpy array, shape = (n_observaciones, n_features)
            Conjunto de prueba.

        Returns
        -------
        result: numpy array, shape = (n_observaciones,)
            Valores predecidos para cada dato de entrada.

        """
        self.X_test = X
        values_x=[]
        positions=[]
        results=[]
        for i in self.X_test:
            for position,j in enumerate(self.X_train):
                values_x.append(self.distance(i,j))
                positions.append(position)
            df = pd.DataFrame.from_dict({'Positions':positions, 'x_value':values_x})
            df = df.sort_values(by='x_value')
            df = df.iloc[:self.k,:]
            df['y_values'] = self.y_train[df['Positions']]
            mean = df['y_values'].mean()
            results.append(mean)
        return np.array(results)

In [11]:
def make_data(n_features, n_pts, noise=0.0):
    """
    Make fake data for exploring regression. The features (X) are uniformly
    distributed between -1 and 1. The target is a quadratic polynomial of
    all the features with random coefficients, plus normally distributed
    noise (off by default).

    Parameters
    ----------
    n_features: int, number of columns in the output
    n_pts: int, number of rows
    noise: float, normally distributed noise added to y

    Returns
    -------
    (X, y)
    X: numpy array of shape (n_features, n_pts)
    y: numpy array of shape (n_pts)
    """
    X = stats.uniform(-1, 2).rvs((n_pts, n_features))

    # include a feature of 1's, for first-order terms in quadratic
    ones = np.ones((n_pts, 1))
    X_plus_ones = np.concatenate([ones, X], axis=1)

    # random coefficient matrix
    coeffs = stats.uniform(-1, 2).rvs((n_features+1, n_features+1))

    y = (X_plus_ones.reshape(n_pts, n_features+1, 1) *
         coeffs *
         X_plus_ones.reshape(n_pts, 1, n_features+1)).sum(axis=(1, 2))
    y += stats.norm(0, noise).rvs(n_pts)
    return X, y

In [13]:
# get data
X, y = make_data(n_features=2, n_pts=300, noise=0.1)

# separate into training and test
X_train = X[5:]
y_train = y[5:]
X_test= X[:5]
y_test= y[:5]

# perform a KNN Regression using multiple distance functions
for f in [euclidean_distance, manhattan_distance, cosine_distance]:
    knn = KNNRegressor(k=3, distance=f)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print(f.__name__)
    print("Compare our predictions to the actual values. Are our predictions similar?")
    print("Predictions", y_pred)
    print("Actual", y_test)
    print('*' * 50)

euclidean_distance
Compare our predictions to the actual values. Are our predictions similar?
Predictions [0.06811675 0.24033638 0.04685559 0.04685559 0.04685559]
Actual [-0.00385062  0.74572908  0.18004239  1.02340981  0.33523842]
**************************************************
manhattan_distance
Compare our predictions to the actual values. Are our predictions similar?
Predictions [0.06811675 0.06811675 0.04685559 0.04685559 0.04685559]
Actual [-0.00385062  0.74572908  0.18004239  1.02340981  0.33523842]
**************************************************
cosine_distance
Compare our predictions to the actual values. Are our predictions similar?
Predictions [0.7550614  0.2638322  0.2638322  0.2638322  0.36563192]
Actual [-0.00385062  0.74572908  0.18004239  1.02340981  0.33523842]
**************************************************
