In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.datasets import fetch_california_housing
import seaborn
from gower import gower_matrix
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from functools import lru_cache


class KNNRegression:
    """k-nearest-neighbours regression experiments on one DataFrame.

    The constructor splits the frame into train/test parts once; the
    scoring methods then print R2, mean squared error and mean absolute
    error on the held-out part for a range of ``k`` values, using either
    Gower distance (``gower_matrix``) or scikit-learn's
    ``KNeighborsRegressor`` on standardized features.
    """

    def __init__(self, df: pd.DataFrame, target: str, random_state=0, test_size=0.2):
        """Split *df* into features/target and into train/test subsets.

        Args:
            df: Frame holding the feature columns and the target column.
            target: Name of the column to predict.
            random_state: Seed forwarded to ``train_test_split``.
            test_size: Test fraction forwarded to ``train_test_split``.
        """
        self.y = df[target]
        self.X = df.drop(target, axis=1)
        self.random_state = random_state
        self.test_size = test_size
        self.X_train, self.X_test, y_train, y_test = train_test_split(
            self.X, self.y, random_state=random_state, test_size=test_size
        )
        # Targets are kept as NumPy arrays so that the positional indices
        # produced by np.argsort can index them directly.
        self.y_train = y_train.to_numpy()
        self.y_test = y_test.to_numpy()

    def _print_metrics(self, label, k, y_pred):
        """Print R2, MSE and MAE of *y_pred* against the test targets."""
        print(f"{k:d} {label} R2: {r2_score(self.y_test, y_pred):f}")
        print(f"Mean squared error: {mean_squared_error(self.y_test, y_pred):f}")
        print(f"Mean absolute error: {mean_absolute_error(self.y_test, y_pred):f}")

    def get_gower_predicts(self, X, k=5):
        """Predict each row of *X* as the plain mean of its k nearest targets.

        Nearness is the Gower distance between the row and every training
        row. Returns a list with one prediction per row of *X*.
        """
        # One row of distances per query row, one column per training row.
        gower_m = gower_matrix(X, self.X_train)

        predicts = []
        for row_distances in gower_m:
            near_indices = np.argsort(row_distances)[:k]
            predicts.append(np.average(self.y_train[near_indices]))

        return predicts

    # NOTE: the parameter names `min`/`max` shadow the builtins inside the
    # two scoring methods; they are kept because they are part of the
    # public signature.
    def score_knn_gower(self, min=5, max=20):
        """Print test metrics of the Gower-distance model for k in [min, max]."""
        for k in range(min, max + 1):
            y_pred = self.get_gower_predicts(self.X_test, k=k)
            self._print_metrics("Nearest Neighbors model with Gower distance", k, y_pred)

    def fit_predict_knn(self, X, X_train, k=5):
        """Fit a ``KNeighborsRegressor`` on *X_train* and predict for *X*.

        The training targets are always ``self.y_train``, so *X_train* must
        be row-aligned with the training split (e.g. a scaled copy of it).
        """
        knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, self.y_train)
        return knn.predict(X)

    def score_knn(self, min=5, max=20):
        """Print test metrics of scikit-learn KNN for k in [min, max].

        Features are standardized with a scaler fitted on the training
        split only and then applied to the test split.
        """
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(self.X_train)
        X_test_scaled = scaler.transform(self.X_test)
        for k in range(min, max + 1):
            y_pred = self.fit_predict_knn(X_test_scaled, X_train_scaled, k=k)
            self._print_metrics("Nearest Neighbors model", k, y_pred)


class KNNRegressionWeightedGower(KNNRegression):
    """Gower-distance KNN regression with inverse-distance weighting.

    Each prediction is a weighted mean of the k nearest training targets,
    where a neighbour's weight is proportional to 1/distance.
    """

    def __init__(self, df: pd.DataFrame, target: str, random_state=0, test_size=0.2):
        """Forward all arguments unchanged to ``KNNRegression.__init__``."""
        super().__init__(df, target, random_state, test_size)

    def get_weights(self, distances):
        """Return normalized inverse-distance weights for *distances*.

        The returned list has one weight per distance and sums to 1.

        A distance of exactly 0 (the query coincides with a training row)
        cannot be inverted: 1/0 yields inf for NumPy scalars and the
        subsequent inf/inf normalization yields NaN, which poisons the
        prediction. In that case the zero-distance neighbours share the
        whole weight equally and every other neighbour gets weight 0.
        """
        is_exact_match = [d == 0 for d in distances]
        if any(is_exact_match):
            share = 1.0 / sum(is_exact_match)
            return [share if exact else 0.0 for exact in is_exact_match]

        inverted_distances = [1 / d for d in distances]
        weight_sum = sum(inverted_distances)
        return [inv / weight_sum for inv in inverted_distances]

    def get_predicts(self, X, k):
        """Predict each row of *X* from its k Gower-nearest training rows.

        Returns a list with one distance-weighted prediction per row of *X*.
        """
        # One row of distances per query row, one column per training row.
        gower_m = gower_matrix(X, self.X_train)

        predicts = []
        for row_distances in gower_m:
            near_indices = np.argsort(row_distances)[:k]
            weights = self.get_weights(row_distances[near_indices])
            predicts.append(np.dot(weights, self.y_train[near_indices]))

        return predicts

    def get_test_predicts(self, k):
        """Return weighted predictions for the held-out test split."""
        return self.get_predicts(self.X_test, k)

    # NOTE: `min`/`max` shadow the builtins inside this method; the names
    # are kept because they are part of the public signature.
    def get_test_score(self, min=5, max=20):
        """Print R2, MSE and MAE on the test split for k in [min, max]."""
        for k in range(min, max + 1):
            y_pred = self.get_test_predicts(k)
            score = r2_score(self.y_test, y_pred)
            print(f"{k:d} NN model with weighted Gower distance R2: {score:f}")
            print(f"Mean squared error: {mean_squared_error(self.y_test, y_pred):f}")
            print(f"Mean absolute error: {mean_absolute_error(self.y_test, y_pred):f}")


def main():
    """Compare three KNN variants on the California housing data.

    Loads the dataset as a DataFrame, adds the derived column
    ``AveRoom-to-occup`` = (AveRooms + AveBedrms) / AveOccup, prints a
    preview and the row count, then prints test metrics at k=11 for
    standardized scikit-learn KNN, Gower-distance KNN and
    inverse-distance-weighted Gower KNN.
    """
    housing = fetch_california_housing(as_frame=True)
    df = housing.frame
    df['AveRoom-to-occup'] = (df['AveRooms'] + df['AveBedrms'])/df['AveOccup']
    print(df.head(5))
    print(f"num data points: {len(df.index):d}\n")

    # NOTE: disabled RidgeCV baseline / coefficient plot, kept for reference.
    # It was previously parked in a bare string literal, which is an
    # expression statement rather than a comment.
    #
    # alphas = np.logspace(-3, 1, num=30)
    # model = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))
    # cv_results = cross_validate(
    #     model,
    #     housing.data,
    #     housing.target,
    #     return_estimator=True,
    #     n_jobs=2,
    # )
    # score = cv_results["test_score"]
    # print(f"R2 score: {score.mean():.3f} ± {score.std():.3f}")
    # coefs = pd.DataFrame(
    #     [est[-1].coef_ for est in cv_results["estimator"]],
    #     columns=housing.feature_names,
    # )
    # color = {"whiskers": "black", "medians": "black", "caps": "black"}
    # coefs.plot.box(vert=False, color=color)
    # plt.axvline(x=0, ymin=-1, ymax=1, color="black", linestyle="--")
    # plt.title("Coefficients of Ridge models\n via cross-validation")

    knn = KNNRegression(df, 'MedHouseVal')
    knn.score_knn(11, 11)
    print()
    knn.score_knn_gower(11, 11)
    print()

    knn_weighted = KNNRegressionWeightedGower(df, 'MedHouseVal')
    knn_weighted.get_test_score(11, 11)


# Guarded so that importing this module does not download the dataset and
# run the experiment; in a notebook cell or script __name__ is "__main__".
if __name__ == "__main__":
    main()

   MedInc  HouseAge  Population  Latitude  Longitude  MedHouseVal  \
0  8.3252      41.0       322.0     37.88    -122.23        4.526   
1  8.3014      21.0      2401.0     37.86    -122.22        3.585   
2  7.2574      52.0       496.0     37.85    -122.24        3.521   
3  5.6431      52.0       558.0     37.85    -122.25        3.413   
4  3.8462      52.0       565.0     37.85    -122.25        3.422   

   AveRoom-to-occup  
0          3.133540  
1          3.417326  
2          3.340726  
3          2.704301  
4          3.375221  
num data points: 20640

11 Nearest Neighbors model R2: 0.719158
Mean squared error: 0.366205
Mean absolute error: 0.411710

11 Nearest Neighbors model with Gower distance R2: 0.742028
Mean squared error: 0.336384
Mean absolute error: 0.388202

11 NN model with weighted Gower distance R2: 0.749689
Mean squared error: 0.326395
Mean absolute error: 0.380319
