In [30]:
from sklearn.datasets import fetch_california_housing, make_friedman1
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import dnnr
import random

random.seed(42)
np.random.seed(42)

In [130]:
# Regression datasets with only numerical features from https://arxiv.org/pdf/2207.08815.pdf 
# (Why do tree-based models still outperform deep learning on tabular data?)
links = [
    {"url": "https://api.openml.org/data/download/22103266/dataset", "name": "Brazilian_houses"},
    {"url": "https://api.openml.org/data/download/22103261/dataset", "name":"wine"},
    {"url": "https://api.openml.org/data/download/22103262/dataset", "name": "ailerons"},
    {"url": "https://api.openml.org/data/download/22103263/dataset", "name": "houses"},
    {"url": "https://api.openml.org/data/download/22103264/dataset", "name": "house_16H"},
    {"url": "https://api.openml.org/data/download/22103267/dataset", "name": "Bike_Sharing_Demand"},
    {"url": "https://api.openml.org/data/download/22103268/dataset", "name": "nyc-taxi-green-dec-2016"},
    {"url": "https://api.openml.org/data/download/22103269/dataset", "name": "house_sales"},
    {"url": "https://api.openml.org/data/download/22103270/dataset", "name": "sulfur"},
    {"url": "https://api.openml.org/data/download/22103271/dataset", "name": "medical_charges"},
    {"url": "https://api.openml.org/data/download/22103272/dataset", "name": "MiamiHousing2016"},
    {"url": "https://api.openml.org/data/download/22103273/dataset", "name": "superconduct"},
    {"url": "", "name": "cpu"},
    {"url": "", "name": "diamond"},
    {"url": "", "name": "isolet"},
    {"url": "", "name": "pol"},
]

In [108]:
# Custom datasets
cpu = pd.read_csv("datasets/houses")
features = list(cpu.columns)[:-1]
labels = list(cpu.columns)[-1]
cpu_X = cpu[cpu.columns.intersection(features)]
cpu_y = cpu[cpu.columns.intersection([labels])]
X, y = cpu_X.to_numpy(), cpu_y.to_numpy().flatten()

In [43]:
# Scikit learn datasets
X, y = fetch_california_housing( return_X_y=True)

In [118]:
dn_sc = dnnr.DNNR(n_neighbors=3, n_derivative_neighbors=-1, scaling="learned")
dn_scaling = dn_sc._get_scaler()
standard_scaling = StandardScaler()

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)
# X_train = standard_scaling.fit_transform(X_train)
# X_test = standard_scaling.transform(X_test)

# X_dn_scaled_train = dn_scaling.fit_transform(X_train, y_train)
# X_dn_scaled_test = dn_scaling.transform(X_test)

In [16]:
neigh_r.fit(X_train, y_train)
print(mean_squared_error(y_test, neigh_r.predict(X_test)))

dn.fit(X_train, y_train)
print(mean_squared_error(y_test, dn.predict(X_test)))

neigh_r.fit(X_dn_scaled_train, y_train)
print(mean_squared_error(y_test, neigh_r.predict(X_dn_scaled_test)))

# dn = dnnr.DNNR(n_neighbors=k,n_derivative_neighbors=k_s, scaling=None, solver="linear_regression")
dn = dnnr.DNNR(n_neighbors=k,n_derivative_neighbors=k_s, scaling=None, solver="scipy_lsqr")
dn.fit(X_dn_scaled_train, y_train)
print(mean_squared_error(y_test, dn.predict(X_dn_scaled_test)))

# dn_sc = dnnr.DNNR(n_neighbors=k, n_derivative_neighbors=k_s, scaling="learned", solver="scipy_lsqr")
# dn_sc.fit(X_train, y_train)
# print(mean_squared_error(y_test, dn_sc.predict(X_test)))


0.39457480771273146
0.42847062197283137
0.30678436347206606
0.34693216951626576


In [112]:
# kNN no learned scaling
neigh_r = KNeighborsRegressor(n_neighbors=5)
reg = make_pipeline(standard_scaling, neigh_r)
scores = cross_val_score(reg, X, y, scoring='r2', cv=10, n_jobs=8)
scores.mean(), scores.std()

(0.5109854086585075, 0.20642617904547333)

In [113]:
# kNN with learned scaling
neigh_r = KNeighborsRegressor(n_neighbors=5)
reg = make_pipeline(standard_scaling, dn_scaling, neigh_r)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=8)
scores.mean(), scores.std()

(0.7809106676064816, 0.010070034496825595)

In [114]:
# knnt no learned scaling
dn_sc = dnnr.DNNR(n_neighbors=5, n_derivative_neighbors=32, scaling=None, solver="scipy_lsqr", order="1")
reg = make_pipeline(standard_scaling, dn_sc)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
scores.mean(), scores.std()

(0.7365074087090699, 0.02171473832580501)

In [116]:
# knnt with learned scaling
dn_sc = dnnr.DNNR(n_neighbors=10, n_derivative_neighbors=64, scaling="learned", solver="scipy_lsqr", order="1")
reg = make_pipeline(standard_scaling, dn_sc)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
scores.mean(), scores.std()

(0.8081242083660628, 0.011242656235956244)

Перебор гиперпараметров для каждого датасе с записью в виде csv-файлов.

In [132]:
# kNN
with open("./results/knn.csv", "w") as f:
    # first line in csv file
    f.write("dataset,k,weights,scaling,metric,mean,std\n")
    # iterate over datasets
    for dataset in links:
        # read data
        data = pd.read_csv(f"./datasets/{dataset['name']}")
        print(f"working with {dataset['name']}")
        features = list(data.columns)[:-1]
        labels = list(data.columns)[-1]
        data_X = data[data.columns.intersection(features)]
        data_y = data[data.columns.intersection([labels])]
        X, y = data_X.to_numpy(), data_y.to_numpy().flatten()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)
        # as in DNNR paper vary hyperparameters according to the size of dataset
        # small
        if X.shape[0] < 2000:
            ks = [2,5,7,10,20,30,40,50]
        # medium
        elif X.shape[0] < 50000:
            ks = [2,5,7,10,25,50,100,250]
        # large
        else:
            ks = [2,3,5,7,10,12,15,20,25]

        for k in ks:
            for weights in ["uniform","distance"]:
                for scaling in [0, 1]:
                    for metric in ["r2"]:
                        model = KNeighborsRegressor(n_neighbors=k, weights=weights)
                        if scaling:
                            reg = make_pipeline(standard_scaling, dn_scaling, model)
                            scores = cross_val_score(reg, X_train, y_train, scoring=metric, cv=10, n_jobs=4)  
                        else:
                            reg = make_pipeline(standard_scaling, model)
                            scores = cross_val_score(reg, X_train, y_train, scoring=metric, cv=10, n_jobs=4)
                        mean, std = scores.mean(), scores.std()
                        print(f"{dataset['name']},{k},{weights},{scaling},{metric},{mean},{std}")
                        f.write(f"{dataset['name']},{k},{weights},{scaling},{metric},{mean},{std}\n")


working with Brazilian_houses


Brazilian_houses,2,uniform,0,r2,0.9703418865601126,0.007266711256378761
Brazilian_houses,2,uniform,1,r2,0.974232482775378,0.007644723897230482
Brazilian_houses,2,distance,0,r2,0.9716701020922024,0.007238237337103632
Brazilian_houses,2,distance,1,r2,0.9754517254577699,0.0073805575126763365
Brazilian_houses,5,uniform,0,r2,0.9690528568013992,0.006465298934114306
Brazilian_houses,5,uniform,1,r2,0.9724202474780803,0.005616382221060254
Brazilian_houses,5,distance,0,r2,0.9719761128656254,0.006473245448935654
Brazilian_houses,5,distance,1,r2,0.9750389443658685,0.005039487524790554
Brazilian_houses,7,uniform,0,r2,0.9668683199383296,0.006372832907370789
Brazilian_houses,7,uniform,1,r2,0.9705376375013716,0.00640873575863849
Brazilian_houses,7,distance,0,r2,0.9705717575471293,0.006526238331092301
Brazilian_houses,7,distance,1,r2,0.9738514735026064,0.0055126877633164565
Brazilian_houses,10,uniform,0,r2,0.9634685972707231,0.006097757926437906
Brazilian_houses,10,uniform,1,r2,0.9672651129393172,0.006



nyc-taxi-green-dec-2016,2,uniform,1,r2,0.4010885464528983,0.053936416638321306
nyc-taxi-green-dec-2016,2,distance,0,r2,0.23364635981107879,0.009736962561754308
nyc-taxi-green-dec-2016,2,distance,1,r2,0.3859316913626274,0.0502000699024401
nyc-taxi-green-dec-2016,3,uniform,0,r2,0.3067994717098285,0.008888961512864162
nyc-taxi-green-dec-2016,3,uniform,1,r2,0.4580064263816889,0.04572253217533268
nyc-taxi-green-dec-2016,3,distance,0,r2,0.3062601571787982,0.008849862682999105
nyc-taxi-green-dec-2016,3,distance,1,r2,0.44147847237920645,0.041167288873993686
nyc-taxi-green-dec-2016,5,uniform,0,r2,0.3605334331351478,0.007538199836036056
nyc-taxi-green-dec-2016,5,uniform,1,r2,0.4996520391088938,0.03838610911642473
nyc-taxi-green-dec-2016,5,distance,0,r2,0.36270904842970275,0.007580271555488208
nyc-taxi-green-dec-2016,5,distance,1,r2,0.4864385087218384,0.03386488688369527
nyc-taxi-green-dec-2016,7,uniform,0,r2,0.38235196547910655,0.007828393695633047
nyc-taxi-green-dec-2016,7,uniform,1,r2,0.513774



medical_charges,7,distance,1,r2,0.9745264443146489,0.0017340822400739324
medical_charges,10,uniform,0,r2,0.9770348363898604,0.0006929731348219849
medical_charges,10,uniform,1,r2,0.9767273168280605,0.0012628116591326098
medical_charges,10,distance,0,r2,0.9766140933050302,0.0007180470346989068
medical_charges,10,distance,1,r2,0.9754898622669748,0.001753061109633953
medical_charges,12,uniform,0,r2,0.9773192192843958,0.0006655744505030666
medical_charges,12,uniform,1,r2,0.9770098284159683,0.0013305320794957307
medical_charges,12,distance,0,r2,0.9769782544446406,0.0007014641156781351
medical_charges,12,distance,1,r2,0.9758484836774814,0.001755598050970575
medical_charges,15,uniform,0,r2,0.9775770087910145,0.0006451663374448573
medical_charges,15,uniform,1,r2,0.9773149795437313,0.0013848590748176872
medical_charges,15,distance,0,r2,0.9773282056749183,0.0006817278077095234
medical_charges,15,distance,1,r2,0.9762314423305858,0.001750544335509429
medical_charges,20,uniform,0,r2,0.97777200109246

KeyboardInterrupt: 