In [12]:
from sklearn.datasets import fetch_california_housing, make_friedman1
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import dnnr
import catboost
import random

random.seed(42)
np.random.seed(42)

In [5]:
# Regression datasets with only numerical features from https://arxiv.org/pdf/2207.08815.pdf 
# (Why do tree-based models still outperform deep learning on tabular data?)
links = [
    {"url": "https://api.openml.org/data/download/22103266/dataset", "name": "Brazilian_houses"},
    {"url": "https://api.openml.org/data/download/22103261/dataset", "name":"wine"},
    {"url": "https://api.openml.org/data/download/22103262/dataset", "name": "ailerons"},
    {"url": "https://api.openml.org/data/download/22103263/dataset", "name": "houses"},
    {"url": "https://api.openml.org/data/download/22103264/dataset", "name": "house_16H"},
    {"url": "https://api.openml.org/data/download/22103267/dataset", "name": "Bike_Sharing_Demand"},
    {"url": "https://api.openml.org/data/download/22103268/dataset", "name": "nyc-taxi-green-dec-2016"},
    {"url": "https://api.openml.org/data/download/22103269/dataset", "name": "house_sales"},
    {"url": "https://api.openml.org/data/download/22103270/dataset", "name": "sulfur"},
    {"url": "https://api.openml.org/data/download/22103271/dataset", "name": "medical_charges"},
    {"url": "https://api.openml.org/data/download/22103272/dataset", "name": "MiamiHousing2016"},
    {"url": "https://api.openml.org/data/download/22103273/dataset", "name": "superconduct"},
    {"url": "", "name": "cpu"},
    {"url": "", "name": "diamond"},
    {"url": "", "name": "isolet"},
    {"url": "", "name": "pol"},
]

In [108]:
# Custom datasets
cpu = pd.read_csv("datasets/houses")
features = list(cpu.columns)[:-1]
labels = list(cpu.columns)[-1]
cpu_X = cpu[cpu.columns.intersection(features)]
cpu_y = cpu[cpu.columns.intersection([labels])]
X, y = cpu_X.to_numpy(), cpu_y.to_numpy().flatten()

In [43]:
# Scikit learn datasets
X, y = fetch_california_housing( return_X_y=True)

In [6]:
dn_sc = dnnr.DNNR(n_neighbors=3, n_derivative_neighbors=-1, scaling="learned")
dn_scaling = dn_sc._get_scaler()
standard_scaling = StandardScaler()

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)
# X_train = standard_scaling.fit_transform(X_train)
# X_test = standard_scaling.transform(X_test)

# X_dn_scaled_train = dn_scaling.fit_transform(X_train, y_train)
# X_dn_scaled_test = dn_scaling.transform(X_test)

In [16]:
neigh_r.fit(X_train, y_train)
print(mean_squared_error(y_test, neigh_r.predict(X_test)))

dn.fit(X_train, y_train)
print(mean_squared_error(y_test, dn.predict(X_test)))

neigh_r.fit(X_dn_scaled_train, y_train)
print(mean_squared_error(y_test, neigh_r.predict(X_dn_scaled_test)))

# dn = dnnr.DNNR(n_neighbors=k,n_derivative_neighbors=k_s, scaling=None, solver="linear_regression")
dn = dnnr.DNNR(n_neighbors=k,n_derivative_neighbors=k_s, scaling=None, solver="scipy_lsqr")
dn.fit(X_dn_scaled_train, y_train)
print(mean_squared_error(y_test, dn.predict(X_dn_scaled_test)))

# dn_sc = dnnr.DNNR(n_neighbors=k, n_derivative_neighbors=k_s, scaling="learned", solver="scipy_lsqr")
# dn_sc.fit(X_train, y_train)
# print(mean_squared_error(y_test, dn_sc.predict(X_test)))


0.39457480771273146
0.42847062197283137
0.30678436347206606
0.34693216951626576


In [112]:
# kNN no learned scaling
neigh_r = KNeighborsRegressor(n_neighbors=5)
reg = make_pipeline(standard_scaling, neigh_r)
scores = cross_val_score(reg, X, y, scoring='r2', cv=10, n_jobs=8)
scores.mean(), scores.std()

(0.5109854086585075, 0.20642617904547333)

In [113]:
# kNN with learned scaling
neigh_r = KNeighborsRegressor(n_neighbors=5)
reg = make_pipeline(standard_scaling, dn_scaling, neigh_r)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=8)
scores.mean(), scores.std()

(0.7809106676064816, 0.010070034496825595)

In [114]:
# knnt no learned scaling
dn_sc = dnnr.DNNR(n_neighbors=5, n_derivative_neighbors=32, scaling=None, solver="scipy_lsqr", order="1")
reg = make_pipeline(standard_scaling, dn_sc)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
scores.mean(), scores.std()

(0.7365074087090699, 0.02171473832580501)

In [116]:
# knnt with learned scaling
dn_sc = dnnr.DNNR(n_neighbors=10, n_derivative_neighbors=64, scaling="learned", solver="scipy_lsqr", order="1")
reg = make_pipeline(standard_scaling, dn_sc)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
scores.mean(), scores.std()

(0.8081242083660628, 0.011242656235956244)

Перебор гиперпараметров для каждого датасе с записью в виде csv-файлов.

In [None]:
# kNN
# Regression datasets with only numerical features from https://arxiv.org/pdf/2207.08815.pdf 
# (Why do tree-based models still outperform deep learning on tabular data?)
links = [
    {"url": "https://api.openml.org/data/download/22103266/dataset", "name": "Brazilian_houses"},
    {"url": "https://api.openml.org/data/download/22103261/dataset", "name":"wine"},
    {"url": "https://api.openml.org/data/download/22103262/dataset", "name": "ailerons"},
    {"url": "https://api.openml.org/data/download/22103263/dataset", "name": "houses"},
    {"url": "https://api.openml.org/data/download/22103264/dataset", "name": "house_16H"},
    {"url": "https://api.openml.org/data/download/22103267/dataset", "name": "Bike_Sharing_Demand"},
    # {"url": "https://api.openml.org/data/download/22103268/dataset", "name": "nyc-taxi-green-dec-2016"},
    {"url": "https://api.openml.org/data/download/22103269/dataset", "name": "house_sales"},
    {"url": "https://api.openml.org/data/download/22103270/dataset", "name": "sulfur"},
    {"url": "https://api.openml.org/data/download/22103271/dataset", "name": "medical_charges"},
    {"url": "https://api.openml.org/data/download/22103272/dataset", "name": "MiamiHousing2016"},
    {"url": "https://api.openml.org/data/download/22103273/dataset", "name": "superconduct"},
    {"url": "", "name": "cpu"},
    {"url": "", "name": "diamond"},
    # {"url": "", "name": "isolet"},
    {"url": "", "name": "pol"},
]

with open("./results/knn.csv", "a+") as f:
    # first line in csv file
    f.write("dataset,k,weights,scaling,metric,mean,std\n")
    # iterate over datasets
    for dataset in links[:]:
        # read data
        data = pd.read_csv(f"./datasets/{dataset['name']}")
        print(f"working with {dataset['name']}")
        features = list(data.columns)[:-1]
        labels = list(data.columns)[-1]
        data_X = data[data.columns.intersection(features)]
        data_y = data[data.columns.intersection([labels])]
        X, y = data_X.to_numpy(), data_y.to_numpy().flatten()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)
        # as in DNNR paper vary hyperparameters according to the size of dataset
        # small
        if X.shape[0] < 2000:
            ks = [2,5,7,10,20,30,40,50]
        # medium
        elif X.shape[0] < 50000:
            ks = [2,5,7,10,25,50,100,250]
        # large
        else:
            ks = [2,3,5,7,10,12,15,20,25]

        for k in ks:
            for weights in ["uniform","distance"]:
                for scaling in [0, 1]:
                    for metric in ["neg_mean_squared_error"]:
                        model = KNeighborsRegressor(n_neighbors=k, weights=weights)
                        if scaling:
                            reg = make_pipeline(standard_scaling, dn_scaling, model)
                            scores = cross_val_score(reg, X_train, y_train, scoring=metric, cv=10, n_jobs=2)  
                        else:
                            reg = make_pipeline(standard_scaling, model)
                            scores = cross_val_score(reg, X_train, y_train, scoring=metric, cv=10, n_jobs=2)
                        mean, std = -scores.mean(), scores.std()
                        print(f"{dataset['name']},{k},{weights},{scaling},{metric},{mean},{std}")
                        f.write(f"{dataset['name']},{k},{weights},{scaling},{metric},{mean},{std}\n")


working with pol
pol,2,uniform,0,r2,0.9298270754808513,0.00801976044102319
pol,2,uniform,1,r2,0.8843016624425836,0.05528711992750621
pol,2,distance,0,r2,0.931950395518407,0.007928304530536822
pol,2,distance,1,r2,0.8874577967852666,0.05422164702566613
pol,5,uniform,0,r2,0.9318412898260592,0.0059983803852474244
pol,5,uniform,1,r2,0.8865350385639819,0.05217492619127895
pol,5,distance,0,r2,0.9366703767094255,0.005847217678346791
pol,5,distance,1,r2,0.8938303190357605,0.049691523885921886
pol,7,uniform,0,r2,0.930084365077208,0.006102969552374869
pol,7,uniform,1,r2,0.8809791919407319,0.05541506913959745
pol,7,distance,0,r2,0.935693284954738,0.0059067021331935515
pol,7,distance,1,r2,0.8904634873626446,0.05168675695401286
pol,10,uniform,0,r2,0.9264364203133428,0.0068278003210306805
pol,10,uniform,1,r2,0.8738440532854617,0.059610110191960664
pol,10,distance,0,r2,0.9329664783990225,0.006544592534932005
pol,10,distance,1,r2,0.8851620576655342,0.054737606039159156
pol,25,uniform,0,r2,0.90904567792

In [18]:
# catboost

# Regression datasets with only numerical features from https://arxiv.org/pdf/2207.08815.pdf 
# (Why do tree-based models still outperform deep learning on tabular data?)
links = [
    {"url": "https://api.openml.org/data/download/22103266/dataset", "name": "Brazilian_houses"},
    {"url": "https://api.openml.org/data/download/22103261/dataset", "name":"wine"},
    {"url": "https://api.openml.org/data/download/22103262/dataset", "name": "ailerons"},
    {"url": "https://api.openml.org/data/download/22103263/dataset", "name": "houses"},
    {"url": "https://api.openml.org/data/download/22103264/dataset", "name": "house_16H"},
    {"url": "https://api.openml.org/data/download/22103267/dataset", "name": "Bike_Sharing_Demand"},
    # {"url": "https://api.openml.org/data/download/22103268/dataset", "name": "nyc-taxi-green-dec-2016"},
    {"url": "https://api.openml.org/data/download/22103269/dataset", "name": "house_sales"},
    {"url": "https://api.openml.org/data/download/22103270/dataset", "name": "sulfur"},
    {"url": "https://api.openml.org/data/download/22103271/dataset", "name": "medical_charges"},
    {"url": "https://api.openml.org/data/download/22103272/dataset", "name": "MiamiHousing2016"},
    {"url": "https://api.openml.org/data/download/22103273/dataset", "name": "superconduct"},
    {"url": "", "name": "cpu"},
    {"url": "", "name": "diamond"},
    {"url": "", "name": "isolet"},
    {"url": "", "name": "pol"},
]

# preprocessing
standard_scaling = StandardScaler()

# kNNt
with open("./results/catboost.csv", "a+") as f:
    # first line in csv file
    f.write("dataset,lr,max_depth,n_estimators,metric,mean,std\n")
    # iterate over datasets
    for dataset in links[-1:]:
        # read data
        data = pd.read_csv(f"./datasets/{dataset['name']}")
        print(f"working with {dataset['name']}")
        features = list(data.columns)[:-1]
        labels = list(data.columns)[-1]
        data_X = data[data.columns.intersection(features)]
        data_y = data[data.columns.intersection([labels])]
        X, y = data_X.to_numpy(), data_y.to_numpy().flatten()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)
        # as in DNNR paper vary hyperparameters according to the size of dataset

        for lr in [0.001,0.01,0.1,0.3]:
            for max_depth in [3,5,10]:
                for n_estimators in [50,100,500,1000]:
                    # for metric in ["r2"]:
                    model = catboost.CatBoostRegressor(verbose=False, learning_rate=lr, max_depth=max_depth, n_estimators=n_estimators)
                    reg = make_pipeline(standard_scaling, model)
                    scores = cross_validate(reg, X_train, y_train, scoring=("r2", "neg_mean_squared_error"), cv=10, n_jobs=1)  
                    mean_r2, std_r2 = scores['test_r2'].mean(), scores['test_r2'].std()
                    mean_mse, std_mse = -scores['test_neg_mean_squared_error'].mean(), scores['test_neg_mean_squared_error'].std()
                    print(f"{dataset['name']},{lr},{max_depth},{n_estimators},r2,{mean_r2},{std_r2}")
                    print(f"{dataset['name']},{lr},{max_depth},{n_estimators},mse,{mean_mse},{std_mse}")
                    f.write(f"{dataset['name']},{lr},{max_depth},{n_estimators},r2,{mean_r2},{std_r2}\n")
                    f.write(f"{dataset['name']},{lr},{max_depth},{n_estimators},mse,{mean_mse},{std_mse}\n")

working with pol
pol,0.001,3,50,r2,0.025564772724568384,0.0014856799853414887
pol,0.001,3,50,mse,1694.2548918133737,47.88480514162721
pol,0.001,3,100,r2,0.050026524259107544,0.0014565977465054575
pol,0.001,3,100,mse,1651.7189781787524,46.52946819619643
pol,0.001,3,500,r2,0.18742104106447324,0.002897194968343337
pol,0.001,3,500,mse,1412.7615386502293,37.50303677690118
pol,0.001,3,1000,r2,0.27594991315599005,0.0049924589717524155
pol,0.001,3,1000,mse,1258.732455125133,29.862637971547265
pol,0.001,5,50,r2,0.03418112064608551,0.001468130575626802
pol,0.001,5,50,mse,1679.2774744257956,47.59417813408519
pol,0.001,5,100,r2,0.0662791943346033,0.0015477821774392228
pol,0.001,5,100,mse,1623.4671309280754,45.97851588588881
pol,0.001,5,500,r2,0.24876362569418617,0.0032738599110238354
pol,0.001,5,500,mse,1306.1156205742984,35.00852369313639
pol,0.001,5,1000,r2,0.3718246753887972,0.005542761252014894
pol,0.001,5,1000,mse,1092.0196691496071,24.97833357354331
pol,0.001,10,50,r2,0.05246257245729143,0.0

In [190]:
data = pd.read_csv(f"./datasets/medical_charges")
print(f"working with medical_charges")
features = list(data.columns)[:-1]
labels = list(data.columns)[-1]
data_X = data[data.columns.intersection(features)]
data_y = data[data.columns.intersection([labels])]
X, y = data_X.to_numpy(), data_y.to_numpy().flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

# knnt no learned scaling
dn_sc = dnnr.DNNR(n_neighbors=20, n_derivative_neighbors=40, scaling='learned', solver="scipy_lsqr", order="1")
reg = make_pipeline(standard_scaling, dn_sc)
scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
print(scores.mean(), scores.std())

# # knnt learned scaling
# dn_sc = dnnr.DNNR(n_neighbors=5, n_derivative_neighbors=24, scaling="learned", solver="scipy_lsqr", order="1")
# reg = make_pipeline(standard_scaling, dn_sc)
# scores = cross_val_score(reg, X_train, y_train, scoring='r2', cv=10, n_jobs=1)
# print(scores.mean(), scores.std())

# # kNN with learned scaling
# neigh_r = KNeighborsRegressor(n_neighbors=2)
# reg = make_pipeline(standard_scaling, dn_scaling, neigh_r)
# scores_knn = cross_val_score(reg, X_train, y_train, scoring='r2', cv=20, n_jobs=2)
# print(scores_knn, scores_knn.mean(), scores_knn.std())


working with medical_charges
0.9772675908071864 0.0008128483602677805


In [189]:
scores

array([0.9771315 , 0.97792346, 0.97566964, 0.97716689, 0.97712359,
       0.97730052, 0.97609175, 0.97700543, 0.97734594, 0.97719967])