# Large scale learning with kernels

1. [Imports](#imports)
2. [Experiments on synthetic data](#synthetic)
    1. [Performance with D](#synt_D)
    2. [Influence of the regularization](#synt_reg)
3. [Experiments on the CT Slices dataset](#CT)
    1. [Performance with D](#CT_D)
    2. [Influence of the regularization](#CT_reg)

## Imports <a name="imports"></a>

In [124]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from time import time
import matplotlib.pyplot as plt
import copy
from rkhs.rff import RFF
from rkhs.nystrom import PlainNystromRegressor, SGDPlainNystromRegressor, SGDPlainNystromClassifier
from tqdm import tqdm
# Seed NumPy's global random generator once, right after the imports.
# NOTE(review): this only makes a run repeatable for code that draws from NumPy's global RNG — confirm RFF / Nystrom do.
np.random.seed(42)

# Download / Generate Data

In [2]:
from google_drive_downloader import GoogleDriveDownloader as gdd
import os

# Google Drive file ids of the zipped datasets, keyed by the local archive name.
DATA = {
    "YearPredictionMSD.txt.zip": "1R3CXYssjftxi7HIXXgeWCAZh65ByEumt",
    "slice_localization_data.zip": "1ZJamggugQuj-sE1EFWfghhR3dEY2K9Ib",
}

# Fetch every archive into ./data, skipping the ones that are already on disk.
for file_name, file_id in DATA.items():
    output_file = os.path.join('data', file_name)
    if os.path.isfile(output_file):
        continue
    gdd.download_file_from_google_drive(file_id=file_id, dest_path=output_file)

In [None]:
# Make sure both output folders exist before any result file is written.
for _results_dir in ("synthetic_clf_results/", "CT_results/"):
    os.makedirs(_results_dir, exist_ok=True)

# Experiments on synthetic data: classification <a name="synthetic"></a>


In [56]:
# Synthetic binary classification problem: 50000 samples, 120 features of which 15 are informative (fixed seed).
X, y = make_classification(n_samples=50000, n_features=120, n_informative=15, n_classes=2, random_state=17)

In [57]:
# Fixed-seed train/test split of the synthetic data (split size left at the library default).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

### Baseline
#### Linear SVC

In [154]:
# Baseline: LinearSVC (dual=False) on standardized features.
lin = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearSVC', LinearSVC(dual=False)),
])

# Only the fit call is timed.
t0 = time()
lin.fit(X_train, y_train)
time_lin = time() - t0

# Predict on both splits, then score each against its labels.
y_pred_train_lin = lin.predict(X_train)
y_pred_lin = lin.predict(X_test)
score_train_lin = accuracy_score(y_train, y_pred_train_lin)
score_lin = accuracy_score(y_test, y_pred_lin)

#### SVC

In [155]:
# Baseline: SVC with an RBF kernel on standardized features.
gauss = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('SVC', SVC(kernel='rbf')),
])

# Only the fit call is timed.
t0 = time()
gauss.fit(X_train, y_train)
time_gauss = time() - t0

# Predict on both splits, then score each against its labels.
y_pred_train_gauss = gauss.predict(X_train)
y_pred_gauss = gauss.predict(X_test)
score_train_gauss = accuracy_score(y_train, y_pred_train_gauss)
score_gauss = accuracy_score(y_test, y_pred_gauss)

In [157]:
# Persist the scalar baseline results, each as a one-element CSV file.
for _path, _value in [
    ('synthetic_clf_results/score_lin.csv', score_lin),
    ('synthetic_clf_results/score_train_lin.csv', score_train_lin),
    ('synthetic_clf_results/time_lin.csv', time_lin),
    ('synthetic_clf_results/score_gauss.csv', score_gauss),
    ('synthetic_clf_results/score_train_gauss.csv', score_train_gauss),
    ('synthetic_clf_results/time_gauss.csv', time_gauss),
]:
    np.savetxt(_path, np.asarray([_value]), delimiter=',')

## Performance as a function of the approximation dimension (number of random features / Nyström subsample size) <a name="synt_D"></a>


### Random Fourier Features

In [158]:
# Grid of approximation dimensions and number of repetitions per grid point.
nD = 10
list_D = np.linspace(300, 3500, nD, dtype=int)
M = 10

# One (nD, M) buffer per recorded quantity: test accuracy, train accuracy, fit time.
scores_rff, scores_train_rff, times_rff = (np.empty((nD, M)) for _ in range(3))

for i, D in enumerate(tqdm(list_D)):
    for j in range(M):
        # A new pipeline (and therefore a new RFF step) is built for every repetition.
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D=D)),
            ('LinearSVC', LinearSVC(dual=False)),
        ])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        y_pred_rff = rff.predict(X_test)
        scores_train_rff[i, j] = accuracy_score(y_train, y_pred_train_rff)
        scores_rff[i, j] = accuracy_score(y_test, y_pred_rff)

In [159]:
# Persist the RFF-vs-D results together with the D grid itself.
for _path, _array in [
    ('synthetic_clf_results/scores_rff.csv', scores_rff),
    ('synthetic_clf_results/scores_train_rff.csv', scores_train_rff),
    ('synthetic_clf_results/times_rff.csv', times_rff),
    ('synthetic_clf_results/list_D.csv', list_D),
]:
    np.savetxt(_path, _array, delimiter=',')

### Nystrom

In [61]:
# One (nD, M) buffer per recorded quantity: test accuracy, train accuracy, fit time.
scores_nys, scores_train_nys, times_nys = (np.empty((nD, M)) for _ in range(3))

for i, D in enumerate(tqdm(list_D)):
    for j in range(M):
        # Here each grid value D is passed as the classifier's `m` argument.
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('Nystrom', SGDPlainNystromClassifier(m=D)),
        ])

        # Only the fit call is timed.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys[i, j] = time() - t0

        y_pred_train_nys = nys.predict(X_train)
        y_pred_nys = nys.predict(X_test)
        scores_train_nys[i, j] = accuracy_score(y_train, y_pred_train_nys)
        scores_nys[i, j] = accuracy_score(y_test, y_pred_nys)

100%|██████████| 10/10 [12:52<00:00, 77.28s/it] 


In [68]:
# Persist the Nystrom-vs-D results.
for _path, _array in [
    ('synthetic_clf_results/scores_nys.csv', scores_nys),
    ('synthetic_clf_results/scores_train_nys.csv', scores_train_nys),
    ('synthetic_clf_results/times_nys.csv', times_nys),
]:
    np.savetxt(_path, _array, delimiter=',')

### Random Fourier Features l1 loss l2 penalty

In [None]:
# RFF + hinge-loss LinearSVC for each dimension in list_D, repeated M times.
# The repetition count is M, as in the other synthetic-data experiments: the
# previous `nb_iter` is only defined later in the CT section, so on a fresh
# kernel this cell raised NameError.
scores_rff_h_d = np.empty((nD, M))
scores_train_rff_h_d = np.empty((nD, M))
times_rff_h_d = np.empty((nD, M))

for i, D in enumerate(tqdm(list_D)):
    for j in range(M):
        # A new pipeline (and therefore a new RFF step) is built for every repetition.
        rff = Pipeline([('Scaler', StandardScaler()),
                        ('RFF', RFF(D=D)),
                        ('LinearSVC', LinearSVC(loss='hinge')),])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_h_d[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        scores_train_rff_h_d[i, j] = accuracy_score(y_train, y_pred_train_rff)

        y_pred_rff = rff.predict(X_test)
        scores_rff_h_d[i, j] = accuracy_score(y_test, y_pred_rff)

In [None]:
# Persist the hinge-loss RFF-vs-D results.
for _path, _array in [
    ('synthetic_clf_results/scores_rff_h_d.csv', scores_rff_h_d),
    ('synthetic_clf_results/scores_train_rff_h_d.csv', scores_train_rff_h_d),
    ('synthetic_clf_results/times_rff_h_d.csv', times_rff_h_d),
]:
    np.savetxt(_path, _array, delimiter=',')

In [11]:
# Report the fit time stored in time_gauss by the kernel-SVC baseline cell.
print("SVC with gaussian kernel execution time:", time_gauss)

SVC with gaussian kernel execution time: 242.52361536026


### Influence of the regularization $\lambda$  <a name="synt_reg"></a>

In [78]:
# Regularization grid: nC evenly spaced values of C between 0.01 and 2 (inclusive).
nC = 8
list_C = np.linspace(start=0.01, stop=2, num=nC)

# Number of repetitions for the randomized (RFF / Nystrom) experiments.
M = 10

### Linear SVC l2 loss l2 penalty

In [46]:
# One (nC, 1) column per recorded quantity: a single run per C value.
scores_lin_sh_l2, scores_train_lin_sh_l2, times_lin_sh_l2 = (np.empty((nC, 1)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    lin = Pipeline(steps=[
        ('Scaler', StandardScaler()),
        ('LinearSVC', LinearSVC(dual=False, C=C)),
    ])

    # Only the fit call is timed.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin_sh_l2[i] = time() - t0

    y_pred_train_lin = lin.predict(X_train)
    y_pred_lin = lin.predict(X_test)
    scores_train_lin_sh_l2[i] = accuracy_score(y_train, y_pred_train_lin)
    scores_lin_sh_l2[i] = accuracy_score(y_test, y_pred_lin)



































































  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 12%|█▎        | 1/8 [00:00<00:03,  2.09it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 25%|██▌       | 2/8 [00:00<00:02,  2.13it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 38%|███▊      

In [47]:
# Persist the LinearSVC (C sweep, default loss and penalty) results.
for _path, _array in [
    ('synthetic_clf_results/scores_lin_sh_l2.csv', scores_lin_sh_l2),
    ('synthetic_clf_results/scores_train_lin_sh_l2.csv', scores_train_lin_sh_l2),
    ('synthetic_clf_results/times_lin_sh_l2.csv', times_lin_sh_l2),
]:
    np.savetxt(_path, _array, delimiter=',')

### Linear SVC l2 loss l1 penalty

In [48]:
# One (nC, 1) column per recorded quantity: a single run per C value.
scores_lin_sh_l1, scores_train_lin_sh_l1, times_lin_sh_l1 = (np.empty((nC, 1)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    # Same sweep as the l2-penalty cell, but with penalty='l1'.
    lin = Pipeline(steps=[
        ('Scaler', StandardScaler()),
        ('LinearSVC', LinearSVC(dual=False, C=C, penalty='l1')),
    ])

    # Only the fit call is timed.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin_sh_l1[i] = time() - t0

    y_pred_train_lin = lin.predict(X_train)
    y_pred_lin = lin.predict(X_test)
    scores_train_lin_sh_l1[i] = accuracy_score(y_train, y_pred_train_lin)
    scores_lin_sh_l1[i] = accuracy_score(y_test, y_pred_lin)



































































  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 12%|█▎        | 1/8 [00:01<00:12,  1.72s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 25%|██▌       | 2/8 [00:07<00:17,  2.87s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

































































 38%|███▊      

In [49]:
# Persist the l1-penalty LinearSVC (C sweep) results.
for _path, _array in [
    ('synthetic_clf_results/scores_lin_sh_l1.csv', scores_lin_sh_l1),
    ('synthetic_clf_results/scores_train_lin_sh_l1.csv', scores_train_lin_sh_l1),
    ('synthetic_clf_results/times_lin_sh_l1.csv', times_lin_sh_l1),
]:
    np.savetxt(_path, _array, delimiter=',')

### Linear SVC l1 loss l2 penalty

In [50]:
# One (nC, 1) column per recorded quantity: a single run per C value.
scores_lin_h_l2, scores_train_lin_h_l2, times_lin_h_l2 = (np.empty((nC, 1)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    # Hinge loss; `dual` is left at the estimator's default here.
    lin = Pipeline(steps=[
        ('Scaler', StandardScaler()),
        ('LinearSVC', LinearSVC(C=C, loss='hinge')),
    ])

    # Only the fit call is timed.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin_h_l2[i] = time() - t0

    y_pred_train_lin = lin.predict(X_train)
    y_pred_lin = lin.predict(X_test)
    scores_train_lin_h_l2[i] = accuracy_score(y_train, y_pred_train_lin)
    scores_lin_h_l2[i] = accuracy_score(y_test, y_pred_lin)



















































































































































































































































































































































































































































































































































































































100%|██████████| 8/8 [00:28<00:00,  3.55s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


In [51]:
# Persist the hinge-loss LinearSVC (C sweep) results.
for _path, _array in [
    ('synthetic_clf_results/scores_lin_h_l2.csv', scores_lin_h_l2),
    ('synthetic_clf_results/scores_train_lin_h_l2.csv', scores_train_lin_h_l2),
    ('synthetic_clf_results/times_lin_h_l2.csv', times_lin_h_l2),
]:
    np.savetxt(_path, _array, delimiter=',')

### Random Fourier Features Hinge Loss l2 penalty

In [98]:
# One (nC, M) buffer per recorded quantity: test accuracy, train accuracy, fit time.
scores_rff_h, scores_train_rff_h, times_rff_h = (np.empty((nC, M)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    for j in range(M):
        # D is fixed at 2500 for the whole sweep; only C varies.
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D=2500)),
            ('LinearSVC', LinearSVC(C=C, loss='hinge')),
        ])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_h[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        y_pred_rff = rff.predict(X_test)
        scores_train_rff_h[i, j] = accuracy_score(y_train, y_pred_train_rff)
        scores_rff_h[i, j] = accuracy_score(y_test, y_pred_rff)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [02:22<16:39, 142.73s/it][A[A

 25%|██▌       | 2/8 [07:31<19:15, 192.59s/it][A[A

 38%|███▊      | 3/8 [13:21<19:58, 239.76s/it][A[A

 50%|█████     | 4/8 [19:28<18:31, 277.86s/it][A[A

 62%|██████▎   | 5/8 [23:41<13:31, 270.59s/it][A[A

 75%|███████▌  | 6/8 [29:06<09:33, 286.79s/it][A[A

 88%|████████▊ | 7/8 [34:09<04:51, 291.73s/it][A[A

100%|██████████| 8/8 [38:46<00:00, 290.86s/it][A[A


### Random Fourier Features Squared Hinge Loss l2 penalty

In [99]:
# One (nC, M) buffer per recorded quantity: test accuracy, train accuracy, fit time.
scores_rff_sh, scores_train_rff_sh, times_rff_sh = (np.empty((nC, M)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    for j in range(M):
        # D is fixed at 2500 for the whole sweep; only C varies.
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D=2500)),
            ('LinearSVC', LinearSVC(dual=False, C=C)),
        ])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_sh[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        y_pred_rff = rff.predict(X_test)
        scores_train_rff_sh[i, j] = accuracy_score(y_train, y_pred_train_rff)
        scores_rff_sh[i, j] = accuracy_score(y_test, y_pred_rff)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [03:01<21:08, 181.27s/it][A[A

 25%|██▌       | 2/8 [08:05<21:49, 218.29s/it][A[A

 38%|███▊      | 3/8 [13:32<20:53, 250.65s/it][A[A

 50%|█████     | 4/8 [19:06<18:23, 275.81s/it][A[A

 62%|██████▎   | 5/8 [24:54<14:51, 297.31s/it][A[A

 75%|███████▌  | 6/8 [30:51<10:30, 315.40s/it][A[A

 88%|████████▊ | 7/8 [36:50<05:28, 328.44s/it][A[A

100%|██████████| 8/8 [43:00<00:00, 322.56s/it][A[A


In [100]:
# Persist both RFF C-sweeps (hinge and squared hinge) together with the C grid.
for _path, _array in [
    ('synthetic_clf_results/scores_rff_h.csv', scores_rff_h),
    ('synthetic_clf_results/scores_train_rff_h.csv', scores_train_rff_h),
    ('synthetic_clf_results/times_rff_h.csv', times_rff_h),
    ('synthetic_clf_results/scores_rff_sh.csv', scores_rff_sh),
    ('synthetic_clf_results/scores_train_rff_sh.csv', scores_train_rff_sh),
    ('synthetic_clf_results/times_rff_sh.csv', times_rff_sh),
    ('synthetic_clf_results/list_C.csv', list_C),
]:
    np.savetxt(_path, _array, delimiter=',')

### SGD Nystrom Classifier hinge loss l2 penalty

In [79]:
# One (nC, M) buffer per recorded quantity: test accuracy, train accuracy, fit time.
scores_nys_h, scores_train_nys_h, times_nys_h = (np.empty((nC, M)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    for j in range(M):
        # m is fixed at 3000; each grid value is passed as `lambda_reg`.
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromClassifier(m=3000, lambda_reg=C, loss='hinge')),
        ])

        # Only the fit call is timed.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_h[i, j] = time() - t0

        y_pred_train_nys = nys.predict(X_train)
        y_pred_nys = nys.predict(X_test)
        scores_train_nys_h[i, j] = accuracy_score(y_train, y_pred_train_nys)
        scores_nys_h[i, j] = accuracy_score(y_test, y_pred_nys)

100%|██████████| 8/8 [17:27<00:00, 130.96s/it]


In [81]:
# Persist the hinge-loss Nystrom (regularization sweep) results.
for _path, _array in [
    ('synthetic_clf_results/scores_nys_h.csv', scores_nys_h),
    ('synthetic_clf_results/scores_train_nys_h.csv', scores_train_nys_h),
    ('synthetic_clf_results/times_nys_h.csv', times_nys_h),
]:
    np.savetxt(_path, _array, delimiter=',')

### SGD Nystrom Classifier squared hinge loss l2 penalty

In [83]:
# Squared-hinge Nystrom classifier over the regularization grid, M repetitions each.
# The buffers are prefilled with NaN rather than left uninitialized: if this long
# loop is interrupted before it finishes, the unfilled cells stay NaN and can be
# recognised in the saved CSV instead of holding arbitrary memory contents.
scores_nys_sh = np.full((nC, M), np.nan)
scores_train_nys_sh = np.full((nC, M), np.nan)
times_nys_sh = np.full((nC, M), np.nan)

for i, C in enumerate(tqdm(list_C)):
    for j in range(M):
        # m is fixed at 3000; each grid value is passed as `lambda_reg`.
        nys = Pipeline([('Scaler', StandardScaler()),
                        ('nys', SGDPlainNystromClassifier(m=3000, lambda_reg=C, loss='squared_hinge')),])

        # Only the fit call is timed.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_sh[i, j] = time() - t0

        y_pred_train_nys = nys.predict(X_train)
        scores_train_nys_sh[i, j] = accuracy_score(y_train, y_pred_train_nys)

        y_pred_nys = nys.predict(X_test)
        scores_nys_sh[i, j] = accuracy_score(y_test, y_pred_nys)




KeyboardInterrupt: 

#### Dump/save data

In [92]:
# Persist the squared-hinge Nystrom (regularization sweep) results.
for _path, _array in [
    ('synthetic_clf_results/scores_nys_sh.csv', scores_nys_sh),
    ('synthetic_clf_results/scores_train_nys_sh.csv', scores_train_nys_sh),
    ('synthetic_clf_results/times_nys_sh.csv', times_nys_sh),
]:
    np.savetxt(_path, _array, delimiter=',')

### Linear SVC l2 penalty

In [93]:
# One (nC, 1) column per recorded quantity: a single run per C value.
scores_lin, scores_train_lin, times_lin = (np.empty((nC, 1)) for _ in range(3))

for i, C in enumerate(tqdm(list_C)):
    lin = Pipeline(steps=[
        ('Scaler', StandardScaler()),
        ('LinearSVC', LinearSVC(dual=False, C=C)),
    ])

    # Only the fit call is timed.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin[i] = time() - t0

    y_pred_train_lin = lin.predict(X_train)
    y_pred_lin = lin.predict(X_test)
    scores_train_lin[i] = accuracy_score(y_train, y_pred_train_lin)
    scores_lin[i] = accuracy_score(y_test, y_pred_lin)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [00:00<00:04,  1.58it/s][A[A

 25%|██▌       | 2/8 [00:01<00:03,  1.67it/s][A[A

 38%|███▊      | 3/8 [00:01<00:02,  1.78it/s][A[A

 50%|█████     | 4/8 [00:02<00:02,  1.89it/s][A[A

 62%|██████▎   | 5/8 [00:02<00:01,  1.98it/s][A[A

 75%|███████▌  | 6/8 [00:02<00:00,  2.06it/s][A[A

 88%|████████▊ | 7/8 [00:03<00:00,  2.11it/s][A[A

100%|██████████| 8/8 [00:03<00:00,  2.03it/s][A[A


In [94]:
# Persist the LinearSVC C-sweep results.
for _path, _array in [
    ('synthetic_clf_results/scores_lin.csv', scores_lin),
    ('synthetic_clf_results/scores_train_lin.csv', scores_train_lin),
    ('synthetic_clf_results/times_lin.csv', times_lin),
]:
    np.savetxt(_path, _array, delimiter=',')

In [172]:
# Report the fit time stored in time_gauss by the kernel-SVC baseline cell.
print("SVC with gaussian kernel execution time:", time_gauss)

SVC with gaussian kernel execution time: 242.52361536026


### Random Fourier Features squared hinge l1 penalty

In [177]:
# RFF + l1-penalty LinearSVC over the C grid, M repetitions each (D fixed at 2500).
# The buffers are prefilled with NaN rather than left uninitialized: if this long
# loop is interrupted before it finishes, the unfilled cells stay NaN and can be
# recognised in the saved CSV instead of holding arbitrary memory contents.
scores_rff_l1 = np.full((nC, M), np.nan)
scores_train_rff_l1 = np.full((nC, M), np.nan)
times_rff_l1 = np.full((nC, M), np.nan)

for i, C in enumerate(list_C):
    for j in range(M):
        rff = Pipeline([('Scaler', StandardScaler()),
                        ('RFF', RFF(D=2500)),
                        ('LinearSVC', LinearSVC(dual=False, C=C, penalty='l1')),])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_l1[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        scores_train_rff_l1[i, j] = accuracy_score(y_train, y_pred_train_rff)

        y_pred_rff = rff.predict(X_test)
        scores_rff_l1[i, j] = accuracy_score(y_test, y_pred_rff)



KeyboardInterrupt: 

In [198]:
# Expose the hinge-loss RFF results under the "_l2" names (same array objects, no copy).
scores_rff_l2, scores_train_rff_l2, times_rff_l2 = scores_rff_h, scores_train_rff_h, times_rff_h

In [103]:
# Snapshot the hinge-loss Nystrom results under the "_l2" names.
# Copies are taken rather than aliases, so that a later in-place write into any
# of the *_nys_h buffers cannot silently change the values saved as *_nys_l2.
scores_nys_l2 = scores_nys_h.copy()
scores_train_nys_l2 = scores_train_nys_h.copy()
times_nys_l2 = times_nys_h.copy()

In [200]:
# Persist the l1-penalty and "_l2" RFF results.
for _path, _array in [
    ('synthetic_clf_results/scores_rff_l1.csv', scores_rff_l1),
    ('synthetic_clf_results/scores_train_rff_l1.csv', scores_train_rff_l1),
    ('synthetic_clf_results/times_rff_l1.csv', times_rff_l1),
    ('synthetic_clf_results/scores_rff_l2.csv', scores_rff_l2),
    ('synthetic_clf_results/scores_train_rff_l2.csv', scores_train_rff_l2),
    ('synthetic_clf_results/times_rff_l2.csv', times_rff_l2),
]:
    np.savetxt(_path, _array, delimiter=',')

### SGD Nystrom Classifier hinge loss l1 penalty

In [104]:
# Hinge-loss, l1-penalty Nystrom classifier over the regularization grid, M repetitions each.
scores_nys_l1 = np.empty((nC, M))
scores_train_nys_l1 = np.empty((nC, M))
times_nys_l1 = np.empty((nC, M))

for i, C in enumerate(tqdm(list_C)):
    for j in range(M):
        # m is fixed at 3000; each grid value is passed as `lambda_reg`.
        nys = Pipeline([('Scaler', StandardScaler()),
                        ('nys', SGDPlainNystromClassifier(m=3000, lambda_reg=C, loss='hinge', penalty='l1')),])

        # Only the fit call is timed. The timing goes into this experiment's own
        # buffer: it was previously written into times_nys_h, which left
        # times_nys_l1 uninitialized and overwrote the hinge/l2 timings.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l1[i, j] = time() - t0

        y_pred_train_nys = nys.predict(X_train)
        scores_train_nys_l1[i, j] = accuracy_score(y_train, y_pred_train_nys)

        y_pred_nys = nys.predict(X_test)
        scores_nys_l1[i, j] = accuracy_score(y_test, y_pred_nys)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [03:28<24:22, 208.92s/it][A[A

 25%|██▌       | 2/8 [06:56<20:50, 208.46s/it][A[A

 38%|███▊      | 3/8 [10:23<17:19, 207.97s/it][A[A

 50%|█████     | 4/8 [13:50<13:51, 207.86s/it][A[A

 62%|██████▎   | 5/8 [17:17<10:22, 207.53s/it][A[A

 75%|███████▌  | 6/8 [20:56<07:01, 210.91s/it][A[A

 88%|████████▊ | 7/8 [24:34<03:33, 213.05s/it][A[A

100%|██████████| 8/8 [28:09<00:00, 211.13s/it][A[A


In [105]:
# Persist the l1-penalty and "_l2" Nystrom results.
for _path, _array in [
    ('synthetic_clf_results/scores_nys_l1.csv', scores_nys_l1),
    ('synthetic_clf_results/scores_train_nys_l1.csv', scores_train_nys_l1),
    ('synthetic_clf_results/times_nys_l1.csv', times_nys_l1),
    ('synthetic_clf_results/scores_nys_l2.csv', scores_nys_l2),
    ('synthetic_clf_results/scores_train_nys_l2.csv', scores_train_nys_l2),
    ('synthetic_clf_results/times_nys_l2.csv', times_nys_l2),
]:
    np.savetxt(_path, _array, delimiter=',')

# Experiments on the CT Slices dataset <a name="CT"></a>
## Data-loading/processing

In [132]:
# Load the CT slice-localization table straight from the downloaded zip archive.
data = pd.read_csv('data/slice_localization_data.zip')

In [133]:
# Sanity check: display (rows, columns) of the loaded table.
data.shape

(53500, 386)

In [134]:
# Inputs: every column except the regression target 'reference'.
# NOTE(review): any identifier column in the file would remain among the features — confirm that is intended.
features = data.drop('reference', axis=1)

In [135]:
# Regression target: the 'reference' column. This rebinds `y` from the synthetic section.
y = data.loc[:,'reference']

In [136]:
# Fixed-seed split of the CT data. This rebinds X_train / X_test / y_train / y_test,
# so every cell below works on CT data rather than on the synthetic set.
X_train, X_test, y_train, y_test = train_test_split(features, y, random_state=24)

# Keep the feature matrices as plain NumPy arrays.
X_train, X_test = X_train.values, X_test.values

## Baseline

In [140]:
# Baseline: LinearSVR with its default loss on standardized features.
lin = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearSVR', LinearSVR(max_iter=10000)),
])

# Only the fit call is timed.
t0 = time()
lin.fit(X_train, y_train)
time_lin_l1 = time() - t0

# Predict on both splits, then compute MSE and MAE for each.
y_pred_train_lin = lin.predict(X_train)
y_pred_lin = lin.predict(X_test)
mse_train_lin_l1 = mean_squared_error(y_train, y_pred_train_lin)
mae_train_lin_l1 = mean_absolute_error(y_train, y_pred_train_lin)
mse_lin_l1 = mean_squared_error(y_test, y_pred_lin)
mae_lin_l1 = mean_absolute_error(y_test, y_pred_lin)

In [141]:
# Baseline: LinearSVR with the squared epsilon-insensitive loss on standardized features.
lin = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearSVR', LinearSVR(max_iter=10000, loss='squared_epsilon_insensitive')),
])

# Only the fit call is timed.
t0 = time()
lin.fit(X_train, y_train)
time_lin_l2 = time() - t0

# Predict on both splits, then compute MSE and MAE for each.
y_pred_train_lin = lin.predict(X_train)
y_pred_lin = lin.predict(X_test)
mse_train_lin_l2 = mean_squared_error(y_train, y_pred_train_lin)
mae_train_lin_l2 = mean_absolute_error(y_train, y_pred_train_lin)
mse_lin_l2 = mean_squared_error(y_test, y_pred_lin)
mae_lin_l2 = mean_absolute_error(y_test, y_pred_lin)



In [151]:
# Baseline: SVR with an RBF kernel on standardized features.
# This rebinds `gauss` and `time_gauss` from the synthetic section.
gauss = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('SVR', SVR(kernel='rbf')),
])

# Only the fit call is timed.
t0 = time()
gauss.fit(X_train, y_train)
time_gauss = time() - t0

# Predict on both splits, then compute MSE and MAE for each.
y_pred_train_gauss = gauss.predict(X_train)
y_pred_gauss = gauss.predict(X_test)
mse_train_gauss = mean_squared_error(y_train, y_pred_train_gauss)
mae_train_gauss = mean_absolute_error(y_train, y_pred_train_gauss)
mse_gauss = mean_squared_error(y_test, y_pred_gauss)
mae_gauss = mean_absolute_error(y_test, y_pred_gauss)

In [152]:
# Persist the scalar CT baseline results, each as a one-element CSV file.
for _path, _value in [
    ('CT_results/mse_lin_l1.csv', mse_lin_l1),
    ('CT_results/mse_train_lin_l1.csv', mse_train_lin_l1),
    ('CT_results/mae_lin_l1.csv', mae_lin_l1),
    ('CT_results/mae_train_lin_l1.csv', mae_train_lin_l1),
    ('CT_results/time_lin_l1.csv', time_lin_l1),
    ('CT_results/mse_lin_l2.csv', mse_lin_l2),
    ('CT_results/mse_train_lin_l2.csv', mse_train_lin_l2),
    ('CT_results/mae_lin_l2.csv', mae_lin_l2),
    ('CT_results/mae_train_lin_l2.csv', mae_train_lin_l2),
    ('CT_results/time_lin_l2.csv', time_lin_l2),
    ('CT_results/mse_gauss.csv', mse_gauss),
    ('CT_results/mse_train_gauss.csv', mse_train_gauss),
    ('CT_results/mae_gauss.csv', mae_gauss),
    ('CT_results/mae_train_gauss.csv', mae_train_gauss),
    ('CT_results/time_gauss.csv', time_gauss),
]:
    np.savetxt(_path, np.asarray([_value]), delimiter=',')

## Performance as a function of the approximation dimension (number of random features / Nyström subsample size) <a name="CT_D"></a>

In [153]:
# Grid of approximation dimensions for the CT experiments: nD integers from 300 to 3500.
nD = 10
list_D = np.linspace(start=300, stop=3500, num=nD, dtype=int)

# Number of repetitions per grid point in the CT experiments.
nb_iter = 5

In [154]:
# One (nD, nb_iter) buffer per recorded quantity: test/train MSE, test/train MAE, fit time.
# `times_rff_l1` rebinds the name used in the synthetic section.
mse_rff_l1, mse_train_rff_l1, mae_rff_l1, mae_train_rff_l1, times_rff_l1 = (
    np.empty((nD, nb_iter)) for _ in range(5)
)

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        # A new pipeline (and therefore a new RFF step) is built for every repetition.
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D=D)),
            ('LinearSVR', LinearSVR(max_iter=10000)),
        ])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_l1[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        y_pred_rff = rff.predict(X_test)
        mse_train_rff_l1[i, j] = mean_squared_error(y_train, y_pred_train_rff)
        mae_train_rff_l1[i, j] = mean_absolute_error(y_train, y_pred_train_rff)
        mse_rff_l1[i, j] = mean_squared_error(y_test, y_pred_rff)
        mae_rff_l1[i, j] = mean_absolute_error(y_test, y_pred_rff)



  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [00:16<02:30, 16.72s/it][A[A

 20%|██        | 2/10 [00:45<02:42, 20.27s/it][A[A

 30%|███       | 3/10 [01:20<02:53, 24.79s/it][A[A

 40%|████      | 4/10 [02:11<03:16, 32.73s/it][A[A

 50%|█████     | 5/10 [03:10<03:22, 40.56s/it][A[A

 60%|██████    | 6/10 [04:21<03:18, 49.65s/it][A[A

 70%|███████   | 7/10 [05:54<03:08, 62.73s/it][A[A

 80%|████████  | 8/10 [08:54<03:15, 97.78s/it][A[A

 90%|█████████ | 9/10 [12:22<02:10, 130.78s/it][A[A

100%|██████████| 10/10 [35:42<00:00, 214.22s/it][A[A


In [155]:
# Persist the CT RFF (default LinearSVR loss) results together with the D grid.
for _path, _array in [
    ('CT_results/mse_rff_l1.csv', mse_rff_l1),
    ('CT_results/mse_train_rff_l1.csv', mse_train_rff_l1),
    ('CT_results/mae_rff_l1.csv', mae_rff_l1),
    ('CT_results/mae_train_rff_l1.csv', mae_train_rff_l1),
    ('CT_results/times_rff_l1.csv', times_rff_l1),
    ('CT_results/list_D.csv', list_D),
]:
    np.savetxt(_path, _array, delimiter=',')

### Random Fourier Features (squared epsilon-insensitive loss)

In [156]:
# One (nD, nb_iter) buffer per recorded quantity: test/train MSE, test/train MAE, fit time.
# `times_rff_l2` rebinds the name used in the synthetic section.
mse_rff_l2, mse_train_rff_l2, mae_rff_l2, mae_train_rff_l2, times_rff_l2 = (
    np.empty((nD, nb_iter)) for _ in range(5)
)

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        # Same sweep as the previous RFF cell, but with the squared epsilon-insensitive loss.
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D=D)),
            ('LinearSVR', LinearSVR(max_iter=10000, loss='squared_epsilon_insensitive')),
        ])

        # Only the fit call is timed.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_l2[i, j] = time() - t0

        y_pred_train_rff = rff.predict(X_train)
        y_pred_rff = rff.predict(X_test)
        mse_train_rff_l2[i, j] = mean_squared_error(y_train, y_pred_train_rff)
        mae_train_rff_l2[i, j] = mean_absolute_error(y_train, y_pred_train_rff)
        mse_rff_l2[i, j] = mean_squared_error(y_test, y_pred_rff)
        mae_rff_l2[i, j] = mean_absolute_error(y_test, y_pred_rff)



  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [00:28<04:14, 28.23s/it][A[A

 20%|██        | 2/10 [01:17<04:37, 34.68s/it][A[A

 30%|███       | 3/10 [02:32<05:26, 46.65s/it][A[A

 40%|████      | 4/10 [04:06<06:05, 60.93s/it][A[A

 50%|█████     | 5/10 [06:23<06:58, 83.66s/it][A[A

 60%|██████    | 6/10 [08:47<06:47, 101.91s/it][A[A

 70%|███████   | 7/10 [11:55<06:22, 127.61s/it][A[A

 80%|████████  | 8/10 [32:16<15:11, 455.72s/it][A[A

 90%|█████████ | 9/10 [48:14<10:06, 606.21s/it][A[A

100%|██████████| 10/10 [2:15:59<00:00, 815.98s/it] [A[A


In [157]:
# Persist the CT RFF (squared epsilon-insensitive loss) results; the D grid is written again.
for _path, _array in [
    ('CT_results/mse_rff_l2.csv', mse_rff_l2),
    ('CT_results/mse_train_rff_l2.csv', mse_train_rff_l2),
    ('CT_results/mae_rff_l2.csv', mae_rff_l2),
    ('CT_results/mae_train_rff_l2.csv', mae_train_rff_l2),
    ('CT_results/times_rff_l2.csv', times_rff_l2),
    ('CT_results/list_D.csv', list_D),
]:
    np.savetxt(_path, _array, delimiter=',')

### FALKON

In [6]:
# One (nD, nb_iter) buffer per recorded quantity: test/train MSE, test/train MAE, fit time.
mse_falkon, mse_train_falkon, mae_falkon, mae_train_falkon, times_falkon = (
    np.empty((nD, nb_iter)) for _ in range(5)
)

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        # NOTE(review): FALKON is not among the names imported in the notebook's
        # import cell — confirm where it comes from before a fresh-kernel run.
        falkon = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('FALKON', FALKON(m=D)),
        ])

        # Only the fit call is timed.
        t0 = time()
        falkon.fit(X_train, y_train)
        times_falkon[i, j] = time() - t0

        y_pred_train_falkon = falkon.predict(X_train)
        y_pred_falkon = falkon.predict(X_test)
        mse_train_falkon[i, j] = mean_squared_error(y_train, y_pred_train_falkon)
        mae_train_falkon[i, j] = mean_absolute_error(y_train, y_pred_train_falkon)
        mse_falkon[i, j] = mean_squared_error(y_test, y_pred_falkon)
        mae_falkon[i, j] = mean_absolute_error(y_test, y_pred_falkon)

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  0%|          | 0/100 [00:00<?, ?it/s, improvement=0.00853][A
  1%|          | 1/100 [00:00<00:24,  4.08it/s, improvement=0.00853][A
  1%|          | 1/100 [00:00<00:24,  4.08it/s, improvement=0.02]   [A
  2%|▏         | 2/100 [00:00<00:24,  3.99it/s, improvement=0.02][A
  2%|▏         | 2/100 [00:00<00:24,  3.99it/s, improvement=0.0311][A
  3%|▎         | 3/100 [00:00<00:24,  3.98it/s, improvement=0.0311][A
  3%|▎         | 3/100 [00:01<00:24,  3.98it/s, improvement=0.105] [A
  4%|▍         | 4/100 [00:01<00:24,  3.84it/s, improvement=0.105][A
  4%|▍         | 4/100 [00:01<00:24,  3.84it/s, improvement=0.691][A
  5%|▌         | 5/100 [00:01<00:25,  3.73it/s, improvement=0.691][A
  5%|▌         | 5/100 [00:01<00:25,  3.73it/s, improvement=0.199][A
  6%|▌         | 6/100 [00:01<00:25,  3.75it/s, improvement=0.199][A
  6%|▌         | 6/100 [00:01<00:25,  3.75it/s, improvement=0.000143][A
  7%|

In [None]:
# Write the FALKON result arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_falkon.csv', mse_falkon),
    ('CT_results/mse_train_falkon.csv', mse_train_falkon),
    ('CT_results/mae_falkon.csv', mae_falkon),
    ('CT_results/mae_train_falkon.csv', mae_train_falkon),
    ('CT_results/times_falkon.csv', times_falkon),
):
    np.savetxt(csv_path, scores, delimiter=',')

### Closed form Nystrom

In [158]:
# Closed-form Nystrom benchmark: one fit per (value D of list_D, repetition j),
# with D passed to PlainNystromRegressor as `m`. Arrays are indexed [i, j],
# where i is the position of D in list_D.
(mse_nys_plain, mse_train_nys_plain,
 mae_nys_plain, mae_train_nys_plain,
 times_nys_plain) = (np.empty((nD, nb_iter)) for _ in range(5))

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', PlainNystromRegressor(m=D)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_plain[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_plain[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_plain[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_plain[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_plain[i, j] = mean_absolute_error(y_test, y_pred_nys)



  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [00:03<00:35,  3.99s/it][A[A

 20%|██        | 2/10 [00:12<00:42,  5.26s/it][A[A

 30%|███       | 3/10 [00:26<00:54,  7.83s/it][A[A

 40%|████      | 4/10 [00:52<01:20, 13.35s/it][A[A

 50%|█████     | 5/10 [01:27<01:39, 19.85s/it][A[A

 60%|██████    | 6/10 [02:09<01:46, 26.68s/it][A[A

 70%|███████   | 7/10 [03:07<01:48, 36.07s/it][A[A

 80%|████████  | 8/10 [04:24<01:36, 48.13s/it][A[A

 90%|█████████ | 9/10 [06:03<01:03, 63.36s/it][A[A

100%|██████████| 10/10 [08:00<00:00, 48.10s/it][A[A


In [159]:
# Write the closed-form Nystrom result arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_plain.csv', mse_nys_plain),
    ('CT_results/mse_train_nys_plain.csv', mse_train_nys_plain),
    ('CT_results/mae_nys_plain.csv', mae_nys_plain),
    ('CT_results/mae_train_nys_plain.csv', mae_train_nys_plain),
    ('CT_results/times_nys_plain.csv', times_nys_plain),
):
    np.savetxt(csv_path, scores, delimiter=',')

### SGD Nystrom l2 loss

In [None]:
# SGD Nystrom benchmark with the regressor's default loss: one fit per
# (value D of list_D, repetition j), with D passed as `m`. Arrays are indexed
# [i, j], where i is the position of D in list_D.
(mse_nys_l2, mse_train_nys_l2,
 mae_nys_l2, mae_train_nys_l2,
 times_nys_l2) = (np.empty((nD, nb_iter)) for _ in range(5))

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=D)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l2[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l2[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l2[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l2[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l2[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [3]:
# Write the SGD Nystrom (l2 loss) result arrays to CT_results/, one comma-separated file each.
# NOTE(review): the stored output of this cell is "NameError: name 'np' is not defined",
# so it was apparently last run before the import cell; re-run it after the imports
# and after the cell that fills these arrays.
for csv_path, scores in (
    ('CT_results/mse_nys_l2.csv', mse_nys_l2),
    ('CT_results/mse_train_nys_l2.csv', mse_train_nys_l2),
    ('CT_results/mae_nys_l2.csv', mae_nys_l2),
    ('CT_results/mae_train_nys_l2.csv', mae_train_nys_l2),
    ('CT_results/times_nys_l2.csv', times_nys_l2),
):
    np.savetxt(csv_path, scores, delimiter=',')

NameError: name 'np' is not defined

### SGD Nystrom l1 loss

In [None]:
# SGD Nystrom benchmark with loss='epsilon_insensitive': one fit per
# (value D of list_D, repetition j), with D passed as `m`. Arrays are indexed
# [i, j], where i is the position of D in list_D.
(mse_nys_l1, mse_train_nys_l1,
 mae_nys_l1, mae_train_nys_l1,
 times_nys_l1) = (np.empty((nD, nb_iter)) for _ in range(5))

for i, D in enumerate(tqdm(list_D)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=D, loss='epsilon_insensitive')),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l1[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l1[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l1[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l1[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l1[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [None]:
# Write the SGD Nystrom (l1 loss) result arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_l1.csv', mse_nys_l1),
    ('CT_results/mse_train_nys_l1.csv', mse_train_nys_l1),
    ('CT_results/mae_nys_l1.csv', mae_nys_l1),
    ('CT_results/mae_train_nys_l1.csv', mae_train_nys_l1),
    ('CT_results/times_nys_l1.csv', times_nys_l1),
):
    np.savetxt(csv_path, scores, delimiter=',')

In [40]:
# Print the Gaussian-kernel SVR timing for comparison with the runs above.
# time_gauss is set outside this part of the notebook (presumably the baseline's
# fit time in seconds -- confirm against the cell that defines it).
print("SVR with gaussian kernel execution time:", time_gauss)

SVR with gaussian kernel execution time: 1274.2267017364502


## Influence of the regularization $\lambda$  <a name="CT_reg"></a>

In [121]:
# Grid of regularisation values swept below: the powers of ten
# 10**eC_min, ..., 10**eC_max (both bounds included).
eC_min, eC_max = -6, 1
list_C = [10**exponent for exponent in range(eC_min, eC_max+1)]
nC = len(list_C)  # one grid point per exponent, i.e. eC_max - eC_min + 1
M = 10

### Linear SVR l1 loss

In [None]:
# Linear SVR with LinearSVR's default loss, swept over the regularisation grid
# list_C. The fit is deterministic enough that a single run per value is
# recorded, hence the (nC, 1) result arrays; row i corresponds to list_C[i].
mse_lin_l1_reg = np.empty((nC,1))
mse_train_lin_l1_reg = np.empty((nC,1))
mae_lin_l1_reg = np.empty((nC,1))
mae_train_lin_l1_reg = np.empty((nC,1))
times_lin_l1_reg = np.empty((nC,1))

# Fix: the loop variable used to be named `D` while the model was built with
# `C=C`, so every fit reused whatever `C` was left over from another cell
# (or raised NameError on a fresh kernel) instead of sweeping list_C.
for i,C in enumerate(tqdm(list_C)):
    lin = Pipeline([('Scaler',StandardScaler()),
                    ('LinearSVR', LinearSVR(max_iter=10000, C=C)),])

    # Only the fit is timed; the predictions below are not.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin_l1_reg[i] = time() - t0

    # In-sample errors.
    y_pred_train_lin = lin.predict(X_train)
    mse_train_lin_l1_reg[i] = mean_squared_error(y_train, y_pred_train_lin)
    mae_train_lin_l1_reg[i] = mean_absolute_error(y_train, y_pred_train_lin)

    # Held-out errors.
    y_pred_lin = lin.predict(X_test)
    mse_lin_l1_reg[i] = mean_squared_error(y_test, y_pred_lin)
    mae_lin_l1_reg[i] = mean_absolute_error(y_test, y_pred_lin)

In [None]:
# Write the Linear SVR (l1 loss) regularisation-sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_lin_l1_reg.csv', mse_lin_l1_reg),
    ('CT_results/mse_train_lin_l1_reg.csv', mse_train_lin_l1_reg),
    ('CT_results/mae_lin_l1_reg.csv', mae_lin_l1_reg),
    ('CT_results/mae_train_lin_l1_reg.csv', mae_train_lin_l1_reg),
    ('CT_results/times_lin_l1_reg.csv', times_lin_l1_reg),
):
    np.savetxt(csv_path, scores, delimiter=',')

### Linear SVR l2 loss

In [None]:
# Linear SVR with loss='squared_epsilon_insensitive', swept over the
# regularisation grid list_C. One run per value, stored in (nC, 1) arrays whose
# row i corresponds to list_C[i].
(mse_lin_l2_reg, mse_train_lin_l2_reg,
 mae_lin_l2_reg, mae_train_lin_l2_reg,
 times_lin_l2_reg) = (np.empty((nC, 1)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    lin = Pipeline(steps=[
        ('Scaler', StandardScaler()),
        ('LinearSVR', LinearSVR(max_iter=10000, loss='squared_epsilon_insensitive', C=C)),
    ])

    # Only the fit is timed; the predictions below are not.
    t0 = time()
    lin.fit(X_train, y_train)
    times_lin_l2_reg[i] = time() - t0

    # Training split is predicted first, then the test split.
    y_pred_train_lin, y_pred_lin = lin.predict(X_train), lin.predict(X_test)

    mse_train_lin_l2_reg[i] = mean_squared_error(y_train, y_pred_train_lin)
    mae_train_lin_l2_reg[i] = mean_absolute_error(y_train, y_pred_train_lin)
    mse_lin_l2_reg[i] = mean_squared_error(y_test, y_pred_lin)
    mae_lin_l2_reg[i] = mean_absolute_error(y_test, y_pred_lin)

In [None]:
# Write the Linear SVR (l2 loss) regularisation-sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_lin_l2_reg.csv', mse_lin_l2_reg),
    ('CT_results/mse_train_lin_l2_reg.csv', mse_train_lin_l2_reg),
    ('CT_results/mae_lin_l2_reg.csv', mae_lin_l2_reg),
    ('CT_results/mae_train_lin_l2_reg.csv', mae_train_lin_l2_reg),
    ('CT_results/times_lin_l2_reg.csv', times_lin_l2_reg),
):
    np.savetxt(csv_path, scores, delimiter=',')

### Random Fourier Features with epsilon-insensitive loss

In [42]:
# RFF (D = 2000) followed by LinearSVR with its default loss, swept over the
# regularisation grid list_C with nb_iter repetitions per value. Arrays are
# indexed [i, j], where i is the position of C in list_C.
# NOTE(review): these arrays carry no `_reg` suffix although they are saved as
# *_rff_l1_reg.csv; presumably they reuse (and overwrite in memory) the names of
# the RFF l1 results from the "performance with D" experiment -- confirm.
(mse_rff_l1, mse_train_rff_l1,
 mae_rff_l1, mae_train_rff_l1,
 times_rff_l1) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D = 2000)),
            ('LinearSVR', LinearSVR(C = C, max_iter=10000)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_l1[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_rff, y_pred_rff = rff.predict(X_train), rff.predict(X_test)

        mse_train_rff_l1[i, j] = mean_squared_error(y_train, y_pred_train_rff)
        mae_train_rff_l1[i, j] = mean_absolute_error(y_train, y_pred_train_rff)
        mse_rff_l1[i, j] = mean_squared_error(y_test, y_pred_rff)
        mae_rff_l1[i, j] = mean_absolute_error(y_test, y_pred_rff)

100%|██████████| 8/8 [23:39<00:00, 177.40s/it]


In [None]:
# Write the RFF (l1 loss) regularisation-sweep arrays to CT_results/*_rff_l1_reg.csv.
# The arrays themselves are named without the `_reg` suffix.
for csv_path, scores in (
    ('CT_results/mse_rff_l1_reg.csv', mse_rff_l1),
    ('CT_results/mse_train_rff_l1_reg.csv', mse_train_rff_l1),
    ('CT_results/mae_rff_l1_reg.csv', mae_rff_l1),
    ('CT_results/mae_train_rff_l1_reg.csv', mae_train_rff_l1),
    ('CT_results/times_rff_l1_reg.csv', times_rff_l1),
):
    np.savetxt(csv_path, scores, delimiter=',')

### Random Fourier Features with squared epsilon-insensitive loss

In [43]:
# RFF (D = 2000) followed by LinearSVR with loss='squared_epsilon_insensitive',
# swept over the regularisation grid list_C with nb_iter repetitions per value.
# Arrays are indexed [i, j], where i is the position of C in list_C.
# NOTE(review): mae_rff_l2, mae_train_rff_l2 and times_rff_l2 are also the names
# written to CT_results/*_rff_l2.csv by the "performance with D" experiment, so
# running this cell replaces those in-memory results with the (nC, nb_iter) sweep.
(mse_rff_l2, mse_train_rff_l2,
 mae_rff_l2, mae_train_rff_l2,
 times_rff_l2) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        rff = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('RFF', RFF(D = 2000)),
            ('LinearSVR', LinearSVR(C = C, loss='squared_epsilon_insensitive', max_iter=10000)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        rff.fit(X_train, y_train)
        times_rff_l2[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_rff, y_pred_rff = rff.predict(X_train), rff.predict(X_test)

        mse_train_rff_l2[i, j] = mean_squared_error(y_train, y_pred_train_rff)
        mae_train_rff_l2[i, j] = mean_absolute_error(y_train, y_pred_train_rff)
        mse_rff_l2[i, j] = mean_squared_error(y_test, y_pred_rff)
        mae_rff_l2[i, j] = mean_absolute_error(y_test, y_pred_rff)

100%|██████████| 8/8 [40:19<00:00, 302.40s/it]


In [44]:
# Write the RFF (l2 loss) regularisation-sweep arrays to CT_results/*_rff_l2_reg.csv.
# The arrays themselves are named without the `_reg` suffix.
for csv_path, scores in (
    ('CT_results/mse_rff_l2_reg.csv', mse_rff_l2),
    ('CT_results/mse_train_rff_l2_reg.csv', mse_train_rff_l2),
    ('CT_results/mae_rff_l2_reg.csv', mae_rff_l2),
    ('CT_results/mae_train_rff_l2_reg.csv', mae_train_rff_l2),
    ('CT_results/times_rff_l2_reg.csv', times_rff_l2),
):
    np.savetxt(csv_path, scores, delimiter=',')

### SGD Nystrom l2 loss l1 penalty

In [None]:
# SGD Nystrom (m=1000, default loss, penalty='l1'), swept over list_C passed as
# lambda_reg, with nb_iter repetitions per value. Arrays are indexed [i, j],
# where i is the position of C in list_C.
(mse_nys_l2_reg_l1, mse_train_nys_l2_reg_l1,
 mae_nys_l2_reg_l1, mae_train_nys_l2_reg_l1,
 times_nys_l2_reg_l1) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=1000, lambda_reg=C, penalty='l1')),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l2_reg_l1[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l2_reg_l1[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l2_reg_l1[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l2_reg_l1[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l2_reg_l1[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [None]:
# Write the SGD Nystrom (l2 loss, l1 penalty) sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_l2_reg_l1.csv', mse_nys_l2_reg_l1),
    ('CT_results/mse_train_nys_l2_reg_l1.csv', mse_train_nys_l2_reg_l1),
    ('CT_results/mae_nys_l2_reg_l1.csv', mae_nys_l2_reg_l1),
    ('CT_results/mae_train_nys_l2_reg_l1.csv', mae_train_nys_l2_reg_l1),
    ('CT_results/times_nys_l2_reg_l1.csv', times_nys_l2_reg_l1),
):
    np.savetxt(csv_path, scores, delimiter=',')

### SGD Nystrom l1 loss l1 penalty

In [None]:
# SGD Nystrom (m=1000, loss='epsilon_insensitive', penalty='l1'), swept over
# list_C passed as lambda_reg, with nb_iter repetitions per value. Arrays are
# indexed [i, j], where i is the position of C in list_C.
(mse_nys_l1_reg_l1, mse_train_nys_l1_reg_l1,
 mae_nys_l1_reg_l1, mae_train_nys_l1_reg_l1,
 times_nys_l1_reg_l1) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=1000, lambda_reg=C, penalty='l1', loss='epsilon_insensitive')),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l1_reg_l1[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l1_reg_l1[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l1_reg_l1[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l1_reg_l1[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l1_reg_l1[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [None]:
# Write the SGD Nystrom (l1 loss, l1 penalty) sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_l1_reg_l1.csv', mse_nys_l1_reg_l1),
    ('CT_results/mse_train_nys_l1_reg_l1.csv', mse_train_nys_l1_reg_l1),
    ('CT_results/mae_nys_l1_reg_l1.csv', mae_nys_l1_reg_l1),
    ('CT_results/mae_train_nys_l1_reg_l1.csv', mae_train_nys_l1_reg_l1),
    ('CT_results/times_nys_l1_reg_l1.csv', times_nys_l1_reg_l1),
):
    np.savetxt(csv_path, scores, delimiter=',')

### SGD Nystrom l1 loss l2 penalty

In [None]:
# SGD Nystrom (m=1000, loss='epsilon_insensitive', default penalty), swept over
# list_C passed as lambda_reg, with nb_iter repetitions per value. Arrays are
# indexed [i, j], where i is the position of C in list_C.
(mse_nys_l1_reg_l2, mse_train_nys_l1_reg_l2,
 mae_nys_l1_reg_l2, mae_train_nys_l1_reg_l2,
 times_nys_l1_reg_l2) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=1000, lambda_reg=C, loss='epsilon_insensitive')),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l1_reg_l2[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l1_reg_l2[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l1_reg_l2[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l1_reg_l2[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l1_reg_l2[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [None]:
# Write the SGD Nystrom (l1 loss, l2 penalty) sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_l1_reg_l2.csv', mse_nys_l1_reg_l2),
    ('CT_results/mse_train_nys_l1_reg_l2.csv', mse_train_nys_l1_reg_l2),
    ('CT_results/mae_nys_l1_reg_l2.csv', mae_nys_l1_reg_l2),
    ('CT_results/mae_train_nys_l1_reg_l2.csv', mae_train_nys_l1_reg_l2),
    ('CT_results/times_nys_l1_reg_l2.csv', times_nys_l1_reg_l2),
):
    np.savetxt(csv_path, scores, delimiter=',')

### SGD Nystrom l2 loss l2 penalty

In [None]:
# SGD Nystrom (m=1000, default loss and penalty), swept over list_C passed as
# lambda_reg, with nb_iter repetitions per value. Arrays are indexed [i, j],
# where i is the position of C in list_C.
(mse_nys_l2_reg_l2, mse_train_nys_l2_reg_l2,
 mae_nys_l2_reg_l2, mae_train_nys_l2_reg_l2,
 times_nys_l2_reg_l2) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', SGDPlainNystromRegressor(m=1000, lambda_reg=C)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_l2_reg_l2[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_l2_reg_l2[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_l2_reg_l2[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_l2_reg_l2[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_l2_reg_l2[i, j] = mean_absolute_error(y_test, y_pred_nys)

In [None]:
# Write the SGD Nystrom (l2 loss, l2 penalty) sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_l2_reg_l2.csv', mse_nys_l2_reg_l2),
    ('CT_results/mse_train_nys_l2_reg_l2.csv', mse_train_nys_l2_reg_l2),
    ('CT_results/mae_nys_l2_reg_l2.csv', mae_nys_l2_reg_l2),
    ('CT_results/mae_train_nys_l2_reg_l2.csv', mae_train_nys_l2_reg_l2),
    ('CT_results/times_nys_l2_reg_l2.csv', times_nys_l2_reg_l2),
):
    np.savetxt(csv_path, scores, delimiter=',')

### Nystrom closed form

In [123]:
# Closed-form Nystrom (m=3500), swept over list_C passed as lambda_reg, with
# nb_iter repetitions per value. Arrays are indexed [i, j], where i is the
# position of C in list_C.
(mse_nys_plain_reg, mse_train_nys_plain_reg,
 mae_nys_plain_reg, mae_train_nys_plain_reg,
 times_nys_plain_reg) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        nys = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('nys', PlainNystromRegressor(m=3500, lambda_reg=C)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        nys.fit(X_train, y_train)
        times_nys_plain_reg[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_nys, y_pred_nys = nys.predict(X_train), nys.predict(X_test)

        mse_train_nys_plain_reg[i, j] = mean_squared_error(y_train, y_pred_train_nys)
        mae_train_nys_plain_reg[i, j] = mean_absolute_error(y_train, y_pred_train_nys)
        mse_nys_plain_reg[i, j] = mean_squared_error(y_test, y_pred_nys)
        mae_nys_plain_reg[i, j] = mean_absolute_error(y_test, y_pred_nys)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [04:29<31:23, 269.05s/it][A[A

 25%|██▌       | 2/8 [09:03<27:04, 270.78s/it][A[A

 38%|███▊      | 3/8 [13:38<22:39, 271.88s/it][A[A

 50%|█████     | 4/8 [17:57<17:51, 267.97s/it][A[A

 62%|██████▎   | 5/8 [22:29<13:27, 269.25s/it][A[A

 75%|███████▌  | 6/8 [26:54<08:56, 268.07s/it][A[A

 88%|████████▊ | 7/8 [31:39<04:32, 272.96s/it][A[A

100%|██████████| 8/8 [36:34<00:00, 274.35s/it][A[A


In [131]:
# Write the closed-form Nystrom regularisation-sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_nys_plain_reg.csv', mse_nys_plain_reg),
    ('CT_results/mse_train_nys_plain_reg.csv', mse_train_nys_plain_reg),
    ('CT_results/mae_nys_plain_reg.csv', mae_nys_plain_reg),
    ('CT_results/mae_train_nys_plain_reg.csv', mae_train_nys_plain_reg),
    ('CT_results/times_nys_plain_reg.csv', times_nys_plain_reg),
):
    np.savetxt(csv_path, scores, delimiter=',')

### FALKON

In [None]:
# FALKON (m=300), swept over list_C passed as lambda_reg, with nb_iter
# repetitions per value. Arrays are indexed [i, j], where i is the position of
# C in list_C.
# NOTE(review): FALKON is not among the names imported in the notebook's import
# cell -- confirm it is in scope before running this cell.
(mse_falkon_reg, mse_train_falkon_reg,
 mae_falkon_reg, mae_train_falkon_reg,
 times_falkon_reg) = (np.empty((nC, nb_iter)) for _ in range(5))

for i, C in enumerate(tqdm(list_C)):
    for j in range(nb_iter):
        falkon = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            ('FALKON', FALKON(m=300, lambda_reg=C)),
        ])

        # Only the fit is timed; the predictions below are not.
        t0 = time()
        falkon.fit(X_train, y_train)
        times_falkon_reg[i, j] = time() - t0

        # Training split is predicted first, then the test split.
        y_pred_train_falkon, y_pred_falkon = falkon.predict(X_train), falkon.predict(X_test)

        mse_train_falkon_reg[i, j] = mean_squared_error(y_train, y_pred_train_falkon)
        mae_train_falkon_reg[i, j] = mean_absolute_error(y_train, y_pred_train_falkon)
        mse_falkon_reg[i, j] = mean_squared_error(y_test, y_pred_falkon)
        mae_falkon_reg[i, j] = mean_absolute_error(y_test, y_pred_falkon)

In [None]:
# Write the FALKON regularisation-sweep arrays to CT_results/, one comma-separated file each.
for csv_path, scores in (
    ('CT_results/mse_falkon_reg.csv', mse_falkon_reg),
    ('CT_results/mse_train_falkon_reg.csv', mse_train_falkon_reg),
    ('CT_results/mae_falkon_reg.csv', mae_falkon_reg),
    ('CT_results/mae_train_falkon_reg.csv', mae_train_falkon_reg),
    ('CT_results/times_falkon_reg.csv', times_falkon_reg),
):
    np.savetxt(csv_path, scores, delimiter=',')

In [18]:
# Print the Gaussian-kernel SVR timing for comparison with the sweeps above.
# time_gauss is set outside this part of the notebook (presumably the baseline's
# fit time in seconds -- confirm against the cell that defines it).
print("SVR with gaussian kernel execution time:", time_gauss)

SVR with gaussian kernel execution time: 1274.2267017364502
