# Regression with synthetic data

## Imports

In [1]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
np.random.seed(42)

## Data Generation

In [2]:
X, y = make_regression(n_samples=30000, n_features=120)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Baselines
### Linear

In [7]:
%%time
reg = LinearSVR(dual=True)
reg.fit(X_train, y_train)

Wall time: 68 ms


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [8]:
pred = reg.predict(X_test)
mean_squared_error(y_test,pred)

9.005241654148815e-22

### Radial Basis Function

In [10]:
%%time
reg_rbf = SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)

Wall time: 1min 4s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [11]:
pred_rbf = reg_rbf.predict(X_test)
mean_squared_error(y_test,pred_rbf)

22065.740532108248

## Nystrom

In [13]:
%%time
from rkhs.nystrom import PlainNystrom
nys = PlainNystrom(m=1000)
Z_train = nys.fit_transform(X_train)
Z_test = nys.transform(X_test)

Wall time: 725 ms


In [14]:
reg_nys = LinearSVR(dual=True)
reg_nys.fit(Z_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [15]:
pred_nys = reg_nys.predict(Z_test)
mean_squared_error(y_test, pred_nys)

35873.47836009312

# Classification with synthetic data
## Imports

In [16]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

## Data Generation

In [17]:
X, y = make_classification(n_samples=20000, n_features=120, n_informative=50, n_classes=5)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Baselines
### Linear

In [19]:
%%time
clf = LinearSVC(dual=X_train.shape[0] <= X_train.shape[1])
clf.fit(X_train, y_train)

Wall time: 700 ms


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [20]:
pred = clf.predict(X_test)
accuracy_score(y_test,pred)

0.576

### Radial Basis Function

In [23]:
%%time
clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train, y_train)

Wall time: 41.1 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
pred_rbf = clf_rbf.predict(X_test)
accuracy_score(y_test,pred_rbf)

0.861

## Nystrom

In [25]:
%%time
from rkhs.nystrom import PlainNystrom
nys = PlainNystrom(m=30000)
Z_train = nys.fit_transform(X_train)
Z_test = nys.transform(X_test)

Wall time: 7.54 s


In [None]:
%%time
clf_nys = LinearSVC(dual= Z_train.shape[0] <= Z_train.shape[1])
clf_nys.fit(Z_train, y_train)

In [None]:
pred_nys = clf_nys.predict(Z_test)
accuracy_score(y_test, pred_nys)