# Regression with synthetic data

In [1]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Synthetic regression problem: 20,000 samples, 120 features, 50 of them informative.
# random_state pins the generated data so the scores below can be reproduced after a
# kernel restart (same seed value as the train/test split uses).
# `noise` keeps its default (0.0 in scikit-learn's make_regression), so the target
# carries no added noise.
X, y = make_regression(n_samples=20000, n_features=120, n_informative=50, random_state=24)

In [3]:
# Hold out a test split (default proportions, fixed seed), then standardize.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

# The scaler learns its statistics from the training rows only; the same fitted
# transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
%%time
reg = LinearSVR()
reg.fit(X_train, y_train)

CPU times: user 31.6 ms, sys: 40.9 ms, total: 72.4 ms
Wall time: 108 ms


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [7]:
# Test-set mean squared error of the linear baseline.
pred = reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred)

8.262968008243279e-22

In [8]:
%%time
reg_rbf = SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)

CPU times: user 35.9 s, sys: 353 ms, total: 36.3 s
Wall time: 36.1 s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [9]:
# Test-set mean squared error of the RBF-kernel SVR.
pred_rbf = reg_rbf.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_rbf)

144545.83474744283

In [10]:
%%time
d = X.shape[1]
D = 300
mu = np.zeros(d)
sigma = (1/d)*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 1.2 s, sys: 249 ms, total: 1.45 s
Wall time: 593 ms


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [11]:
# Test-set mean squared error of the linear SVR trained on the cos/sin random features.
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_true=y_test, y_pred=pred_rff)

136825.5451915388

In [12]:
%%time
d = X.shape[1]
D = 300
mu = np.zeros(d)
sigma = (1/d)*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
b = np.random.uniform(0, 2*np.pi, D)
Z_train = np.sqrt(2/D) * np.cos(np.dot(X_train, w.T) + b)
Z_test = np.sqrt(2/D) * np.cos(np.dot(X_test, w.T) + b)
reg_rff_cos = LinearSVR()
reg_rff_cos.fit(Z_train, y_train)

CPU times: user 980 ms, sys: 96.2 ms, total: 1.08 s
Wall time: 321 ms


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [13]:
# Test-set mean squared error of the linear SVR trained on the cosine-with-phase random features.
pred_rff_cos = reg_rff_cos.predict(Z_test)
mean_squared_error(y_true=y_test, y_pred=pred_rff_cos)

140188.5862421703

# Classification with synthetic data

In [14]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

In [15]:
# Synthetic 5-class problem: 20,000 samples, 120 features, 50 of them informative.
# random_state pins the generated data so the accuracies below can be reproduced
# after a kernel restart (same seed value as the train/test split uses).
X, y = make_classification(n_samples=20000, n_features=120, n_informative=50, n_classes=5, random_state=24)

In [16]:
# Hold out a test split (default proportions, fixed seed), then standardize.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

# The scaler learns its statistics from the training rows only; the same fitted
# transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
%%time
clf = LinearSVC(dual=False)
clf.fit(X_train, y_train)

CPU times: user 743 ms, sys: 18.5 ms, total: 761 ms
Wall time: 775 ms


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [32]:
# Test-set accuracy of the linear baseline.
pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.582

In [19]:
%%time
clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train, y_train)

CPU times: user 43.5 s, sys: 501 ms, total: 44 s
Wall time: 44.3 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
# Test-set accuracy of the RBF-kernel SVC.
pred_rbf = clf_rbf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_rbf)

0.8668

In [33]:
%%time
d = X.shape[1]
D = 1000
mu = np.zeros(d)
sigma = (1/d)*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
clf_rff = LinearSVC(dual=False)
clf_rff.fit(Z_train, y_train)

CPU times: user 21.8 s, sys: 621 ms, total: 22.4 s
Wall time: 20.5 s


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [34]:
# Test-set accuracy of the linear SVC trained on the cos/sin random features.
pred_rff = clf_rff.predict(Z_test)
accuracy_score(y_true=y_test, y_pred=pred_rff)

0.706

In [35]:
%%time
d = X.shape[1]
D = 1000
mu = np.zeros(d)
sigma = (1/d)*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
b = np.random.uniform(0, 2*np.pi, D)
Z_train = np.sqrt(2/D) * np.cos(np.dot(X_train, w.T) + b)
Z_test = np.sqrt(2/D) * np.cos(np.dot(X_test, w.T) + b)
clf_rff_cos = LinearSVC(dual=False)
clf_rff_cos.fit(Z_train, y_train)

CPU times: user 11.3 s, sys: 236 ms, total: 11.6 s
Wall time: 10.3 s


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [36]:
# Test-set accuracy of the linear SVC trained on the cosine-with-phase random features.
pred_rff_cos = clf_rff_cos.predict(Z_test)
accuracy_score(y_true=y_test, y_pred=pred_rff_cos)

0.6566