# Regression with synthetic data

In [1]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
X, y = make_regression(n_samples=20000, n_features=120, n_informative=50)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
%%time
reg = SVR(kernel='linear')
reg.fit(X_train, y_train)

CPU times: user 10.6 s, sys: 84.7 ms, total: 10.7 s
Wall time: 10.4 s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [8]:
pred = reg.predict(X_test)
mean_squared_error(y_test,pred)

0.001679080419729052

In [6]:
%%time
reg_rbf = SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)

CPU times: user 35.2 s, sys: 934 ms, total: 36.1 s
Wall time: 38.2 s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [9]:
pred_rbf = reg_rbf.predict(X_test)
mean_squared_error(y_test,pred_rbf)

149458.489090448

In [10]:
%%time
d = X.shape[1]
D = 30
mu = np.zeros(d)
sigma = np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = SVR(kernel='linear')
reg_rff.fit(Z_train, y_train)

CPU times: user 14.8 s, sys: 502 ms, total: 15.3 s
Wall time: 15.7 s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [11]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

171391.098899383

# Classification with synthetic data

In [12]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [13]:
X, y = make_classification(n_samples=20000, n_features=120, n_informative=50, n_classes=5)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
%%time
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

CPU times: user 1min 50s, sys: 868 ms, total: 1min 51s
Wall time: 1min 52s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [16]:
pred = clf.predict(X_test)
accuracy_score(y_test,pred)

0.6298

In [17]:
%%time
clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train, y_train)

CPU times: user 43.3 s, sys: 417 ms, total: 43.7 s
Wall time: 43.9 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
pred_rbf = clf_rbf.predict(X_test)
accuracy_score(y_test,pred_rbf)

0.8636

In [19]:
%%time
d = X.shape[1]
D = 30
mu = np.zeros(d)
sigma = np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
clf_rff = SVC(kernel='linear')
clf_rff.fit(Z_train, y_train)

CPU times: user 29.7 s, sys: 566 ms, total: 30.2 s
Wall time: 30.1 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
pred_rff = clf_rff.predict(Z_test)
accuracy_score(y_test,pred_rff)

0.1932