In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [7]:
from google_drive_downloader import GoogleDriveDownloader as gdd
import os

DATA = {
    "YearPredictionMSD.txt.zip": "1R3CXYssjftxi7HIXXgeWCAZh65ByEumt",
    "slice_localization_data.zip":"1ZJamggugQuj-sE1EFWfghhR3dEY2K9Ib",
}

for file_name, file_id in DATA.items():
    output_file = os.path.join('data', file_name)
    gdd.download_file_from_google_drive(file_id=file_id, dest_path=output_file)

Downloading 1R3CXYssjftxi7HIXXgeWCAZh65ByEumt into data/YearPredictionMSD.txt.zip... Done.
Downloading 1ZJamggugQuj-sE1EFWfghhR3dEY2K9Ib into data/slice_localization_data.zip... Done.


In [8]:
data = pd.read_csv('data/YearPredictionMSD.txt.zip', header = None)

In [9]:
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [11]:
features = data.drop(0, axis=1)

In [13]:
y = data.iloc[:,0]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(features, y, random_state=24)
X_train = X_train.values
X_test = X_test.values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
%%time
reg = LinearSVR()
reg.fit(X_train, y_train)

CPU times: user 2min 49s, sys: 1.54 s, total: 2min 51s
Wall time: 2min 53s




LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [65]:
pred = reg.predict(X_test)
mean_squared_error(y_test,pred)

96.76618594248463

In [None]:
%%time
reg_rbf = SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)

In [None]:
pred_rbf = reg_rbf.predict(X_test)
mean_squared_error(y_test,pred_rbf)

In [62]:
%%time
d = features.shape[1]
D = 300
mu = np.zeros(d)
sigma = (1/(d*X_train.var()))*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 15.6 s, sys: 10.4 s, total: 26 s
Wall time: 25.8 s


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [63]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

93.18170635083891

In [10]:
data = pd.read_csv('data/slice_localization_data.zip')

In [11]:
data.head(5)

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


In [71]:
features = data.drop('reference', axis=1)

In [73]:
y = data.loc[:,'reference']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(features, y, random_state=24)
X_train = X_train.values
X_test = X_test.values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [75]:
%%time
reg = LinearSVR()
reg.fit(X_train, y_train)

CPU times: user 23.6 s, sys: 190 ms, total: 23.8 s
Wall time: 23.9 s




LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [76]:
pred = reg.predict(X_test)
mean_squared_error(y_test,pred)

78.91560084452502

In [79]:
%%time
reg_rbf = SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)

CPU times: user 21min 43s, sys: 2.74 s, total: 21min 46s
Wall time: 21min 49s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [80]:
pred_rbf = reg_rbf.predict(X_test)
mean_squared_error(y_test,pred_rbf)

15.811711263113542

In [77]:
%%time
d = features.shape[1]
D = 300
mu = np.zeros(d)
sigma = (1/(d*X_train.var()))*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 3.29 s, sys: 376 ms, total: 3.66 s
Wall time: 1.62 s


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [78]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

64.9681286622133

In [81]:
%%time
d = features.shape[1]
D = 1000
mu = np.zeros(d)
sigma = (1/(d*X_train.var()))*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 8.15 s, sys: 1.88 s, total: 10 s
Wall time: 6.44 s


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [82]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

42.77334182642288

In [83]:
%%time
d = features.shape[1]
D = 2000
mu = np.zeros(d)
sigma = (1/(d*X_train.var()))*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 15.1 s, sys: 4.12 s, total: 19.2 s
Wall time: 14.2 s


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [84]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

35.48678041435851

In [85]:
%%time
d = features.shape[1]
D = 3000
mu = np.zeros(d)
sigma = (1/(d*X_train.var()))*np.identity(d)
w = np.random.multivariate_normal(mu, sigma, D)
Z_train = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_train, w.T)), np.sin(np.dot(X_train, w.T))), axis=1)
Z_test = np.sqrt(1/D) * np.concatenate((np.cos(np.dot(X_test, w.T)), np.sin(np.dot(X_test, w.T))), axis=1)
reg_rff = LinearSVR()
reg_rff.fit(Z_train, y_train)

CPU times: user 22.4 s, sys: 12.3 s, total: 34.7 s
Wall time: 32.6 s


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [86]:
pred_rff = reg_rff.predict(Z_test)
mean_squared_error(y_test,pred_rff)

33.35117083225104