In [33]:
import numpy as np


# Pin NumPy's legacy global RNG up front so that a fresh
# Restart & Run All replays the same sequence of random draws.
SEED = 42
np.random.seed(SEED)

In [46]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [35]:
# Load the California housing data as plain arrays: `x` is the feature
# matrix (`dataset.data`), `y` the regression target (`dataset.target`).
dataset = datasets.fetch_california_housing()
x, y = dataset.data, dataset.target

# Hold out 30% of the rows for testing.  The split is seeded explicitly so
# that re-running this cell on its own (or out of order) reproduces the same
# partition instead of depending on how far the global NumPy RNG has already
# advanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [36]:
# Standardise the features.  The scaling statistics are learned from the
# training rows only and then applied unchanged to both partitions, so the
# test rows never influence the fit.
scaler = StandardScaler().fit(x_train)

# Both right-hand sides are evaluated before either name is rebound.
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

### Linear Regression

In [37]:
# Baseline model: ordinary least squares on the standardised features.
regr = LinearRegression().fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics: mean squared error of the predictions and the
# estimator's own `score` on the test partition (reported as R2 below).
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.5957702326061664
MSE: 0.5305677824766752


### Polynomial Regression

In [38]:
# Expand the standardised features with all degree-2 polynomial terms.
# The expansion is fitted on the training rows and reused for the test rows.
poly = PolynomialFeatures(degree=2)
x_train_ = poly.fit_transform(x_train)
x_test_ = poly.transform(x_test)

# A plain linear model on the expanded features.
regr = LinearRegression().fit(x_train_, y_train)
y_pred = regr.predict(x_test_)

# Held-out metrics, computed on the expanded test features.
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test_, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.6533650021228887
MSE: 0.4549723374856818


### GradientBoosting Regression

In [39]:
# Gradient-boosted trees with 100 boosting stages.  `random_state` is fixed
# so that re-running this cell in isolation yields the same fitted model and
# the same metrics, independent of the state of the global NumPy RNG.
regr = GradientBoostingRegressor(n_estimators=100, random_state=42)
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
r2 = regr.score(x_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.7803457772969118
MSE: 0.2883049771484818


### RandomForest Regression

In [40]:
# Random forest of 100 trees.  The forest's sampling is seeded via
# `random_state` so that the fitted model (and therefore the metrics below)
# is the same on every run of this cell, regardless of execution order.
regr = RandomForestRegressor(n_estimators=100, random_state=42)
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
r2 = regr.score(x_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.8045428094524978
MSE: 0.25654540195421344


### Support Vector Regression (SVR)

In [41]:
# Support vector regression with a linear kernel; all other
# hyper-parameters are left at the library defaults.
regr = SVR(kernel="linear").fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.5776795610787477
MSE: 0.5543125144336986


In [42]:
# Same SVR setup as the other kernel cells, this time with the RBF kernel.
regr = SVR(kernel="rbf").fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.7336828556053009
MSE: 0.34955193341649854


In [43]:
# SVR with a cubic polynomial kernel and a regularisation strength of
# C=0.7 (the only cell in this section that sets C explicitly).
regr = SVR(
    kernel="poly",
    degree=3,
    C=0.7,
).fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: 0.1634301710382723
MSE: 1.0980314534992512


In [44]:
# SVR with the sigmoid kernel, library defaults otherwise.
# NOTE(review): the recorded run of this cell printed a very large negative
# R2, so this configuration looks unusable as-is on this data -- confirm
# whether it is kept only for comparison.
regr = SVR(kernel="sigmoid").fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out metrics on the test partition.
mse = mean_squared_error(y_test, y_pred)
r2 = regr.score(x_test, y_test)

print(f"R2: {r2}")
print(f"MSE: {mse}")

R2: -8391.053840757677
MSE: 11014.907252927826


### PCA + GridSearchCV

In [50]:
# Fit PCA on the training features only, asking for a 0.90 explained-variance
# target rather than a fixed component count, then project both partitions
# with that same fitted transform.
pca = PCA(n_components=0.90).fit(x_train)
x_train_transformed = pca.transform(x_train)
x_test_transformed = pca.transform(x_test)

# Report the reduction: feature count going in, components retained, and
# the share of variance those components explain.
print(f"Dimensions before: {x_train.shape[1]}")
print(f"Dimensions to keep: {pca.n_components_}")
print(f"Explained Variance: {round(sum(pca.explained_variance_ratio_), 8)}")

Dimensions before: 8
Dimensions to keep: 5
Explained Variance: 0.90176375


In [51]:
# Candidate SVR kernels to compare on the PCA-reduced training features.
parameters = {
    "kernel": ["rbf", "poly", "linear", "sigmoid"],
}

# Exhaustive search over the grid with 3-fold cross-validation, using all
# available cores.
clf = SVR()
grid_cv = GridSearchCV(estimator=clf, param_grid=parameters, cv=3, n_jobs=-1)

# Bind the return value to a throwaway name so the cell shows no output.
_ = grid_cv.fit(x_train_transformed, y_train)

In [52]:
# Winning parameter combination first, then one line per candidate.
print(f"Best parameters set found on development set: {grid_cv.best_params_}\n")

cv_results = grid_cv.cv_results_
means = cv_results["mean_test_score"]
stds = cv_results["std_test_score"]

# Each line shows the mean cross-validated score and a +/- band of two
# standard deviations across the folds.
for mean, std, params in zip(means, stds, cv_results["params"]):
    print(f"{mean:.3f} (+/-{2*std:.3f}) for {params}")

Best parameters set found on development set: {'kernel': 'rbf'}

0.557 (+/-0.007) for {'kernel': 'rbf'}
-166.779 (+/-470.352) for {'kernel': 'poly'}
0.188 (+/-0.646) for {'kernel': 'linear'}
-5011.449 (+/-1636.225) for {'kernel': 'sigmoid'}
