In [1]:
import numpy as np
import time

from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.gaussian_process import GaussianProcessRegressor 
from sklearn.gaussian_process.kernels import (
    RBF,
    WhiteKernel,
    RationalQuadratic,
    ExpSineSquared,
    ConstantKernel
)

from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

import pandas as pd

obs_df = pd.read_csv('CA_HomePrice_Obs.csv')
val_df = pd.read_csv('CA_HomePrice_Val.csv')
test_df = pd.read_csv('CA_HomePrice_Test.csv')

# display(df)
# b = df.iloc[:,0:1].values
# print(b)

# Specify a GP prior
# kernel = 1 * RBF(length_scale = 1)
# gp = GaussianProcessRegressor(kernel = kernel, optimizer = None)
# print("Initial Kernel\n%s" % kernel)

obs_df_array = np.asarray(obs_df)
obs_price = obs_df_array[:, 0]
obs_income = obs_df_array[:, 1]
obs_lat = obs_df_array[:, 7]
obs_long = np.absolute(obs_df_array[:, 8])
obs_dist_coast = np.absolute(obs_df_array[:, 9])
obs_br_density = np.absolute(obs_df_array[:, 14])
obs_avg_rms = np.absolute(obs_df_array[:, 15])
obs_dist_city = np.absolute(obs_df_array[:, 16])

val_df_array = np.asarray(val_df)
val_price = val_df_array[:, 0]
val_income = val_df_array[:, 1]
val_lat = val_df_array[:, 7]
val_long = np.absolute(val_df_array[:, 8])
val_dist_coast = np.absolute(val_df_array[:, 9])
val_br_density = np.absolute(val_df_array[:, 14])
val_avg_rms = np.absolute(val_df_array[:, 15])
val_dist_city = np.absolute(val_df_array[:, 16])

test_df_array = np.asarray(test_df)
test_price = test_df_array[:, 0]
test_income = test_df_array[:, 1]
test_lat = test_df_array[:, 7]
test_long = np.absolute(test_df_array[:, 8])
test_dist_coast = np.absolute(test_df_array[:, 9])
test_br_density = np.absolute(test_df_array[:, 14])
test_avg_rms = np.absolute(test_df_array[:, 15])
test_dist_city = np.absolute(test_df_array[:, 16])

X_train = np.asarray([obs_income, obs_lat, obs_long, obs_dist_coast, obs_br_density, obs_avg_rms, obs_dist_city]).T
Y_train = obs_price.T
x_test = np.asarray([test_income, test_lat, test_long, test_dist_coast, test_br_density, test_avg_rms, test_dist_city]).T

y_actual = test_price.T

# Kernel with parameters given in GPML book
k0 = 1 * RBF(length_scale = 1)
k1 = 66.0 ** 2 * RBF(length_scale=67.0)  # long term smooth rising trend
k2 = (
    2.4 ** 2
    * RBF(length_scale=90.0)
    * ExpSineSquared(length_scale=1.3, periodicity=1.0)
)  # seasonal component
# medium term irregularity
k3 = 0.66 ** 2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
k4 = 0.18 ** 2 * RBF(length_scale=0.134) + WhiteKernel(
    noise_level=0.19 ** 2
)  # noise terms

k5 = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=10.0)

# kernel_gpml = k1 + k2 + k3 + k4
kernel_gpml = k5

gp = GaussianProcessRegressor(
    kernel=kernel_gpml, optimizer = 'fmin_l_bfgs_b',alpha = 1.5, n_restarts_optimizer=10
)
stime = time.time()
gp.fit(X_train, Y_train)
print("Time for fitting - latitude + longitude: %.3f" % (time.time() - stime))

print("GPML kernel: %s" % gp.kernel_)
print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta))

y_pred, y_std = gp.predict(x_test, return_std=True)

mse_all3 = np.mean((y_actual - y_pred)**2)

# display(y_actual)
# display(y_pred)

print("MSE, all features 3: ")
display(mse_all3)



Time for fitting - latitude + longitude: 10838.776
GPML kernel: 31.6**2 * RBF(length_scale=0.914)
Log-marginal-likelihood: -399108338600.936
MSE, all features 3: 


36479063110.06433