In [5]:
%matplotlib inline

In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

print(__doc__)

#read the csv file into dataframe
dataset = pd.read_csv("GPR/LALearnOzoneGPR.csv")
dataset.head()




Automatically created module for IPython interactive environment


Unnamed: 0,DMax Ozone,Year,Month,Day,OZONE-WEIGHT,Flag,Site Name,Lat,Lng,Date
0,0.037,2016,1,1,17,0,Los Angeles-North Main Street,34.06659,-118.22688,1/1/2016
1,0.037,2016,1,1,17,0,Los Angeles-North Main Street,34.06659,-118.22688,1/1/2016
2,0.03,2016,1,7,13,0,Los Angeles-North Main Street,34.06659,-118.22688,1/7/2016
3,0.017,2016,1,13,17,0,Los Angeles-North Main Street,34.06659,-118.22688,1/13/2016
4,0.017,2016,1,13,17,0,Los Angeles-North Main Street,34.06659,-118.22688,1/13/2016


In [8]:
#target
y = dataset.iloc[:,0]

#dataset
X = dataset.iloc[:,0:6]
X.columns
#X.drop(['Site Name'],axis=1)
X

Unnamed: 0,DMax Ozone,Year,Month,Day,OZONE-WEIGHT,Flag
0,0.037,2016,1,1,17,0
1,0.037,2016,1,1,17,0
2,0.030,2016,1,7,13,0
3,0.017,2016,1,13,17,0
4,0.017,2016,1,13,17,0
...,...,...,...,...,...,...
538,0.027,2020,12,5,17,0
539,0.037,2020,12,11,18,0
540,0.031,2020,12,17,17,0
541,0.022,2020,12,23,17,0


In [9]:
def Ozone_prep():

    months = []
    ppmv_sums = []
    counts = []
    
    m = X['Month']
    year = X['Year']
    month_float = year + (m - 1) / 12
    ppmvs = dataset.iloc[:,0]

    for month, ppmv in zip(month_float, ppmvs):
            if not months or month != months[-1]:
                months.append(month)
                ppmv_sums.append(ppmv)
                counts.append(1)
            else:
                # aggregate monthly sum to produce average
                ppmv_sums[-1] += ppmv
                counts[-1] += 1
            
    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
 
X, y = Ozone_prep()  

train_size = 25
# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure()
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
plt.ylabel("Class 1 probability")
plt.xlim(0, 5)
plt.ylim(-0.25, 1.5)
plt.legend(loc="best")

# Plot LML landscape
plt.figure()
theta0 = np.logspace(0, 8, 30)
theta1 = np.logspace(-1, 1, 29)
Theta0, Theta1 = np.meshgrid(theta0, theta1)
LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
LML = np.array(LML).T
plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1],
         'ko', zorder=10)
plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1],
         'ko', zorder=10)
plt.pcolor(Theta0, Theta1, LML)
plt.xscale("log")
plt.yscale("log")
plt.colorbar()
plt.xlabel("Magnitude")
plt.ylabel("Length-scale")
plt.title("Log-marginal-likelihood")

plt.show()

ValueError: Unknown label type: (array([0.0262    , 0.0395    , 0.0448    , 0.05      , 0.0415    ,
       0.05      , 0.0525    , 0.04975   , 0.0475    , 0.0416    ,
       0.0354    , 0.02666667, 0.0246    , 0.03266667, 0.04016667,
       0.0398    , 0.0364    , 0.0366    , 0.0374    , 0.036     ,
       0.035     , 0.031     , 0.0295    , 0.0212    , 0.0396    ]),)