# GPyTorch
#### Supposed to be great!

In [1]:
pip install gpytorch #you need to have torch installed 

Collecting gpytorch
  Downloading gpytorch-1.10-py3-none-any.whl (255 kB)
     -------------------------------------- 255.2/255.2 kB 5.2 MB/s eta 0:00:00
Collecting linear-operator>=0.4.0
  Downloading linear_operator-0.4.0-py3-none-any.whl (156 kB)
     -------------------------------------- 156.7/156.7 kB 4.7 MB/s eta 0:00:00
Collecting torch>=1.11
  Downloading torch-2.0.0-cp39-cp39-win_amd64.whl (172.3 MB)
     ------------------------------------- 172.3/172.3 MB 12.3 MB/s eta 0:00:00
Installing collected packages: torch, linear-operator, gpytorch
Successfully installed gpytorch-1.10 linear-operator-0.4.0 torch-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [59]:
import math
from math import floor

from tqdm.notebook import trange, tqdm

import torch
from torch.utils.data import TensorDataset, DataLoader

import gpytorch
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy

from matplotlib import pyplot as plt


import pandas as pd
import numpy as np
import os

import urllib.request

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
os.chdir("C:/Users/eminu/OneDrive/Desktop/Air-Quality-StatsLab/Data/Rawdata")

In [66]:
df = pd.read_csv("data_cleaned_final.csv")
df_subsampled = df.sample(frac = 0.001)

In [67]:
dummy_df = pd.get_dummies(df_subsampled["device"], prefix = "device")
df_subsampled = pd.concat([df_subsampled, dummy_df], axis = 1)
df_subsampled = df_subsampled.drop("device", axis = 1)

X_large = pd.concat([df_subsampled.filter(regex="device_*"), df_subsampled[["lat", "lon", "industrial", "major.road",
                  "res.road"]]], axis = 1).to_numpy()
X = df_subsampled[["lat","lon"]].to_numpy()
X = torch.from_numpy(X).double()

Y = df_subsampled[["pm25_detrended_15"]].to_numpy()
Y = torch.from_numpy(Y).double()

In [74]:
train_n = int(floor(0.8 * len(X)))
train_x = X[:train_n, :].contiguous()
train_y = Y[:train_n].contiguous()

test_x = X[train_n:, :].contiguous()
test_y = Y[train_n:].contiguous()

if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

In [82]:
train_y.size(0), train_x.size(0)

(1619, 1619)

In [89]:
class GPModel(ApproximateGP):
    def __init__(self, inducing_points, likelihood, k=256, training_batch_size=256):

        m, d = inducing_points.shape
        self.m = m
        self.k = k

        variational_distribution = gpytorch.variational.MeanFieldVariationalDistribution(m)

        if torch.cuda.is_available():
            inducing_points = inducing_points.cuda()

        variational_strategy = NNVariationalStrategy(self, inducing_points, variational_distribution, k=k,
                                                     training_batch_size=training_batch_size)
        super(GPModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = gpytorch.kernels.MaternKernel(nu=0.5, ard_num_dims=d)

        self.likelihood = likelihood

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    def __call__(self, x, prior=False, **kwargs):
        if x is not None:
            if x.dim() == 1:
                x = x.unsqueeze(-1)
        return self.variational_strategy(x=x, prior=False, **kwargs)

    
k = 32
training_batch_size = 32
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPModel(inducing_points=train_x, likelihood=likelihood, k=k, training_batch_size=training_batch_size)

if torch.cuda.is_available():
    likelihood = likelihood.cuda()
    model = model.cuda()

In [57]:
#train_dataset = TensorDataset(train_x, train_y)
#train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

#test_dataset = TensorDataset(test_x, test_y)
#test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [91]:
num_epochs = 20
num_batches = model.variational_strategy._total_training_batches


model.train()
likelihood.train()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Our loss object. We're using the Variational ELBO
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))


epochs_iter = tqdm(range(num_epochs), desc="Epoch")
for epoch in epochs_iter:
    minibatch_iter = tqdm(range(num_batches), desc="Minibatch", leave=False)

    for i in minibatch_iter:
        optimizer.zero_grad()
        output = model(x=None)
        
        # Obtain the indices for mini-batch data
        current_training_indices = model.variational_strategy.current_training_indices
        
        # Obtain the y_batch using indices. It is important to keep the same order of train_x and train_y
        y_batch = train_y[...,current_training_indices]
        if torch.cuda.is_available():
            y_batch = y_batch.cuda()
        loss = -mll(output, y_batch)
        minibatch_iter.set_postfix(loss=loss.item())
        loss.backward()
        optimizer.step()

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Minibatch:   0%|          | 0/51 [00:00<?, ?it/s]

IndexError: index 701 is out of bounds for dimension 0 with size 1

In [93]:
# prepare for dataset
train_dataset = TensorDataset(train_x, train_y)
# this batch-size does not need to match the training-batch-size specified above
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

num_epochs = 20

model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Our loss object. We're using the VariationalELBO
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))


epochs_iter = tqdm(range(num_epochs), desc="Epoch")
for i in epochs_iter:
    # Within each iteration, we will go over each minibatch of data
    minibatch_iter = tqdm(train_loader, desc="Minibatch", leave=False)
    for x_batch, y_batch in minibatch_iter:
        optimizer.zero_grad()
        output = model(x_batch)
        loss = -mll(output, y_batch)
        minibatch_iter.set_postfix(loss=loss.item())
        loss.backward()
        optimizer.step()

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Minibatch:   0%|          | 0/13 [00:00<?, ?it/s]

RuntimeError: a Tensor with 128 elements cannot be converted to Scalar

In [None]:
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
model.eval()
likelihood.eval()
means = torch.tensor([0.])
test_mse = 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds = model(x_batch)
        means = torch.cat([means, preds.mean.cpu()])

        diff = torch.pow(preds.mean - y_batch, 2)
        diff = diff.sum(dim=-1) / test_x.size(0) # sum over bsz and scaling
        diff = diff.mean() # average over likelihood_nsamples
        test_mse += diff
means = means[1:]
test_rmse = test_mse.sqrt().item()