In [58]:
import torch
import gpytorch 
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import multivariate_normal
from scipy.spatial.distance import cdist
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Cap CPU threading at two threads.
# The environment variable is kept for any library or subprocess that is loaded
# after this point. NOTE(review): OpenMP/MKL runtimes typically read it when
# they are initialised, which has usually already happened once torch and numpy
# are imported above, so it alone may not affect the current process.
os.environ["OMP_NUM_THREADS"] = "2"
# Apply the same cap explicitly to the already-imported torch runtime.
torch.set_num_threads(2)

def data_import(path1, delimiter):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    path1 :
        Path (or file-like object) handed to ``pandas.read_csv``.
    delimiter : str
        Field separator forwarded to ``pandas.read_csv``. The sentinel string
        ``'none'`` means "pass no delimiter", i.e. use the pandas default.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    # Only forward a delimiter when the caller asked for a specific one.
    read_options = {} if delimiter == 'none' else {'delimiter': delimiter}
    return pd.read_csv(path1, **read_options)


# Maps the user-facing frequency names to pandas offset aliases.
# '10min' and 'D' are used instead of the legacy spellings '10T' and 'd'.
_RESAMPLE_RULES = {
    'ten_minute': '10min',
    '4_hourly': '4h',
    'hourly': 'h',
    'twelve_hourly': '12h',
    'daily': 'D',
}


def data_frequency(file1, desired_frequency: str):
    """Down-sample ``file1`` to a fixed time step by averaging each bin.

    Parameters
    ----------
    file1 : pandas.DataFrame
        Must contain a datetime-like ``'Time(UTC)'`` column; every other
        column is averaged with ``mean()``.
    desired_frequency : str
        One of ``'ten_minute'``, ``'hourly'``, ``'4_hourly'``,
        ``'twelve_hourly'`` or ``'daily'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per time bin and ``'Time(UTC)'`` restored as
        the first column. Bins that contain no rows hold NaN.

    Raises
    ------
    ValueError
        If ``desired_frequency`` is not one of the supported names. Previously
        an unknown name skipped the resampling and returned a frame whose
        integer index had been coerced to datetimes.
    """
    try:
        rule = _RESAMPLE_RULES[desired_frequency]
    except KeyError:
        raise ValueError(
            f"Unknown desired_frequency {desired_frequency!r}; "
            f"expected one of {sorted(_RESAMPLE_RULES)}"
        ) from None

    resampled = file1.resample(rule, on='Time(UTC)').mean()

    # The resampled index is already a DatetimeIndex named 'Time(UTC)', so
    # resetting it yields the 'Time(UTC)' column directly.
    return resampled.reset_index()

def time_to_sincos(df, period=365.25):
    """Append cyclic ``Time_sin`` / ``Time_cos`` columns to ``df`` in place.

    The angle of each row is ``2 * pi * i / period`` where ``i`` is the row's
    position (0, 1, 2, ...), not its timestamp or index label.

    NOTE(review): with the default ``period`` of 365.25 this encodes an annual
    cycle only if consecutive rows are one day apart - confirm against the
    sampling interval of the data this is applied to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    period : float, optional
        Number of rows that make up one full cycle. Defaults to 365.25.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns.
    """
    # Vectorised equivalent of looping over range(len(df)).
    angles = 2 * np.pi * np.arange(len(df)) / period

    # Arrays are assigned positionally, so a non-contiguous index is fine.
    df['Time_sin'] = np.sin(angles)
    df['Time_cos'] = np.cos(angles)

    return df







In [59]:
# Load the cleaned S2 and M1 site exports.
s2_site_data = data_import("C:\\Users\\396760\\lanl\\data\\ARMSAILS2_cleaned.csv", 'none')
m1_site_data = data_import("C:\\Users\\396760\\lanl\\data\\ARMSAILM1_cleaned.csv", 'none')

# For each site: discard the pressure column, then drop every row that still
# contains at least one NaN. Both steps modify the loaded frame in place.
for site_frame in (s2_site_data, m1_site_data):
    site_frame.drop(columns=['sample_pres_mmHg'], inplace=True)
    site_frame.dropna(inplace=True)


In [60]:
# Add the Time_sin / Time_cos columns to both site frames.
s2_site_data = time_to_sincos(s2_site_data)
m1_site_data = time_to_sincos(m1_site_data)

# For each site: drop any row containing a NaN (in place), then parse the
# 'Time(UTC)' column into datetime values.
for site_frame in (s2_site_data, m1_site_data):
    site_frame.dropna(inplace=True)
    site_frame['Time(UTC)'] = pd.to_datetime(site_frame['Time(UTC)'])

# The string literal below is a disabled NaN check. Because it is the last
# expression in the cell, the notebook echoes it as the cell's output.
""" #ensure the nan's are removed
print(s2_site_data.isnull().sum())
print(m1_site_data.isnull().sum())
 """

" #ensure the nan's are removed\nprint(s2_site_data.isnull().sum())\nprint(m1_site_data.isnull().sum())\n "

In [61]:
"the data was collected over a 12 month span. partition the data into monthly sets. to partition into different sets, use variables in above cell"

# Used below to build s2_month<i> / m1_month<i>, where <i> is the month number.
def partition_data(data, start_month, end_month):
    """Return the rows of ``data`` whose month lies in an inclusive range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``'Time(UTC)'`` column.
    start_month, end_month : int
        Calendar month numbers (1-12), both inclusive. When ``start_month`` is
        greater than ``end_month`` the range wraps around the year end, e.g.
        ``(11, 2)`` selects November through February.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels. Only the month is
        compared, so rows from different years are not distinguished.
    """
    months = data['Time(UTC)'].dt.month
    if start_month <= end_month:
        selected = (months >= start_month) & (months <= end_month)
    else:
        # Wrap-around range: the original comparison could never match here.
        selected = (months >= start_month) | (months <= end_month)
    return data.loc[selected]

# Build one frame per site and calendar month with the 'Time(UTC)' column
# removed, and publish each as a module-level variable named
# s2_month1 ... s2_month12 and m1_month1 ... m1_month12.
for month in range(1, 13):
    for site, site_frame in (('s2', s2_site_data), ('m1', m1_site_data)):
        monthly = partition_data(site_frame, month, month)
        globals()[site + '_month' + str(month)] = monthly.drop(columns=['Time(UTC)'])

# The string literals below are disabled debugging helpers; as bare expression
# statements they have no effect when the cell runs.

# Disabled: print the shape of a data set.
""" def check_dim(data):
    print(data.shape) """

# Disabled: mean of the 'pm_1_ug_per_m3' column.
""" #access pm1 concentration collumn, calculate mean and print
def mean_pm1(data):
    pm1 = data['pm_1_ug_per_m3']
    mean = np.mean(pm1)
    return mean """

# Disabled: per-month means for both sites.
# NOTE(review): if re-enabled, s2_totalmean / m1_totalmean are overwritten on
# every iteration rather than accumulated, so the final "/12" divides only the
# last month's mean.
""" for i in range(1,13):
    s2_totalmean = mean_pm1(globals()['s2_month' + str(i)])
    m1_totalmean = mean_pm1(globals()['m1_month' + str(i)])

    print(mean_pm1(globals()['s2_month' + str(i)]))

    print(mean_pm1(globals()['m1_month' + str(i)]))

print('avg mean here')
print(s2_totalmean/12)
print(m1_totalmean/12) """

# Disabled: shape check of every monthly frame.
""" for i in range(1, 13):
    check_dim(globals()['s2_month' + str(i)])
    check_dim(globals()['m1_month' + str(i)]) """

# gpytorch requires data to be in tensor format
def to_tensor(data, dtype=None):
    """Convert a pandas object to a ``torch.Tensor`` (copying the values).

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Source of the values.
    dtype : torch.dtype, optional
        Desired tensor dtype. The default ``None`` keeps the dtype inferred
        from the data; pass e.g. ``torch.float32`` to request single precision.

    Returns
    -------
    torch.Tensor
        Tensor with the same shape as ``data``.
    """
    return torch.tensor(data.to_numpy(), dtype=dtype)

# Replace every monthly frame (s2_month<i>, m1_month<i>) with its tensor form.
for month in range(1, 13):
    for site in ('s2', 'm1'):
        var_name = site + '_month' + str(month)
        globals()[var_name] = to_tensor(globals()[var_name])

# Disabled shape check. As the last expression in the cell, this string literal
# is what the notebook echoes as the cell's output.
""" 
for i in range(1, 13):
    check_dim(globals()['s2_month' + str(i)])
    check_dim(globals()['m1_month' + str(i)])   
 """



" \nfor i in range(1, 13):\n    check_dim(globals()['s2_month' + str(i)])\n    check_dim(globals()['m1_month' + str(i)])   \n "

In [64]:
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP regression model with a zero mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Store the training data and likelihood and build the mean/kernel.

        Args:
            train_x: training inputs, forwarded to ``ExactGP``.
            train_y: training targets, forwarded to ``ExactGP``.
            likelihood: likelihood object, forwarded to ``ExactGP``.
        """
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ZeroMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        # ScaleKernel wraps the RBF kernel as its base_kernel.
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal built from the mean and kernel at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )


# NOTE(review): train_x, train_y and test_x are not defined in any cell visible
# here; they must already exist in the kernel when this cell runs.

# Initialize Gaussian likelihood and the exact GP on the training data
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x, train_y, likelihood)
training_iter = 50

# Find optimal model hyperparameters: put both modules in training mode
model.train()
likelihood.train()

# Use the adam optimizer, includes GaussianLikelihood parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# Set our loss as the negative log GP marginal likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

for i in range(training_iter):
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(train_x)
    # Calc loss and backprop gradients
    loss = -mll(output, train_y)
    loss.backward()
    # Report every 10th iteration, using the values from before this step.
    # NOTE(review): the label says "squared lengthscale" but the value printed
    # is the kernel's `lengthscale` as-is - confirm which one is intended.
    if i % 10 == 0:
        print(f'Iter {i+1:d}/{training_iter:d} - Loss: {loss.item():.3f} '
              f'squared lengthscale: '
              f'{model.covar_module.base_kernel.lengthscale.item():.3f} '
              f'noise variance: {model.likelihood.noise.item():.3f}')
    optimizer.step()

# as_tensor converts arrays/lists and passes an existing tensor through as-is,
# avoiding the copy-construct warning torch.tensor() emits for tensor input.
test_x = torch.as_tensor(test_x)

# Switch to evaluation mode for prediction
model.eval()
likelihood.eval()

# Prediction needs no autograd graph; without no_grad the predictive
# distribution tracks gradients, which wastes memory and prevents calling
# .numpy() on its mean/variance.
with torch.no_grad():
    observed_pred = likelihood(model(test_x))

In [67]:
# Ensure that the 'Time(UTC)' column is in datetime format before resampling
s2_site_data['Time(UTC)'] = pd.to_datetime(s2_site_data['Time(UTC)'])

# Resample the data to daily frequency (each column averaged per day)
s2_daily = data_frequency(s2_site_data, 'daily')

# Set the target variable and feature variables
target_var = 'pm_1_ug_per_m3'
feature_vars = ['Time_sin', 'Time_cos', 'sample_rh_pct', 'sample_temp_C']

# NOTE(review): X and y were used below without being defined in any visible
# cell. They are built here from the columns declared above - confirm this
# matches the inputs that produced the recorded output.
# Days with no observations come out of the daily mean as NaN rows; drop them
# so no NaN reaches the GP.
model_frame = s2_daily.dropna(subset=feature_vars + [target_var])
X = model_frame[feature_vars]
y = model_frame[target_var]

# Hold out 20% of the days for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to float32 tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# NOTE(review): train_gp_model and predict_gp_model are not defined in any
# visible cell; they must already exist in the kernel.
trained_model, likelihood = train_gp_model(X_train_tensor, y_train_tensor)
# Despite its name, observed_pred_train holds the predictions for X_test_tensor.
observed_pred_train = predict_gp_model(trained_model,likelihood, X_test_tensor)


Iter 1/200 - Loss: 20.129207611083984
Iter 11/200 - Loss: 17.84337043762207
Iter 21/200 - Loss: 15.242698669433594
Iter 31/200 - Loss: 13.218689918518066
Iter 41/200 - Loss: 11.792628288269043
Iter 51/200 - Loss: 10.858305931091309
Iter 61/200 - Loss: 10.149012565612793
Iter 71/200 - Loss: 9.597676277160645
Iter 81/200 - Loss: 9.166234016418457
Iter 91/200 - Loss: 8.80482292175293
Iter 101/200 - Loss: 8.498921394348145
Iter 111/200 - Loss: 8.235713958740234
Iter 121/200 - Loss: 8.010125160217285
Iter 131/200 - Loss: 7.810035228729248
Iter 141/200 - Loss: 7.627243518829346
Iter 151/200 - Loss: 7.463432312011719
Iter 161/200 - Loss: 7.323135852813721
Iter 171/200 - Loss: 7.1839680671691895
Iter 181/200 - Loss: 7.06618595123291
Iter 191/200 - Loss: 6.9524641036987305
Final noise variance: 5.483
