In [None]:
%matplotlib inline
import skmob
import pandas as pd
import skmob.measures.individual as ind_measure
import torch
import gpytorch
from gpytorch.kernels import RQKernel as RQ, RBFKernel as SE, \
PeriodicKernel as PER, ScaleKernel, LinearKernel as LIN, MaternKernel as MAT, \
SpectralMixtureKernel as SMK, PiecewisePolynomialKernel as PPK, CylindricalKernel as CYL
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from gpytorch.constraints import Interval
import time

# Import intra-package scripts
import utils.helper_func as helper_func
import utils.GP as GP
import mobileDataToolkit.analysis as analysis
import mobileDataToolkit.preprocessing_v2 as preprocessing
import mobileDataToolkit.methods as methods
import mobileDataToolkit.metrics as metrics

import warnings
warnings.filterwarnings('ignore')

## Reading Data

In [None]:
# Placeholder for a custom training index.
# NOTE(review): nothing in this notebook reads this variable; the train/test split
# call further down passes training_index=None as a literal instead.
training_index = None

In [None]:
import os

# Location of the preprocessed Seattle sample. The default is the original author's
# local path; set the GPR_DATA_PATH environment variable to point at your own copy
# instead of editing this cell.
file_path = os.environ.get(
    "GPR_DATA_PATH",
    "C:\\Users\\ekino\\OneDrive - UW\\GPR\\Data\\seattle_2000_all_obs_preprocessed_sampled.csv",
)

# Load the data with a fixed random_state, then select the first UID returned by
# unique() as the user to analyse.
d1 = preprocessing.dp_MultiTrip(file_path=file_path, random_state=8765765)
d1.chooseUser(d1.data.UID.unique()[0])

In [None]:
d1.data.columns

## Adding "DateTime" column, subsetting by a timeframe

In [None]:
# Parse the 'datetime' column into pandas timestamps.
d1.data['datetime'] = pd.to_datetime(d1.data['datetime'])

# Restrict the data to the study window (start and end given as strings).
# NOTE(review): whether the end bound is inclusive depends on subsetByTime — confirm.
d1.subsetByTime('01/18/2020 00:00', '02/07/2020 00:00')

# Project-specific preprocessing. Presumably adds the derived columns used later
# (e.g. 'Date_Time', 'unix_min') — verify in mobileDataToolkit.preprocessing_v2.
d1.Multi_Trip_Preprocess()

In [None]:
d1.data.columns

#### Add distance and velocity columns, determine home location

In [None]:
# NOTE(review): the return values are ignored, so addDist/addVel presumably add the
# 'dist' and 'vel' columns (inspected in the next cells) to d1.data in place —
# confirm in utils.helper_func.
helper_func.addDist(d1.data)
helper_func.addVel(d1.data)
# Estimated home location, unpacked as a (latitude, longitude) pair.
home_lat, home_lon = methods.homeLoc(d1.data)

In [None]:
d1.data['vel'].describe()

In [None]:
d1.data['dist'].describe()

Choose 45.0 as the diff we want to achieve

## Further Preprocessing: Filtering, compression, stay location detection, and stay location clustering

In [None]:
%%time
# Wrap the data in a scikit-mobility TrajDataFrame.
# NOTE(review): the timestamp is read from 'Date_Time', while an earlier cell parses
# a column named 'datetime' — confirm that 'Date_Time' exists at this point.
tdf = skmob.TrajDataFrame(d1.data, latitude='orig_lat', longitude='orig_long', datetime='Date_Time')
# Noise filtering with a 400 km/h speed ceiling. include_loops=True presumably also
# enables skmob's loop filtering — see the skmob filtering documentation.
f_tdf = skmob.preprocessing.filtering.filter(tdf, max_speed_kmh=400, include_loops=True)
# Spatial compression of the filtered trajectory with a 0.1 km radius.
fc_tdf = skmob.preprocessing.compression.compress(f_tdf, spatial_radius_km=0.1) # vary the spatial radius to see how prediction changes

In [None]:
n_deleted_points = len(tdf) - len(f_tdf) # number of deleted points during filtering
print(n_deleted_points)

In [None]:
n_deleted_points = len(f_tdf) - len(fc_tdf) # number of deleted points during compression
n_deleted_points / len(d1.data)

In [None]:
fc_tdf.columns

## Train/Test Split

In [None]:
# Positional reorder: the column currently at position 15 is moved to position 8;
# every other column keeps its relative order.
# NOTE(review): the previous comment described this as moving unix_min (7th index)
# before SaM (15th index), which does not match the slices — confirm which columns
# sit at positions 8 and 15 after filtering/compression.
cols = list(fc_tdf.columns)
moved_col = cols[15:16]  # a slice, so this is empty (no-op) when there are < 16 columns
cols = cols[:8] + moved_col + cols[8:15] + cols[16:]
fc_tdf = fc_tdf[cols]

fc_tdf.columns

In [None]:
#d1.Multi_Trip_TrainTestSplit('2020-02-01 00:00', '2020-02-07 00:00')
# Wrap the filtered/compressed trajectory so the split runs on fc_tdf rather than d1.
df = preprocessing.dp_MultiTrip(data=fc_tdf)

# Test period is 2020-02-01 to 2020-02-07; the 'lat' and 'lng' columns are passed as
# the coordinate columns.
# NOTE(review): inputend='day_6' presumably names the last input-feature column —
# confirm against Multi_Trip_TrainTestSplit.
df.Multi_Trip_TrainTestSplit(test_start_date='2020-02-01 00:00', test_end_date='2020-02-07 00:00', lat='lat', long='lng', inputend='day_6', training_index=None)

In [None]:
# First feature column of each split, cast to integers. These arrays are reused later
# as the index of the benchmark series.
unix_min_tr = np.asarray(df.X_train[:, 0]).astype(int)
unix_min_te = np.asarray(df.X_test[:, 0]).astype(int)

### Normalize data (standardization: zero mean, unit variance)

In [None]:
# Standardise the targets: the scaler is fitted on the training targets only and the
# same fitted scaler is then applied to the test targets. Results are float64 tensors.
scaler = StandardScaler().fit(df.y_train)
y_train_scaled = torch.tensor(scaler.transform(df.y_train).astype(np.float64))
y_test_scaled = torch.tensor(scaler.transform(df.y_test).astype(np.float64))

# Number of input feature columns.
n_dims = df.X_train.shape[1]

In [None]:
# Covariance = sum of two scaled products, each an ARD rational-quadratic kernel over
# all input dimensions multiplied by a periodic kernel acting on dimension 0 only.
periodic_component_1 = ScaleKernel(RQ(ard_num_dims=n_dims) * PER(active_dims=torch.tensor([0])))
periodic_component_2 = ScaleKernel(RQ(ard_num_dims=n_dims) * PER(active_dims=torch.tensor([0])))

# Regressor on the raw inputs and the standardised targets.
model = GP.MTGPRegressor(df.X_train, y_train_scaled, periodic_component_1 + periodic_component_2)

In [None]:
init_period_len_1 = 60*8 # 8 hours
init_period_len_2 = 60*24 # 24 hours

# Set initial lengthscale guess as half the average length of gap in training set.
# The previous expression filtered df.data with unix_min.isin(set(unix_min)), which
# keeps every row, so the average was taken over the test period as well.
# unix_min_tr (first column of X_train) restricts it to the training timestamps.
init_lengthscale = pd.Series(unix_min_tr).diff().mean() / 2
# Dimension 0 (time) gets the data-driven guess; the remaining ARD dimensions start at 1.
initializations = np.ones(n_dims - 1)
initializations = np.insert(initializations, 0, init_lengthscale)

# Each summed component is ScaleKernel(RQ * PER): base_kernel.kernels[0] is the RQ
# factor (lengthscales), base_kernel.kernels[1] the periodic factor (period length).
summed_kernels = model.covar_module.data_covar_module.kernels
for kernel_idx, init_period_len in enumerate((init_period_len_1, init_period_len_2)):
    product_kernel = summed_kernels[kernel_idx].base_kernel
    product_kernel.kernels[0].lengthscale = initializations
    product_kernel.kernels[1].period_length = init_period_len

In [None]:
%%time
ls, mll = GP.training(model, df.X_train, y_train_scaled, n_epochs=100, lr=0.1, verbose=True)

In [None]:
# Plot the loss curve
plt.plot(ls)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

## Temporal Occupancy

In [None]:
test_TO = analysis.tempOcp(df.test, bin_len=20)
test_TO

## Defining the model

In [None]:
# float32 tensor copies of the unscaled train/test inputs and targets.
X_train = torch.tensor(df.X_train, dtype=torch.float32)
X_test = torch.tensor(df.X_test, dtype=torch.float32)
y_train = torch.tensor(df.y_train, dtype=torch.float32)
y_test = torch.tensor(df.y_test, dtype=torch.float32)

In [None]:
# Number of input feature columns.
n_dims = X_train.shape[1]

# Covariance = sum of two scaled products, each an ARD rational-quadratic kernel over
# all input dimensions multiplied by a periodic kernel acting on dimension 0 only.
periodic_component_1 = ScaleKernel(RQ(ard_num_dims=n_dims) * PER(active_dims=torch.tensor([0])))
periodic_component_2 = ScaleKernel(RQ(ard_num_dims=n_dims) * PER(active_dims=torch.tensor([0])))

# Regressor on the float32 inputs and the unscaled targets.
model = GP.MTGPRegressor(X_train, y_train, periodic_component_1 + periodic_component_2)

### Initializing the model (without normalization)

In [None]:
init_period_len_1 = 60*8 # 8 hours
init_period_len_2 = 60*24 # 24 hours

# Set initial lengthscale guess as half the average length of gap in training set.
# The previous expression filtered df.data with unix_min.isin(set(unix_min)), which
# keeps every row, so the average was taken over the test period as well.
# unix_min_tr (first column of X_train) restricts it to the training timestamps.
init_lengthscale = pd.Series(unix_min_tr).diff().mean() / 2
# Dimension 0 (time) gets the data-driven guess; the remaining ARD dimensions start at 1.
initializations = np.ones(n_dims - 1)
initializations = np.insert(initializations, 0, init_lengthscale)

# Each summed component is ScaleKernel(RQ * PER): base_kernel.kernels[0] is the RQ
# factor (lengthscales), base_kernel.kernels[1] the periodic factor (period length).
summed_kernels = model.covar_module.data_covar_module.kernels
for kernel_idx, init_period_len in enumerate((init_period_len_1, init_period_len_2)):
    product_kernel = summed_kernels[kernel_idx].base_kernel
    product_kernel.kernels[0].lengthscale = initializations
    product_kernel.kernels[1].period_length = init_period_len

In [None]:
%%time
ls, mll = GP.training(model, X_train, y_train, n_epochs=100, lr=0.1, verbose=True)

In [None]:
# Plot the loss curve
plt.plot(ls)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Bayesian information criterion on the training data: -2 * log-likelihood + m * ln(N).
N = X_train.shape[0]

# NOTE(review): multiplying by N assumes mll(...) returns the marginal log-likelihood
# averaged over the N training rows — confirm how GP.training builds `mll`, since a
# multitask output may be normalised by a different count.
with torch.no_grad():
    log_ll = mll(model(X_train), y_train) * N

# m = total number of scalar hyperparameters.
m = sum(param.numel() for param in model.hyperparameters())
bic = -2 * log_ll + m * np.log(N)

In [None]:
predictions, mean = model.predict(X_test)

In [None]:
# `date_train` / `date_test` were not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Build them from the time stamps of each split.
# NOTE(review): assumes unix_min counts minutes since the Unix epoch and that
# plot_preds accepts datetime-like x values — confirm against GP.MTGPRegressor.
# The int64 cast avoids overflow where the platform default integer is 32-bit.
date_train = pd.to_datetime(unix_min_tr.astype('int64') * 60, unit='s')
date_test = pd.to_datetime(unix_min_te.astype('int64') * 60, unit='s')

model.plot_preds(mean, date_train, date_test, y_train, y_test)

In [None]:
# Two stacked panels, one per target column (0 on top, 1 below). Each panel shows the
# training targets, then the test targets and the predicted mean, both placed
# immediately after the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, target_col in ((ax1, 0), (ax2, 1)):
    offset = len(y_train[:, target_col])
    panel.plot(range(offset), y_train[:, target_col], label='train', linewidth=2)
    panel.plot(range(offset, offset + len(y_test[:, target_col])), y_test[:, target_col], label='test', linewidth=2)
    panel.plot(range(offset, offset + len(mean[:, target_col])), mean[:, target_col], label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(pd.Series(y_test[:,0]), pd.Series(y_test[:,1]), pd.Series(mean[:,0]), pd.Series(mean[:,1]))

### Benchmarking

In [None]:
from statsmodels.tsa.holtwinters import Holt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
def _mean_per_timestamp(series):
    """Average values that share an index label; the result is ordered by label."""
    averaged = series.groupby(series.index).mean()
    return pd.Series(averaged.tolist(), averaged.index.tolist())


# Latitude (target column 0) indexed by the integer time stamps of each split.
lat = pd.Series(y_train[:,0].tolist(), unix_min_tr)
lat_t = pd.Series(y_test[:,0].tolist(), unix_min_te)
# Replace duplicates (in time) with the mean of the duplicated values
lat = _mean_per_timestamp(lat)
lat_tc = _mean_per_timestamp(lat_t)
# Replace zeroes with positives close to zero (training series only)
lat = lat.replace(0, 0.000000001)


# Longitude (target column 1), built the same way.
lon = pd.Series(y_train[:,1].tolist(), unix_min_tr)
lon_t = pd.Series(y_test[:,1].tolist(),unix_min_te)
# Replace duplicates (in time) with the mean of the duplicated values
lon = _mean_per_timestamp(lon)
lon_tc = _mean_per_timestamp(lon_t)
# Replace zeroes with positives close to zero (training series only)
lon = lon.replace(0, 0.000000001)

### SimpleExpSmoothing
`smoothing_level = 0.1`

In [None]:
# Smoothing level passed explicitly to both fits below.
smoothing_level = 0.1
# Latitude: fit on the training series, predict between the first and last test index
# values, then keep only the predictions whose index is an observed test time stamp.
# NOTE(review): start/end are raw unix-minute values; how statsmodels maps them onto
# the training series' integer index should be confirmed.
ses_lat = SimpleExpSmoothing(lat, initialization_method="heuristic").fit(smoothing_level=smoothing_level, optimized=True)
pred_lat = ses_lat.predict(start=lat_tc.index[0], end=lat_tc.index[-1])
pred_lat_comp = pred_lat[pred_lat.index.isin(unix_min_te)]

# Longitude: same procedure.
ses_lon = SimpleExpSmoothing(lon, initialization_method="heuristic").fit(smoothing_level=smoothing_level, optimized=True)
pred_lon = ses_lon.predict(start=lon_tc.index[0], end=lon_tc.index[-1])
pred_lon_comp = pred_lon[pred_lon.index.isin(unix_min_te)]

In [None]:
# Two stacked panels (latitude on top, longitude below): the training series, then the
# de-duplicated test series and the retained forecasts, both placed immediately after
# the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, train_s, test_s, pred_s in ((ax1, lat, lat_tc, pred_lat_comp),
                                       (ax2, lon, lon_tc, pred_lon_comp)):
    offset = len(train_s)
    panel.plot(range(offset), train_s, label='train', linewidth=2)
    panel.plot(range(offset, offset + len(test_s)), test_s, label='test', linewidth=2)
    panel.plot(range(offset, offset + len(pred_s)), pred_s, label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(lat_tc, lon_tc, pred_lat_comp, pred_lon_comp)

### Holt
`smoothing_level_lat=0.2`
`smoothing_slope_lat=0.045`

`smoothing_level_lon=0.1`
`smoothing_slope_lon=0.0307`

In [None]:
smoothing_level_lat=0.2
smoothing_slope_lat=0.045

# Latitude: damped-trend Holt model with the level and trend smoothing parameters
# passed explicitly. `smoothing_trend` is the current name of the keyword that
# statsmodels deprecated as `smoothing_slope`.
holt = Holt(lat, damped_trend=True, initialization_method="estimated").fit(smoothing_level=smoothing_level_lat, smoothing_trend=smoothing_slope_lat)
pred_lat = holt.predict(start=lat_tc.index[0], end=lat_tc.index[-1])
# Keep only the predictions whose index is an observed test time stamp.
pred_lat_comp = pred_lat[pred_lat.index.isin(unix_min_te)]

smoothing_level_lon=0.1
smoothing_slope_lon=0.0307

# Longitude: same procedure. The prediction bounds now come from the longitude test
# series (they were previously taken from lat_tc), matching the other benchmark cells.
holt = Holt(lon, damped_trend=True, initialization_method="estimated").fit(smoothing_level=smoothing_level_lon, smoothing_trend=smoothing_slope_lon)
pred_lon = holt.predict(start=lon_tc.index[0], end=lon_tc.index[-1])
pred_lon_comp = pred_lon[pred_lon.index.isin(unix_min_te)]

In [None]:
# Two stacked panels (latitude on top, longitude below): the training series, then the
# de-duplicated test series and the retained forecasts, both placed immediately after
# the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, train_s, test_s, pred_s in ((ax1, lat, lat_tc, pred_lat_comp),
                                       (ax2, lon, lon_tc, pred_lon_comp)):
    offset = len(train_s)
    panel.plot(range(offset), train_s, label='train', linewidth=2)
    panel.plot(range(offset, offset + len(test_s)), test_s, label='test', linewidth=2)
    panel.plot(range(offset, offset + len(pred_s)), pred_s, label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(lat_tc, lon_tc, pred_lat_comp, pred_lon_comp)

### Exponential Smoothing
`seasonal_periods=36`

In [None]:
# Season length, in number of observations, used by both fits below.
seasonal_periods=36

# Latitude: additive damped trend + additive seasonality, fitted with use_boxcox=True.
# Predict between the first and last test index values, then keep only the
# predictions whose index is an observed test time stamp.
es = ExponentialSmoothing(lat, seasonal_periods=seasonal_periods, trend='add', seasonal='add', damped_trend=True, use_boxcox=True, initialization_method='estimated').fit()
pred_lat = es.predict(start=lat_tc.index[0], end=lat_tc.index[-1])
pred_lat_comp = pred_lat[pred_lat.index.isin(unix_min_te)]

# Longitude: same configuration.
# NOTE(review): a Box-Cox transform needs strictly positive data; if `lon` holds raw
# western-hemisphere longitudes (negative values) this fit is expected to fail —
# confirm the sign of the data or set use_boxcox=False for this series.
es = ExponentialSmoothing(lon, seasonal_periods=seasonal_periods, trend='add', seasonal='add', damped_trend=True, use_boxcox=True, initialization_method='estimated').fit()
pred_lon = es.predict(start=lon_tc.index[0], end=lon_tc.index[-1])
pred_lon_comp = pred_lon[pred_lon.index.isin(unix_min_te)]

In [None]:
# Two stacked panels (latitude on top, longitude below): the training series, then the
# de-duplicated test series and the retained forecasts, both placed immediately after
# the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, train_s, test_s, pred_s in ((ax1, lat, lat_tc, pred_lat_comp),
                                       (ax2, lon, lon_tc, pred_lon_comp)):
    offset = len(train_s)
    panel.plot(range(offset), train_s, label='train', linewidth=2)
    panel.plot(range(offset, offset + len(test_s)), test_s, label='test', linewidth=2)
    panel.plot(range(offset, offset + len(pred_s)), pred_s, label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(lat_tc, lon_tc, pred_lat_comp, pred_lon_comp)

### ARIMA
`order=(1, 1, 0)`

In [None]:
# Latitude: ARIMA with order (p, d, q) = (1, 1, 0). Predict between the first and last
# test index values, then keep only the predictions whose index is an observed test
# time stamp.
arima = ARIMA(lat, order=(1,1,0)).fit()
pred_lat = arima.predict(start=lat_tc.index[0], end=lat_tc.index[-1])
pred_lat_comp = pred_lat[pred_lat.index.isin(unix_min_te)]

# Longitude: same order and procedure; `arima` is rebound to the longitude fit.
arima = ARIMA(lon, order=(1,1,0)).fit()
pred_lon = arima.predict(start=lon_tc.index[0], end=lon_tc.index[-1])
pred_lon_comp = pred_lon[pred_lon.index.isin(unix_min_te)]

In [None]:
# Two stacked panels (latitude on top, longitude below): the training series, then the
# de-duplicated test series and the retained forecasts, both placed immediately after
# the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, train_s, test_s, pred_s in ((ax1, lat, lat_tc, pred_lat_comp),
                                       (ax2, lon, lon_tc, pred_lon_comp)):
    offset = len(train_s)
    panel.plot(range(offset), train_s, label='train', linewidth=2)
    panel.plot(range(offset, offset + len(test_s)), test_s, label='test', linewidth=2)
    panel.plot(range(offset, offset + len(pred_s)), pred_s, label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(lat_tc, lon_tc, pred_lat_comp, pred_lon_comp)

### SARIMAX
`order=(1, 0, 0)`
`seasonal_order=(1, 1, 1, 24)`

In [None]:
# Latitude: SARIMAX with non-seasonal order (1, 0, 0) and seasonal order (1, 1, 1)
# with a season length of 24 observations. Predict between the first and last test
# index values, then keep only the predictions whose index is an observed test time
# stamp.
sarimax_lat = SARIMAX(lat, order=(1,0,0), seasonal_order=(1, 1, 1, 24)).fit(disp=False)
pred_lat = sarimax_lat.predict(start=lat_tc.index[0], end=lat_tc.index[-1])
pred_lat_comp = pred_lat[pred_lat.index.isin(unix_min_te)]

# Longitude: same specification and procedure.
sarimax_lon = SARIMAX(lon, order=(1,0,0), seasonal_order=(1, 1, 1, 24)).fit(disp=False)
pred_lon = sarimax_lon.predict(start=lon_tc.index[0], end=lon_tc.index[-1])
pred_lon_comp = pred_lon[pred_lon.index.isin(unix_min_te)]

In [None]:
# Two stacked panels (latitude on top, longitude below): the training series, then the
# de-duplicated test series and the retained forecasts, both placed immediately after
# the training points on a positional x axis.
f, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True)
for panel, train_s, test_s, pred_s in ((ax1, lat, lat_tc, pred_lat_comp),
                                       (ax2, lon, lon_tc, pred_lon_comp)):
    offset = len(train_s)
    panel.plot(range(offset), train_s, label='train', linewidth=2)
    panel.plot(range(offset, offset + len(test_s)), test_s, label='test', linewidth=2)
    panel.plot(range(offset, offset + len(pred_s)), pred_s, label='pred', linewidth=3)

# Lines carry labels, but no legend is drawn.
plt.show()

In [None]:
metrics.average_eval(lat_tc, lon_tc, pred_lat_comp, pred_lon_comp)

In [None]:
sarimax_lat.plot_diagnostics()

In [None]:
# Span of the test set: last minus first test index value (presumably minutes, since
# the index holds unix_min values — confirm).
tot = lat_t.index[-1] - lat_t.index[0]
# Bin width, in the same unit as the index.
bin_len = 15
# Number of bins needed to cover the test span, rounded up.
pred_amt = int(np.ceil(tot / bin_len))

## Deep Learning

In [None]:
def prepare(y, w):
    """Build a sliding-window supervised dataset from a sequence.

    Parameters
    ----------
    y : sequence supporting ``len`` and slicing
        The ordered values.
    w : int
        Window length.

    Returns
    -------
    (XX, YY) : tuple of np.ndarray
        ``XX[i]`` is the window ``y[i:i+w]`` for ``i`` in ``range(len(y) - w)`` and
        ``YY`` is ``y[w:]``, so ``YY[i]`` is the value that follows window ``i``.
    """
    n_windows = len(y) - w
    windows = np.array([y[start:start + w] for start in range(n_windows)])
    targets = np.array(y[w:])
    return windows, targets

# Sliding-window datasets for the latitude series; each sample is a run of `window`
# consecutive values and its target is the value that follows.
window = 100
train_x, train_y = prepare(lat, window)
test_x, test_y = prepare(lat_tc, window)

# Print the array shapes in the order train_x, train_y, test_x, test_y.
for _arr in (train_x, train_y, test_x, test_y):
    print(_arr.shape)

In [None]:
# Scratch check: builds the same windows expression that prepare() uses for its first
# return value. The result is not assigned, and only the last expression of a cell is
# displayed, so this array is discarded.
np.array([lat[i:i+window] for i in range(len(lat)-window)])

# Displayed cell output.
train_x

### Short Gap Simulation Design
* **Assumption**: MCAR (will remove points randomly)
* Use trip data only (include trips from the past)
* Divide each trip into X-min intervals
    + Sensitivity Analysis
        - Try 1, 5, 10, 15, 30-min intervals
* Introduce sparsity at 10% intervals
    + i.e., first remove 10% of the X-min intervals, then 20%, etc.
        - Repeat the process 10 times at each sparsity level q, so that different portions of the data are removed each time
 

In [None]:
tau = 5 # bin length
# Fractions of the tau-length intervals to remove (10% steps).
sparsity = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for i in range(10): # Cross-validation
    ### code below
    # TODO: implement the simulation for repetition `i` and collect its output.
    ###
    # A bare `return` is only valid inside a function and made this cell a
    # SyntaxError; `pass` keeps the skeleton runnable until the body is written.
    pass  # output?