In [None]:
from glob import glob
from os.path import join
import os
import numpy as np
import itertools
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct, WhiteKernel, Matern
from sklearn.utils import resample
import matplotlib.pyplot as plt
import pandas as pd

# Data choice

In [None]:
# Experiment name; used below as a path component of the segmented-data folder.
experiment = 'december'

In [None]:
# Fruit to analyse; used as a path component of the data folder and as the first half of the row label.
fruit = 'orange'

In [None]:
# Cut quality to analyse; used as a path component of the data folder and as the second half of the row label.
cut_quality = 'good' # good / shallow / deep

In [None]:
# Fraction of the loaded recordings (files) intended for the training split.
train_percentage = 0.7

# Segmented data reading

In [None]:
# Folder holding the segmented recordings for the selected experiment / fruit / cut quality.
data_folder = os.path.join(
    '..',
    'data',
    'segmented_data',
    experiment,
    fruit,
    cut_quality,
)

In [None]:
# os.listdir returns entries in arbitrary (filesystem-dependent) order.
# Sorting makes the file sequence, and therefore the seeded train/test split
# performed later on the loaded recordings, reproducible across machines.
data_files = sorted(os.listdir(data_folder))
print(data_files)

In [None]:
# Load every recording of the folder into its own DataFrame (first CSV column
# becomes the index) and tag all of its rows with the '<fruit>-<cut_quality>' label.
timeseries = [
    pd.read_csv(join(data_folder, file_name), index_col=0)
      .assign(label='-'.join([fruit, cut_quality]))
    for file_name in data_files
]

# Split preparation

In [None]:
def concat_downsample_and_sort(X, Y, num_samples, sort_by, random_state=42, reset_index=True):
    """Pool several recordings, draw a random subset of rows and order it by one column.

    Parameters
    ----------
    X, Y : sequence of pandas.DataFrame
        Paired frames; ``X[i]`` and ``Y[i]`` are expected to describe the same
        rows, because rows are matched by their position after concatenation.
    num_samples : int
        Number of rows drawn (without replacement) from the pooled data.
    sort_by : str
        Column of the pooled ``X`` frame that defines the output row order.
    random_state : int, default 42
        Seed forwarded to ``sklearn.utils.resample``.
    reset_index : bool, default True
        When True, both returned frames get a fresh ``0..n-1`` index. The
        previous index is discarded rather than kept as an extra ``index``
        column, so the returned ``X`` frame contains only feature columns and
        can be passed to an estimator as-is. When False, the returned frames
        keep the row positions of the pooled data as their index.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The sampled ``X`` rows sorted by ``sort_by`` and the matching ``Y``
        rows in the same order.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` contain a different number of frames. Errors from
        ``pandas.concat`` (no frames given) and from ``resample`` (e.g. more
        samples requested than rows available) propagate unchanged.
    """
    if len(X) != len(Y):
        raise ValueError(
            "X and Y must contain the same number of frames, got %d and %d" % (len(X), len(Y))
        )

    # One concat per side instead of growing a frame inside a loop: linear cost,
    # and no empty object-dtype seed frame that could influence the result dtypes.
    X_vector = pd.concat(list(X), axis=0, ignore_index=True)
    Y_vector = pd.concat(list(Y), axis=0, ignore_index=True)

    # After ignore_index=True the index is 0..n-1, so the sampled labels are
    # also valid positions for .iloc on both frames.
    indices = resample(X_vector.index, replace=False, n_samples=num_samples, random_state=random_state)

    X_vector = X_vector.iloc[indices].sort_values(by=[sort_by])
    # Re-order Y by label so every target row stays next to its X row.
    Y_vector = Y_vector.iloc[indices].reindex(X_vector.index)

    if reset_index:
        # drop=True: do not leak the old row position into the data as a column.
        X_vector = X_vector.reset_index(drop=True)
        Y_vector = Y_vector.reset_index(drop=True)
    return X_vector, Y_vector
    

# Columns used as model input and as targets (+ the label) respectively.
x_header = ['displacement']
y_header = ['force_x', 'force_y', 'force_z', 'label']

# Split at the recording level, so rows of one recording never end up in both sets.
# The training fraction is passed directly: `1 - train_percentage` is not exactly
# representable (1 - 0.7 == 0.30000000000000004) and, because the test count is
# rounded up, that tiny excess can move a whole recording from train to test.
X_train, X_test, y_train, y_test = train_test_split(
    [t[x_header] for t in timeseries],
    [t[y_header] for t in timeseries],
    train_size=train_percentage,
    random_state=42,
)

# Pool each split, keep 1000 random rows and order them by displacement.
X_train_vector, y_train_vector = concat_downsample_and_sort(X_train, y_train, 1000, 'displacement')
X_test_vector, y_test_vector = concat_downsample_and_sort(X_test, y_test, 1000, 'displacement')


# Quick visual check of the training samples for the force_x target.
plt.plot(X_train_vector['displacement'],y_train_vector['force_x'], '.')
plt.show()

# ML pipeline

In [None]:
# Base estimator; the printed keys are the parameter names a grid search can address.
gpr = GaussianProcessRegressor(random_state=42)
print(gpr.get_params().keys())

# Candidate kernels, each followed by its tunable parameter names.
rbf_kernel = RBF()
print(rbf_kernel.get_params().keys())
# A bare Matern kernel: its parameters are addressed as `kernel__length_scale` /
# `kernel__nu` (no `k1__` / `k2__` prefix, which only exists on product kernels).
# The earlier `1.0 * Matern(...)` assignment was overwritten immediately and has been removed.
matern_kernel = Matern()
print(matern_kernel.get_params().keys())

# Single-step pipeline; its step name 'gpr' is the prefix for pipeline grid keys.
pipe = Pipeline(steps=[('gpr', gpr)])

## Parameter tuning

In [None]:
# Population standard deviation (ddof=0, matching np.std) of the input column.
print(X_train_vector['displacement'].std(ddof=0))
# Same statistic per force column, scaled by 1/sqrt(2). Only the numeric target
# columns are selected: the frame also carries the string 'label' column, on
# which a standard deviation is undefined.
print(y_train_vector[['force_x', 'force_y', 'force_z']].std(axis=0, ddof=0) / np.sqrt(2))


### Generate configurations to test

In [None]:
# Grid for the pipeline step 'gpr' with an RBF kernel: noise level (alpha)
# and the kernel's initial length scale.
config1 = dict(
    gpr__alpha=[0.2, 0.5, 0.7],
    gpr__kernel=[rbf_kernel],
    gpr__kernel__length_scale=[0.001, 0.01, 0.05],
)

In [None]:
# Grid for the pipeline step 'gpr' with a constant-scaled Matern kernel.
# The `k2__` keys address the second factor of a product kernel, so the grid
# carries its own `1.0 * Matern()` (k1 = constant, k2 = Matern) instead of
# relying on `matern_kernel`, which is a bare Matern at this point and has no `k2`.
config2 = {'gpr__alpha': [1e-10, 0.2, 0.5, 0.8],
           'gpr__normalize_y': [True, False],
           'gpr__kernel': [1.0 * Matern()],
           'gpr__kernel__k2__length_scale': [0.0001, 0.001, 0.01, 0.1, 1],
           'gpr__kernel__k2__nu': [0.001, 0.01, 0.1, 1, 10]}

In [None]:
# Parameter grids handed to the pipeline grid search; only config2 is selected.
tuned_parameters = [config2]

In [None]:
# Exhaustive search over the selected grids on the pipeline, using all CPU cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=tuned_parameters,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Grid applied to the bare GPR estimator (no pipeline, hence no 'gpr__' prefix).
config = dict(
    alpha=[0.6, 0.8],
    kernel=[matern_kernel],
    # kernel__length_scale=[0.001, 1],
    kernel__nu=[0.001, 1],
)
# Rebinds `search`: this estimator-level search replaces any previously defined one.
search = GridSearchCV(estimator=gpr, param_grid=[config], n_jobs=-1, verbose=2)

## Best parameter search

In [None]:
# Run the cross-validated grid search against the force_x target.
search.fit(X=X_train_vector, y=y_train_vector['force_x'])

# Report the best mean CV score and the parameter combination that achieved it.
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Prediction of the best estimator on the training inputs, drawn over the raw samples.
y_pr = search.predict(X_train_vector)

fig, ax = plt.subplots()
ax.plot(X_train_vector['displacement'], y_pr)
ax.plot(X_train_vector['displacement'], y_train_vector['force_x'], '.')
plt.show()

In [None]:
# Hand-picked final model: constant-scaled Matern kernel with fixed noise level.
# NOTE: this rebinds `matern_kernel` (a bare Matern in the earlier cells) to a
# product kernel; re-running the grid-search cells after this one will pick up
# the product kernel instead.
matern_kernel = 1.0 * Matern(length_scale=1, length_scale_bounds=(1e-4, 1e1), nu=1)
# The earlier construction from `search.best_params_` was overwritten right away
# and has been removed, so this cell no longer requires a fitted `search`.
gpr2 = GaussianProcessRegressor(kernel=matern_kernel, alpha=0.5, normalize_y=True)
gpr2.fit(X_train_vector, y_train_vector['force_x'])

In [None]:
# Kernel with the hyperparameters found during fitting, and the log marginal
# likelihood evaluated at exactly those hyperparameters.
fitted_kernel = gpr2.kernel_
print(fitted_kernel)
print(gpr2.log_marginal_likelihood(theta=fitted_kernel.theta))

In [None]:
# Prediction of the hand-picked model on the training inputs, drawn over the raw samples.
y_pr = gpr2.predict(X_train_vector)

fig, ax = plt.subplots()
ax.plot(X_train_vector['displacement'], y_pr)
ax.plot(X_train_vector['displacement'], y_train_vector['force_x'], '.')
plt.show()