# 3. Dataset Shift Detection

<div class="alert alert-block alert-info">
    <b>About:</b>
    This notebook refers to the studies presented in <b>Chapter 5.4</b> of the Ph.D. thesis [3].
    We cannot guarantee the completeness or correctness of the code.
    If you find bugs or have suggestions on how to improve the code, we encourage you to post your ideas as a <a href="https://github.com/felixriese/alpaca-processing/issues">GitHub issue</a>.
</div>

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import pickle
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import cross_validate, train_test_split
import sklearn.metrics as me
from scipy.signal import savgol_filter
import susi

sys.path.append("../aprocessing/")

import utils

## Load data

In [None]:
# CHANGE maximum soil moisture value; rows above this value are dropped when
# the data is loaded below
max_sm = 40

# CHANGE areas from ["1", "2_1", "2_2", "3", "4", "5"]
areas = ["1", "2_1", "2_2", "3", "4",  "5"]

# Sub-areas sharing the prefix before "_" (e.g. "2_1" and "2_2") are stacked
# into one group. The groups are derived from `areas` so that both lists stay
# consistent when `areas` is edited; dict insertion order keeps the original
# area order.
_area_groups = {}
for _area in areas:
    _area_groups.setdefault(_area.split("_")[0], []).append(_area)
areas_stacked = list(_area_groups.values())

In [None]:
# Read the processed dataset; the first CSV column becomes the index.
df = pd.read_csv("../data/processed/peru_data.csv", index_col=0)

# Keep only rows that belong to a selected area and whose soil moisture does
# not exceed the configured maximum.
in_selected_area = df["area"].isin(areas)
within_max_sm = df["soilmoisture"] <= max_sm
df = df[in_selected_area & within_max_sm]

# Hyperspectral bands; their string representation is used as column names.
hypbands = utils.getHyperspectralBands(True)
print(hypbands.shape)

# Arrays: one column per hyperspectral band, and the soil moisture values.
band_columns = hypbands.astype("str")
X = df[band_columns].values
y = df["soilmoisture"].values

print(df.shape, X.shape, y.shape)

In [None]:
def getClosestBandIndex(wavelength):
    """Return the index of the hyperspectral band closest to `wavelength`.

    The bands are fetched from `utils.getHyperspectralBands(True)` on every
    call, and the index with the smallest absolute difference to `wavelength`
    is returned.

    NOTE(review): `wavelength` is presumably expected in the same unit as the
    values returned by `utils.getHyperspectralBands` -- confirm.
    """
    bands = utils.getHyperspectralBands(True)
    distances = np.abs(bands - wavelength)
    return distances.argmin()

In [None]:
# Report how many rows of the filtered dataset fall into each selected area.
print("Datapoints per area:")
for area in areas:
    n_datapoints = len(df[df["area"] == area])
    print("Area {0:4}:\t{1}".format(area, n_datapoints))

In [None]:
# One soil-moisture histogram per area on a 2x3 grid (room for six areas).
fig, ax_list = plt.subplots(2, 3, figsize=(15, 8))

# Bin edges every `bin_width` units. np.arange excludes its stop value, so the
# stop is pushed past max_sm; otherwise the last edge ends below max_sm and
# all values above it are silently left out of the histograms.
bin_width = 5
bins = np.arange(0, max_sm + bin_width, bin_width)

for i, area in enumerate(areas):
    ax = ax_list[i//3, i%3]
    ax.set_title("Area "+str(area))
    ax.set_xlim(0, max_sm)
    ax.hist(df[df["area"]==area].soilmoisture.values, bins)
plt.tight_layout()

### SOM

In [None]:
# SOM grid dimensions used for the axis limits and histogram bins below.
# NOTE(review): presumably these must match the grid of the pickled SOM
# loaded further down -- confirm.
n_rows, n_columns = 40, 40

In [None]:
# Rebuild viridis as a LinearSegmentedColormap named 'mcm' from its own
# colour entries, keeping the same number of colours.
base_cmap = plt.cm.viridis
cmaplist = list(map(base_cmap, range(base_cmap.N)))
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('mcm', cmaplist, base_cmap.N)

In [None]:
# Presumably the model was stored with:
# pickle.dump(model, open("../data/models/som_clustering.p", "wb"))
#
# SECURITY: unpickling can execute arbitrary code -- only load model files
# that come from a trusted source.
# The context manager closes the file handle after loading.
with open("../data/models/som_clustering.p", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
fontsize = 15

# Bin edges of the 2D histograms over the SOM grid: x = columns, y = rows,
# with one bin per `steps` grid positions.
steps = 3
bins = [np.arange(0, n_columns, steps), np.arange(0, n_rows, steps)]
fig, ax_list = plt.subplots(2, 3, figsize=(12, 8), sharey=True, sharex=True)
for i, area in enumerate(areas_stacked):
    # Spectra of all datapoints belonging to this (stacked) area.
    X_area = df[df["area"].isin(area)][hypbands.astype("str")].values
    # NOTE(review): presumably one (row, column) grid position per datapoint;
    # the indexing below relies on that order -- confirm.
    clusters = model.get_clusters(X_area)

    ax = ax_list[i//3, i%3]
    # cmin=1 leaves bins without any datapoint blank.
    img = ax.hist2d([x[1] for x in clusters], [x[0] for x in clusters], bins=bins, cmin=1,
                    cmap=cmap, vmin=1, vmax=5)
    # Title uses the first character of the first area name, e.g. "2_1" -> "A2".
    ax.set_title("Area A"+str(area[0][0]), fontsize=fontsize)
    ax.set_xlim(0, n_columns-1)
    # Inverted y-axis: row 0 at the top.
    ax.set_ylim(n_rows, 0)

    # Axis labels only on the left column and the bottom row.
    if i % 3 == 0:
        ax.set_ylabel("SOM rows", fontsize=fontsize)
    if i // 3 == 1:
        ax.set_xlabel("SOM columns", fontsize=fontsize)

    # Tick.label was deprecated in Matplotlib 3.1 and removed in 3.8;
    # tick_params sets the tick label size without touching Tick objects.
    ax.tick_params(axis="both", which="major", labelsize=fontsize)

    if i == len(areas_stacked)-1:
        # The colorbar takes its space from the unused last subplot, which is
        # removed after the loop.
        cbar = plt.colorbar(img[3], ax=ax_list[-1,-1])
        cbar.ax.tick_params(labelsize=fontsize)
        cbar.ax.set_ylabel('Number of datapoints', fontsize=fontsize, labelpad=10)
fig.delaxes(ax_list[-1,-1])

plt.tight_layout()
plt.savefig("../plots/som_hist_all.pdf", bbox_inches="tight")

## Quantitative Detection of Dataset Shift

In [None]:
# Bin edges of the 2D histograms over the SOM grid: x = columns, y = rows.
steps = 3
bins = [np.arange(0, n_columns, steps), np.arange(0, n_rows, steps)]

# For every (stacked) area, compare the SOM bins occupied by its datapoints
# with the bins occupied by the datapoints of all other areas.
for area in areas_stacked:
    in_area = df["area"].isin(area)
    X_area = df[in_area][hypbands.astype("str")].values
    X_notarea = df[~in_area][hypbands.astype("str")].values

    clusters = model.get_clusters(X_area)
    clusters_notarea = model.get_clusters(X_notarea)

    grid, _, _ = np.histogram2d([x[1] for x in clusters], [x[0] for x in clusters], bins=bins)
    grid_notarea, _, _ = np.histogram2d([x[1] for x in clusters_notarea],
                                        [x[0] for x in clusters_notarea], bins=bins)

    # Bins holding at least one datapoint of this area / of the other areas.
    occupied = grid != 0
    occupied_notarea = grid_notarea != 0

    n_overlap = np.count_nonzero(occupied & occupied_notarea)
    n_neurons = np.count_nonzero(occupied)
    # Total number of histogram bins (not individual SOM nodes, since each bin
    # spans `steps` grid positions per axis).
    n_allneurons = grid.size

    print("Area(s): {0}".format(", ".join(area)))
    print("Overlapping bins: {0}".format(n_overlap))
    print("Percentage of the SOM grid: {0:.1f} %".format(n_neurons/n_allneurons*100))
    if n_neurons:
        print("Percentage of overlap with others: {0:.1f} %".format(n_overlap/n_neurons*100))
    else:
        # Without any occupied bin the ratio is undefined; avoid ZeroDivisionError.
        print("Percentage of overlap with others: n/a (no datapoints in this area)")
    print()