In [None]:
import pandas as pd
import numpy as np
import pickle
import sys

from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

from scipy.integrate import simps
from scipy.stats import norm
from scipy.stats import rankdata
from scipy.stats import kendalltau

from datetime import datetime

import random
import math

from multiprocessing.pool import Pool

In [None]:
# Reload the extended dataframe from disk.
# Every later cell needs `data`, so a missing/unreadable file is reported and then
# re-raised instead of letting the cell fail further down with a NameError on `data`.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
try:
    with open('extended_dataframe.pickle', "rb") as extended_dataframe_file:
        data = pickle.load(extended_dataframe_file)
except IOError:
    print("No previous dataframe is available")
    raise

# Computing the year-week feature ('%Y-%U': year and week-of-year number, weeks starting on Sunday)
data['year_week'] = data.datetime.dt.strftime('%Y-%U')

# For each sensor, determine the indexes that partition its data into training and test.
# With the 'temporal_wise' split the temporal relationships are kept into account,
# meaning that test data is in the future with respect to training data.

# Fraction of the data assigned to the training partition
perc_training = 0.8

# One of: temporal_wise, day_based_random, week_based_random, full_random
split_type = 'week_based_random'

# Checking that every sensor has the same number of rows
counts_per_sensor = data[['node', 'datetime']].groupby('node').count()
assert np.max(counts_per_sensor['datetime']) == np.min(counts_per_sensor['datetime'])
# The Series is indexed by node name, so the first entry must be taken positionally
# with .iloc; a bare [0] on a label index is deprecated in pandas.
sensor_data_length = int(counts_per_sensor['datetime'].iloc[0])

# Maps that store the training and test indexes for each sensor
sens_map_train_indexes = {sens: [] for sens in counts_per_sensor.index}
sens_map_test_indexes = {sens: [] for sens in counts_per_sensor.index}
    
    
    
if split_type == 'temporal_wise':
    # The first `perc_training` fraction of each sensor's rows is training and the rest is
    # test, so test data lies in the future with respect to training data.
    # The values were already sorted by node and datetime in the original dataframe;
    # sensor i is therefore expected to own rows [i*length, (i+1)*length).
    for i, sens in enumerate(list(counts_per_sensor.index)):
        base_sens = i*sensor_data_length
        last_train = base_sens + int(sensor_data_length*perc_training)
        # End exactly at this sensor's last row. The former
        # int(length*(1 - perc_training)) + 1 only landed there when floating-point
        # truncation lost one unit; with e.g. perc_training = 0.75 it ran one row into
        # the next sensor.
        last_test = base_sens + sensor_data_length
        sens_map_train_indexes[sens] = list(range(base_sens, last_train))
        sens_map_test_indexes[sens] = list(range(last_train, last_test))

    # Sanity checks: the ranges must match the sensor's rows and must not overlap
    for sens in list(counts_per_sensor.index):
        first_train = sens_map_train_indexes[sens][0]
        last_train = sens_map_train_indexes[sens][-1]
        first_test = sens_map_test_indexes[sens][0]
        last_test = sens_map_test_indexes[sens][-1]
        sens_rows = data.query("node == @sens").reset_index()
        assert first_train == sens_rows.iloc[0,0]
        assert last_test == sens_rows.iloc[-1,0]
        assert last_train > first_train and last_train < first_test
        assert first_test > last_train and first_test < last_test

elif split_type == 'full_random':
    # Divide randomly the data of the sensors
    possible_indexes = list(range(sensor_data_length))
    random.seed(42)
    # The same split is used for each sensor, meaning that corresponding training and
    # test indexes of the different sensors refer to the same datetimes
    train_indexes_overall = sorted(random.sample(possible_indexes, int(sensor_data_length * perc_training)))
    test_indexes_overall = sorted(list(set(possible_indexes) - set(train_indexes_overall)))

    for i, sens in enumerate(list(counts_per_sensor.index)):
        base_sens = i * sensor_data_length
        # It is sufficient to add the baseline, since all sensor tracks have the same length
        sens_map_train_indexes[sens] = np.asarray(train_indexes_overall) + base_sens
        sens_map_test_indexes[sens] = np.asarray(test_indexes_overall) + base_sens

elif split_type == 'day_based_random':
    # Whole days are randomly assigned to either training or test
    np.random.seed(42)
    days_list = data.only_date.unique().tolist()
    np.random.shuffle(days_list)
    random_train_days = sorted(days_list[0:int(len(days_list)*perc_training)])
    random_test_days =  sorted(days_list[int(len(days_list)*perc_training):])

    for i, sens in enumerate(list(counts_per_sensor.index)):
        subdata = data[data['node'] == sens] # here the indexes are still the same as in the original frame
        sens_map_train_indexes[sens] = subdata[subdata.only_date.isin(random_train_days)].index.tolist()
        sens_map_test_indexes[sens] = subdata[subdata.only_date.isin(random_test_days)].index.tolist()

elif split_type == 'week_based_random':
    # Whole year-weeks are randomly assigned to either training or test
    np.random.seed(42)
    weeks_list = data.year_week.unique().tolist()
    np.random.shuffle(weeks_list)
    random_train_weeks = sorted(weeks_list[0:int(len(weeks_list)*perc_training)])
    random_test_weeks =  sorted(weeks_list[int(len(weeks_list)*perc_training):])

    for i, sens in enumerate(list(counts_per_sensor.index)):
        subdata = data[data['node'] == sens] # here the indexes are still the same as in the original frame
        sens_map_train_indexes[sens] = subdata[subdata.year_week.isin(random_train_weeks)].index.tolist()
        sens_map_test_indexes[sens] = subdata[subdata.year_week.isin(random_test_weeks)].index.tolist()

else:
    raise ValueError('Unsupported split type')

if split_type != 'temporal_wise':
    # Consistency checks shared by the random splits, done on two sample sensors.
    # The index maps are converted to arrays first: the day/week splits store plain
    # lists, and `list - int` raises a TypeError.
    sensor_order = list(counts_per_sensor.index)
    sample_offset = (sensor_order.index('raspihat03') - sensor_order.index('raspihat01')) * sensor_data_length
    sample_train_a = np.asarray(sens_map_train_indexes['raspihat01'])
    sample_test_a = np.asarray(sens_map_test_indexes['raspihat01'])
    sample_train_b = np.asarray(sens_map_train_indexes['raspihat03'])
    sample_test_b = np.asarray(sens_map_test_indexes['raspihat03'])

    # The two sensors must use the same split, shifted by their row offset
    assert np.array_equal(sample_train_a, sample_train_b - sample_offset)
    assert np.array_equal(sample_test_a, sample_test_b - sample_offset)
    # Training and test must be disjoint and together cover all the rows of the sensor
    assert len(set(sample_train_a).intersection(sample_test_a)) == 0
    assert len(set(sample_train_a).union(sample_test_a)) == len(data[data['node'] == 'raspihat01'])
    # The training and test indexes are different for each sensor, and refer to the original dataframe
    

In [None]:
# Some general information

# Window positions as (x, y) pairs
window_coords = [(1, 0), (3, 0), (6, 0), (8, 0), (10, 0),
                 (1, 9), (3, 9), (6, 9), (8, 9), (10, 9)]

# Flatten the per-sensor index maps, one sensor after the other
all_train_idxs = [idx for sens in sens_map_train_indexes for idx in sens_map_train_indexes[sens]]
all_test_idxs = [idx for sens in sens_map_test_indexes for idx in sens_map_test_indexes[sens]]

# (coord_x, coord_y) of every sensor, read from the first row of that sensor
node_map_coords = {}
for sens in sens_map_test_indexes:
    first_row = data[data['node'] == sens].iloc[0]
    coord_x, coord_y = first_row['coord_x'], first_row['coord_y']
    node_map_coords[sens] = (coord_x, coord_y)

all_sensors = np.unique(data['node'])

# Training and test partitions, re-indexed from 0
all_training_data = data.iloc[all_train_idxs].reset_index(drop=True)
all_test_data = data.iloc[all_test_idxs].reset_index(drop=True)

def eucl_dist(x_0, y_0, x_1, y_1):
    """Return the Euclidean distance between the points (x_0, y_0) and (x_1, y_1)."""
    delta_x = x_0 - x_1
    delta_y = y_0 - y_1
    return np.sqrt(delta_x**2 + delta_y**2)

# Number of rows of the training and test partitions, measured on sensor raspihat01
training_data_ts = int((all_training_data['node'] == 'raspihat01').sum())

test_data_ts = int((all_test_data['node'] == 'raspihat01').sum())

# Air tube positions: the points (0, 4) ... (10, 4)
air_tube_coords = [(x, 4) for x in range(11)]

def array_rep(arr, n):
    """Repeat `arr` n times along its first axis.

    A 1-D input is tiled as is; a 2-D input is stacked row-block after row-block
    and returned as a float array. Inputs with more than 2 dimensions are rejected.
    """
    rank = len(arr.shape)
    assert rank <= 2
    if rank == 2:
        # Stack n copies vertically; the result is always floating point
        return np.tile(arr, (n, 1)).astype(np.float64)
    return np.tile(arr, n)

In [None]:
# Empty dataframe that collects all the results, one row per evaluated method and fold
results = pd.DataFrame(
    columns=['method', 'fold', 'median_error', '95_perc_error'],
)

In [None]:
###### Weighted distance utility functions

# Module-level centre point as an (x, y) pair.
# NOTE(review): the centre-based helpers defined in this cell use their own local
# `center`; no reader of this module-level value is visible here - confirm before removing.
center = (5,5)


# NOTE(review): numpy and math are already imported in the first cell; these
# repeated imports are redundant there but keep this cell runnable on its own.
import numpy as np
import math

## Spatial distances

# Angular distance between two points
def angular_distance(p1,p2):
    """Angle, in degrees within [0, 360), from the direction of p2 to the direction of p1.

    Directions are measured from the origin with arctan2, so when p2 is the
    origin (0, 0) the result is the angle of p1 with respect to the x axis.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one angle per point pair.

    NOTE(review): the previous implementation wrapped the result in an
    `np.minimum` of the same expression with itself, which is a no-op. If the
    smaller of the two arcs (a value in [0, 180]) was intended, the second operand
    should use `ang2 - ang1`. The numerical behaviour is deliberately left
    unchanged here because fitted weights elsewhere may depend on it - confirm.
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))

    ang1 = np.arctan2(p1[:,1], p1[:,0])
    ang2 = np.arctan2(p2[:,1], p2[:,0])

    return np.rad2deg(np.mod(ang1 - ang2, 2 * np.pi))
    
# Euclidean distance between two points
def euclidean_distance(p1,p2):
    """Euclidean distance between (x, y) points.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one distance per point pair (length 1 for two single points).
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))
    delta_x = p1[:,0] - p2[:,0]
    delta_y = p1[:,1] - p2[:,1]
    return np.sqrt(delta_x**2 + delta_y**2)

# Manhattan distance between two points
def manhattan_distance(p1,p2):
    """Manhattan (L1) distance between (x, y) points.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one distance per point pair.
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))
    return np.abs(p1[:,0] - p2[:,0]) + np.abs(p1[:,1] - p2[:,1])

# Chebyshev distance between two points
def chebyshev_distance(p1,p2):
    """Chebyshev (L-infinity) distance between (x, y) points.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one distance per point pair.
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))
    return np.maximum(np.abs(p1[:,0] - p2[:,0]), np.abs(p1[:,1] - p2[:,1]))

# Distance from the nearest window
def min_window_distance(p):
    """Euclidean distance from each point to its nearest window.

    Parameters
    ----------
    p : a single (x, y) point or a sequence/array of (x, y) points.

    Returns
    -------
    1-D numpy array with one distance per point.
    """
    window_coords = [[1., 0.], [3., 0.], [6., 0.], [8., 0.], [10., 0.],
                     [1., 9.], [3., 9.], [6., 9.], [8., 9.], [10., 9.]]
    # Accept tuples, lists of points and arrays alike; always work on an (n, 2) array
    p = np.atleast_2d(np.asarray(p))
    # One row of distances per window, then the minimum over the windows
    return np.min([euclidean_distance(p, np.array([w])) for w in window_coords], axis=0)

# Center distance
def center_distance(p):
    """Euclidean distance from each point to the centre point (5, 5).

    Parameters
    ----------
    p : a single (x, y) point or a sequence/array of (x, y) points.

    Returns
    -------
    1-D numpy array with one distance per point.
    """
    center = [5.,5.]
    # Accept tuples, lists of points and arrays alike; always work on an (n, 2) array
    p = np.atleast_2d(np.asarray(p))
    return euclidean_distance(np.array([center]), p)

# Window distance between two points
def window_dist_similarity(p1,p2):
    """Absolute difference between the nearest-window distances of p1 and p2.

    Two points are "similar" (value close to 0) when they are about equally far
    from their respective nearest window.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one value per point pair.
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))

    # Reuse the shared helper rather than repeating the window list here
    return np.abs(min_window_distance(p1) - min_window_distance(p2))

# Center distance between two points
def center_dist_similarity(p1,p2):
    """Absolute difference between the centre distances of p1 and p2.

    Two points are "similar" (value close to 0) when they are about equally far
    from the centre point used by `center_distance`.

    Parameters
    ----------
    p1, p2 : a single (x, y) point or a sequence/array of (x, y) points.
        A single point is broadcast against the points of the other argument.

    Returns
    -------
    1-D numpy array with one value per point pair.
    """
    # Accept tuples, lists of points and arrays alike; always work on (n, 2) arrays
    p1 = np.atleast_2d(np.asarray(p1))
    p2 = np.atleast_2d(np.asarray(p2))

    # Reuse the shared helper rather than repeating the centre point here
    return np.abs(center_distance(p1) - center_distance(p2))


# Given all distances estimate the weighted distance
def weighted_distance(p1, p2):
    """Linear combination of six distances between p1 and p2.

    The combined distances are, in this order: angular, euclidean, manhattan,
    chebyshev, window similarity and centre similarity; each is multiplied by
    the corresponding fixed weight.

    Returns a scalar when a single pair is evaluated, otherwise a 1-D array.
    """
    # One weight per distance, in the order listed in the docstring
    rank_weights = [0.008478247652264459, 1.3935710976695117, 0.39760921865944,
                    0.8009320148032265, 3.7056463223028255, -0.15417942722471556]
    distance_functions = (angular_distance, euclidean_distance, manhattan_distance,
                          chebyshev_distance, window_dist_similarity, center_dist_similarity)

    # Collections of points are rebuilt as (n, 2) arrays
    if np.ndim(p1) == 2:
        p1 = np.array([[a, b] for a, b in p1])
    if np.ndim(p2) == 2:
        p2 = np.array([[a, b] for a, b in p2])

    # One row per distance type, one column per point pair
    dists = np.array([distance(p1, p2) for distance in distance_functions])
    res = np.dot(dists.T, rank_weights)

    return res[0] if len(res) == 1 else res


def my_sqrt(arg1):
    """Square root with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    root = np.sqrt(arg1)
    return np.nan_to_num(root)
def my_log(arg1):
    """Natural logarithm with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    logarithm = np.log(arg1)
    return np.nan_to_num(logarithm)
def my_abs(arg1):
    """Absolute value with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    magnitude = np.abs(arg1)
    return np.nan_to_num(magnitude)
def my_neg(arg1):
    """Negation with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    negated = np.negative(arg1)
    return np.nan_to_num(negated)
def my_square(arg1):
    """Square with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    squared = np.square(arg1)
    return np.nan_to_num(squared)
def my_add(arg1, arg2):
    """Addition with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    total = np.add(arg1, arg2)
    return np.nan_to_num(total)
def my_sub(arg1, arg2):
    """Subtraction with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    difference = np.subtract(arg1, arg2)
    return np.nan_to_num(difference)
def my_mul(arg1, arg2):
    """Multiplication with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    product = np.multiply(arg1, arg2)
    return np.nan_to_num(product)
def my_div(arg1, arg2):
    """Division with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    quotient = np.divide(arg1, arg2)
    return np.nan_to_num(quotient)
def my_pow(arg1, arg2):
    """Power arg1**arg2 with non-finite results sanitised by np.nan_to_num (NaN becomes 0)."""
    raised = np.power(arg1, arg2)
    return np.nan_to_num(raised)
def my_max(arg1, arg2):
    """Element-wise maximum whose NaN results are replaced by the most negative finite float.

    np.maximum propagates NaN, so a NaN operand yields NaN; that NaN is replaced
    by -sys.float_info.max, mirroring `my_min`, which uses +sys.float_info.max.
    The previous replacement, sys.float_info.min, is the smallest *positive*
    normalised float (about 2.2e-308), not the most negative value.
    """
    return np.nan_to_num(np.maximum(arg1, arg2), nan=-sys.float_info.max)
def my_min(arg1, arg2):
    """Element-wise minimum whose NaN results are replaced by the largest finite float."""
    smallest = np.minimum(arg1, arg2)
    return np.nan_to_num(smallest, nan=sys.float_info.max)

def gp_func(ARG0,ARG1,ARG2,ARG3):
    """Fixed expression combining the inputs through the protected `my_*` operators.

    The expression is the maximum of two branches:
    - (max(ARG3, log(ARG1)) / (sqrt(c0) - (c1 + ARG1))) + max(c2, ARG3)
    - max(c3, c4 / ARG0)
    ARG2 is accepted but not used by the expression.
    """
    numerator = my_max(ARG3, my_log(ARG1))
    denominator = my_sub(my_sqrt(0.28819287526174175), my_add(-0.8372387615502006, ARG1))
    ratio = my_div(numerator, denominator)
    first_branch = my_add(ratio, my_max(0.4761407317066777, ARG3))
    second_branch = my_max(-0.3152168742592161, my_div(0.8792291653581332, ARG0))
    return my_max(first_branch, second_branch)

# Given the basic distances, estimate the distance through gp_func
def gp_distance(p1, p2):
    """Distance between p1 and p2 computed by `gp_func`.

    `gp_func` receives, in this order, the angular, euclidean, manhattan and
    chebyshev distances between p1 and p2.

    Returns a scalar when a single pair is evaluated, otherwise a 1-D array.
    """
    # Collections of points are rebuilt as (n, 2) arrays
    if np.ndim(p1) == 2:
        p1 = np.array([[a, b] for a, b in p1])
    if np.ndim(p2) == 2:
        p2 = np.array([[a, b] for a, b in p2])

    angular = angular_distance(p1, p2)
    euclidean = euclidean_distance(p1, p2)
    manhattan = manhattan_distance(p1, p2)
    chebyshev = chebyshev_distance(p1, p2)
    res = gp_func(angular, euclidean, manhattan, chebyshev)

    return res[0] if len(res) == 1 else res

In [None]:
# Baseline methods: IDW and classical average
# Results are evaluated using leave-one-out cross-validation over the evaluation sensors,
# considering the test set. The evaluation sensors are all the sensors except the reference
# ones, to be homogeneous with respect to the evaluation of the machine learning methods.
# K-NN is considered when evaluating the two methods, with K from 1 to len(all_sensors) - 1:
# the K nearest neighbours can span all the other sensors (reference sensors included).

ref_sensors_here = ['raspihat01','raspihat04','raspihat08','raspihat09']
eval_sensors = sorted(list(set(all_sensors) - set(ref_sensors_here)))

# Drop the rows written by a previous run of this cell, so that re-running it does not duplicate them
to_delete = [x for x in results['method'] if 'average' in x]
results = results[np.logical_not(results['method'].isin(to_delete))]
results.reset_index(drop=True, inplace=True)

# Loop-invariant data, computed once instead of at every (K, sensor) iteration:
# the test temperatures of every sensor ...
test_temps_by_sensor = {
    sens: all_test_data[all_test_data['node'] == sens]['temperature'].values
    for sens in all_sensors
}

# ... and, for every evaluation sensor, the other sensors ranked by increasing distance
ranked_landmarks = {}
for sensor in eval_sensors:
    landmark_sensors = sorted(list(set(all_sensors) - set([sensor])))
    X_coord, Y_coord = node_map_coords[sensor]
    distances = []
    for other_sensor in landmark_sensors:
        other_X_coord, other_Y_coord = node_map_coords[other_sensor]
        distances.append(euclidean_distance((X_coord, Y_coord), (other_X_coord, other_Y_coord))[0])
    sorted_by_dist = [x for _,x in sorted(zip(distances, landmark_sensors))]
    ranked_landmarks[sensor] = (sorted_by_dist, np.asarray(sorted(distances)))

# for each value of K-NN
pbar = tqdm(total=len(all_sensors)-1, desc="KNNS done")
for knns in range(1, len(all_sensors)):
    # Needed to plot the methods in order on the boxplot: the 'x' makes the
    # two-digit values of K sort after the one-digit ones
    stradd = "x" if knns > 9 else ""

    # for each evaluation fold
    for sensor in eval_sensors:
        ground_truth = test_temps_by_sensor[sensor]

        # Getting the K closest sensors and their inverse-distance weights
        sorted_by_dist, sorted_distances = ranked_landmarks[sensor]
        knn_sensors = sorted_by_dist[:knns]
        knn_idws = 1 / sorted_distances[:knns]

        # Generating the predictor columns, one per neighbour
        predictors = np.full((test_data_ts, len(knn_sensors)), -1.)
        for col_idx, other_sensor in enumerate(knn_sensors):
            predictors[:, col_idx] = test_temps_by_sensor[other_sensor]

        # Performing the predictions: plain average of the neighbours
        classic_avg_preds = np.mean(predictors, axis=1)
        classic_avg_errors = np.abs(ground_truth - classic_avg_preds)
        classic_avg_median_error = np.percentile(classic_avg_errors, 50)
        classic_avg_95_error = np.percentile(classic_avg_errors, 95)
        results.loc[len(results)] = ['classical_average_KNN-' + stradd + str(knns), sensor, classic_avg_median_error, classic_avg_95_error]

        # Inverse-distance-weighted average of the neighbours
        idw_avg_preds = np.average(predictors, axis=1, weights=knn_idws)
        idw_avg_errors = np.abs(ground_truth - idw_avg_preds)
        idw_avg_median_error = np.percentile(idw_avg_errors, 50)
        idw_avg_95_error = np.percentile(idw_avg_errors, 95)
        results.loc[len(results)] = ['IDW_average_KNN-' + stradd + str(knns), sensor, idw_avg_median_error, idw_avg_95_error]

    pbar.update(1)
pbar.close()

results.sort_values(by=['method', 'fold'], inplace=True)

# Persist the results; the context manager closes the file even if dumping fails
with open('results.pickle', "wb") as results_file:
    pickle.dump(results, results_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def plot_results(results_path='results.pickle', skip_methods=()):
    """Draw one boxplot per error metric, with one box per method.

    Parameters
    ----------
    results_path : path of the pickled results dataframe (columns 'method',
        'median_error' and '95_perc_error' are read).
    skip_methods : iterable of substrings; a method whose name contains any of
        them is left out of the plots, e.g. ('average', 'particle').

    NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
    """
    with open(results_path, "rb") as results_file:
        results = pickle.load(results_file)

    sel_methods = [x for x in np.unique(results['method']) if not any(y in x for y in skip_methods)]
    # Boxes are drawn at positions 1..n
    tick_positions = np.arange(len(sel_methods)) + 1

    plots = (('median_error', "Median error per method"),
             ('95_perc_error', "95th percentile error per method"))
    for column, title in plots:
        plt.figure(figsize=(15, 5))
        res_list = [results[results['method'] == method][column].values for method in sel_methods]
        plt.boxplot(res_list)
        plt.title(title)
        plt.ylabel("Error (temperature)")
        plt.xlabel("Method")
        plt.xticks(tick_positions, sel_methods, rotation=90)
        plt.show()

plot_results()

In [None]:
### Particle filter implementation


# Euclidean distance between two points
def euclidean_distance_pf(p1,p2):
    """Scalar Euclidean distance between two points given as coordinate sequences."""
    squared_sum = 0
    for a, b in zip(p1, p2):
        squared_sum += (a - b) ** 2
    return math.sqrt(squared_sum)

# Window distance between two points
def window_distance_similarity_pf(p1,p2):
    """Absolute difference between the nearest-window distances of p1 and p2.

    Windows are read from the module-level `window_coords`.
    """
    nearest_window_1 = min(euclidean_distance_pf(p1, w) for w in window_coords)
    nearest_window_2 = min(euclidean_distance_pf(p2, w) for w in window_coords)
    return abs(nearest_window_1 - nearest_window_2)


def create_uniform_particles(t_range, N):
    """Draw N particles uniformly between t_range[0] and t_range[1]."""
    lower, upper = t_range[0], t_range[1]
    return np.random.uniform(lower, upper, N)

def get_nearest_landmark(pos, landmarks_pos, landmarks_temp):
    """Return the position and the temperature of the landmark closest to `pos`.

    `landmarks_pos` and `landmarks_temp` are indexed in parallel: entry k of
    both refers to the same landmark.
    """
    distances = [euclidean_distance(pos, candidate) for candidate in landmarks_pos]
    nearest = np.argmin(distances)
    return landmarks_pos[nearest], landmarks_temp[nearest]
    

# Prediction of the particle transition
def predict(particles, u, std):
    """Move the particles in place by `u` plus uniform noise in [-std, std).

    Parameters
    ----------
    particles : array of particle values, modified in place.
    u : shift applied to every particle (the callers pass a reference temperature delta).
    std : scale of the uniform noise (the callers pass a standard deviation).
    """
    particles += u
    noise = np.random.uniform(-1, 1, len(particles))
    particles += noise * std
    
def update_pos(particles, weights, zs, sd, landmarks, pos, landmarks_pos):
    """Recompute the particle weights in place from the landmark observations.

    The weights are reset to 1, multiplied - for every landmark - by the
    density at zs[i] of a normal distribution centred on the particle-to-landmark
    distance with standard deviation `sd`, and finally normalised to sum to 1.

    The particle-to-landmark distance mixes three terms: 0.5 * |particle -
    landmark temperature|, 0.3 * the Euclidean distance and 0.2 * the window
    similarity between the landmark position and `pos`.
    """
    weights.fill(1.)
    for i, landmark in enumerate(landmarks):
        landmark_pos = landmarks_pos[i]

        temperature_term = 0.5 * np.abs(particles - landmark)
        euclidean_term = 0.3 * euclidean_distance_pf(landmark_pos, pos)
        window_term = 0.2 * window_distance_similarity_pf(landmark_pos, pos)
        distance = temperature_term + euclidean_term + window_term

        weights *= norm.pdf(zs[i], loc=distance, scale=sd)

    # Small offset for numerical stability, so that the sum cannot be zero
    weights += 1.e-300
    weights /= np.sum(weights)
    
# def neff(weights):
#     return 1. / np.sum(np.square(weights))

def systematic_resample(weights):
    """Systematic resampling: pick N particle indexes according to `weights`.

    The interval [0, 1) is split into N equal sub-intervals and one position is
    taken in each of them, all at the same random offset. Every position selects
    the first particle whose cumulative weight exceeds it.

    The previous version added independent *normal* noise to each position
    (`np.random.randn(N)`), so positions could be negative, larger than 1 or out
    of order, and the scanning loop could stop before filling every index.

    Parameters
    ----------
    weights : 1-D array of normalised particle weights (summing to 1).

    Returns
    -------
    1-D integer array of length N with the selected particle indexes.
    """
    N = len(weights)
    # One shared uniform offset in [0, 1) for all the sub-intervals
    positions = (np.arange(N) + np.random.random()) / N

    cumulative_sum = np.cumsum(weights)
    # First index j with positions[i] < cumulative_sum[j]
    indexes = np.searchsorted(cumulative_sum, positions, side='right')
    # Rounding can leave the last cumulative value slightly below 1: clamp to the last particle
    return np.minimum(indexes, N - 1).astype('i')

def estimate(particles, weights):
    """Return the weighted mean of the particles."""
    weighted_mean = np.average(particles, axis=0, weights=weights)
    return weighted_mean

def resample_from_index(particles, weights, indexes):
    """Resample particles and weights in place according to `indexes`.

    Entry k of both arrays is replaced by the entry at indexes[k]; the weights
    are then re-normalised to sum to 1. Nothing is returned: the caller's arrays
    are modified.

    The previous version rebound the local names (`particles = particles[indexes]`),
    which left the caller's arrays untouched and made the resampling a no-op.
    """
    # Slice assignment writes into the caller's arrays instead of rebinding locals
    particles[:] = particles[indexes]
    weights[:] = weights[indexes]
    weights /= np.sum(weights)

def run_iteration_pos(particles, weights, ref_temp, previous_ref_temp, pos, landmarks, landmarks_pos, references_pos, sensor_std_err):
    """Run one predict / update / estimate / resample step and return the estimate.

    Parameters
    ----------
    particles, weights : particle values and their weights, updated by the step.
    ref_temp, previous_ref_temp : current and previous reference temperature.
    pos : position whose temperature is being estimated.
    landmarks : temperatures of the landmarks at this step.
    landmarks_pos : positions of the landmarks, parallel to `landmarks`.
    references_pos : position of the reference landmark.
    sensor_std_err : noise scale used for the prediction; half of it is the
        standard deviation used for the weight update.

    Returns
    -------
    The weighted mean of the particles, computed before resampling.
    """
    # NOTE(review): u is previous minus current reference temperature. If the particles
    # are meant to follow the change of the reference, the sign looks inverted - confirm.
    u = previous_ref_temp - ref_temp

    # Predict the particle values based on the reference change
    predict(particles, u, sensor_std_err)

    # Expected landmark variations with respect to the reference: same three-term
    # mix (temperature, Euclidean distance, window similarity) used for the particles
    temperature_term = 0.5 * np.abs(landmarks - ref_temp)
    euclidean_term = 0.3 * np.array([euclidean_distance_pf(landmark_pos, references_pos) for landmark_pos in landmarks_pos])
    window_term = 0.2 * np.array([window_distance_similarity_pf(landmark_pos, references_pos) for landmark_pos in landmarks_pos])
    zs = temperature_term + euclidean_term + window_term

    update_pos(particles, weights, zs=zs, sd=sensor_std_err*0.5, landmarks=landmarks, pos=pos, landmarks_pos=landmarks_pos)

    # Estimated temperature: weighted mean of the particles
    val = estimate(particles, weights)

    resample_from_index(particles, weights, systematic_resample(weights))

    return val

def predict_cell_temps(x, y, landmarks, landmarks_pos, t_min, t_max, n_particles=None, samples_per_day=6*60*24):
    """Estimate the temperature at position (x, y) for every row of `landmarks`.

    Parameters
    ----------
    x, y : coordinates of the cell to predict.
    landmarks : 2-D array of landmark temperatures, one row per time step and
        one column per landmark.
    landmarks_pos : positions of the landmarks, one per column of `landmarks`.
    t_min, t_max : range from which the particles are drawn uniformly.
    n_particles : number of particles; defaults to the module-level `N`.
    samples_per_day : number of rows after which the particles are re-drawn.
        The default, 6*60*24, presumably corresponds to one sample every 10
        seconds - confirm against the data.

    Returns
    -------
    1-D array with one estimated temperature per row of `landmarks`.
    """
    if n_particles is None:
        n_particles = N  # module-level particle count set by the calling cell

    cell_temps = np.full(len(landmarks), -1.0)
    if len(landmarks) == 0:
        return cell_temps

    # The initial "previous" reference temperature is read from the last row of the
    # first block of `samples_per_day` rows, clamped so that shorter series do not
    # raise an IndexError.
    first_block_last_row = min(samples_per_day, len(landmarks)) - 1
    _, previous_ref_temp = get_nearest_landmark((x,y), landmarks_pos, landmarks[first_block_last_row,:])

    for i in range(len(landmarks)):
        if i % samples_per_day == 0: # reset particles at the start of each block of rows
            particles = create_uniform_particles([t_min,t_max], n_particles)
            weights = np.full(n_particles, 1.)

        std = np.std(landmarks[i]) if len(landmarks[i]) > 1 else 0.001
        # Identical landmark readings give a zero std, which would turn the normal
        # density used for the weights into NaN: fall back to the same small value
        if not std > 0:
            std = 0.001

        reference_pos, reference_temp = get_nearest_landmark((x,y), landmarks_pos, landmarks[i,:])

        cell_temps[i] = run_iteration_pos(particles, weights, reference_temp, previous_ref_temp, (x,y),
                            landmarks[i,:], landmarks_pos, reference_pos, std)

        previous_ref_temp = reference_temp

    # The progress bar is a module-level object of the calling notebook and may not
    # exist in a worker process: update it only when it is available
    progress_bar = globals().get('pbar')
    if progress_bar is not None:
        progress_bar.update(1)

    return cell_temps

In [None]:
# Drop the rows written by a previous run of this cell, so that re-running it does not duplicate them
to_delete = [x for x in results['method'] if 'particle_filter' in x]
results = results[np.logical_not(results['method'].isin(to_delete))]
results.reset_index(drop=True, inplace=True)

ref_sensors_here = ['raspihat08','raspihat09','raspihat01','raspihat04']
eval_sensors = sorted(list(set(all_sensors) - set(ref_sensors_here)))

start_knns=4
N=400 # n. particles; read as a module-level value by predict_cell_temps
n_workers = 31 # number of worker processes

pbar = tqdm(total=(len(all_sensors)-start_knns)*len(eval_sensors), desc="KNNS done")
# One asynchronous call per (K, evaluation sensor) pair
calls = np.full((len(all_sensors)-start_knns, len(eval_sensors) ), None)

# The context manager releases the worker processes once all the results are collected
with Pool(processes=n_workers) as pool:
    # for each value of K-NN
    for knns in range(start_knns, len(all_sensors)):
        # for each evaluation fold
        for j,sensor in enumerate(eval_sensors):
            # Getting landmark sensors
            # NOTE(review): landmarks are taken among the evaluation sensors only, so every
            # K larger than len(eval_sensors) - 1 selects the same neighbours; the baseline
            # cell draws its neighbours from all the sensors instead - confirm this is intended.
            landmark_sensors = sorted(list(set(eval_sensors) - set([sensor])))
            # Getting info on the sensor to predict
            X_coord, Y_coord = node_map_coords[sensor]

            # Getting the K closest sensors (scalar distances, as in the baseline cell)
            distances = []
            for other_sensor in landmark_sensors:
                other_X_coord, other_Y_coord = node_map_coords[other_sensor]
                distances.append(euclidean_distance([X_coord, Y_coord], [other_X_coord, other_Y_coord])[0])
            sorted_by_dist = [x for _,x in sorted(zip(distances, landmark_sensors))]
            knn_sensors = sorted_by_dist[:knns]

            # Positions and test temperatures of the selected landmarks, one column per landmark
            landmarks_pos = []
            landmarks_temp = np.full((test_data_ts, len(knn_sensors)), -1.)
            for i, neighbour in enumerate(knn_sensors):
                landmarks_pos.append(list(node_map_coords[neighbour]))
                landmarks_temp[:, i] = all_test_data[all_test_data['node'] == neighbour]['temperature'].values

            landmarks_pos=np.array(landmarks_pos)

            # Particle range: temperature extremes of the selected landmarks in the training data
            knn_training_temps = all_training_data[all_training_data.node.isin(knn_sensors)]['temperature']
            t_min = knn_training_temps.min()
            t_max = knn_training_temps.max()

            calls[knns-start_knns,j] = pool.apply_async(predict_cell_temps, (X_coord, Y_coord, landmarks_temp.copy(), landmarks_pos.copy(), t_min, t_max))

    ## read the results of the async calls

    # for each value of K-NN
    for knns in range(start_knns, len(all_sensors)):
        # Needed to plot the methods in order on the boxplot: the 'x' makes the
        # two-digit values of K sort after the one-digit ones
        stradd = "x" if knns > 9 else ""

        for j,sensor in enumerate(eval_sensors):
            ground_truth = all_test_data[all_test_data['node'] == sensor]['temperature'].values
            predictors = calls[knns-start_knns, j].get()
            # Updates made inside the worker processes do not reach this progress bar,
            # so it is advanced here as each result is collected
            pbar.update(1)

            particle_errors = np.abs(ground_truth - predictors)
            particle_median_error = np.percentile(particle_errors, 50)
            particle_95_error = np.percentile(particle_errors, 95)
            results.loc[len(results)] = ['particle_filter_KNN-' + stradd + str(knns), sensor, particle_median_error, particle_95_error]

pbar.close()

results.sort_values(by=['method', 'fold'], inplace=True)

# Persist the results; the context manager closes the file even if dumping fails
with open('results.pickle', "wb") as results_file:
    pickle.dump(results, results_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Feature selection:
# - the first step is removing highly correlated columns
# - then, on the remainder, we run XGBoost
#
# Features built in this cell (one column each, in the order of `features`):
# - MOY sin and cos
# - DOW sin and cos
# - SECS (seconds from midnight) sin and cos
# - temperature of each of the 4 reference sensors
# - X and Y coords of the cell to predict
# - GP distance between the cell to predict and each of the 4 reference sensors
#
# Per-reference angular / euclidean / manhattan / chebyshev distance columns only
# existed as commented-out code and are not generated.

from sklearn.preprocessing import MinMaxScaler

ref_sensors_here = ['raspihat01','raspihat04','raspihat08','raspihat09']

features = ['moy_sin', 'moy_cos', 'dow_sin', 'dow_cos',
            'seconds_from_midnight_sin', 'seconds_from_midnight_cos',
            '01_ref_temp', '04_ref_temp', '08_ref_temp','09_ref_temp',
            'X_coord', 'Y_coord',
            '01_ref_gpdist', '04_ref_gpdist', '08_ref_gpdist','09_ref_gpdist']

# Generating the dataset

eval_sensors = sorted(list(set(all_sensors) - set(ref_sensors_here)))
dataframe_training = all_training_data[all_training_data['node'].isin(eval_sensors)]

# The matrix is pre-filled with the sentinel -111. so that the assertion below
# catches any column left unassigned. Width and column offsets are derived from
# `features`, so that the list and the matrix cannot get out of step.
X_data_training = np.full((len(dataframe_training), len(features)), -111.)

# Temporal information: these feature names are also the dataframe column names
temporal_features = ['moy_sin', 'moy_cos', 'dow_sin', 'dow_cos',
                     'seconds_from_midnight_sin', 'seconds_from_midnight_cos']
for name in temporal_features:
    X_data_training[:, features.index(name)] = dataframe_training[name]

# Reference temperatures: the series of each reference sensor is repeated once per evaluation sensor
ref_temp_start = features.index('01_ref_temp')
for i, ref_sensor in enumerate(ref_sensors_here):
    X_data_training[:, ref_temp_start + i] = array_rep(all_training_data[all_training_data['node'] == ref_sensor]['temperature'], len(eval_sensors))

# X and Y coords
X_data_training[:, features.index('X_coord')] = dataframe_training['coord_x']
X_data_training[:, features.index('Y_coord')] = dataframe_training['coord_y']

# Genetic (GP) distances between every row's cell and each reference sensor
gp_dist_start = features.index('01_ref_gpdist')
X_coord, Y_coord = dataframe_training['coord_x'].values, dataframe_training['coord_y'].values
sens_points = list(zip(X_coord,Y_coord))
for i, ref_sensor in enumerate(ref_sensors_here):
    X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
    ref_points = [(X_coord_ref, Y_coord_ref)]
    X_data_training[:, gp_dist_start + i] = gp_distance(ref_points, sens_points)


# No sentinel value may be left in the matrix
assert np.min(X_data_training) > -111.

# Normalizing predictors
scaler = MinMaxScaler()
X_data_training = scaler.fit_transform(X_data_training)

In [None]:
# Evaluating correlations
# Labels for the columns of X_data_training, listed in column order.
# NOTE(review): the *_ref_* labels assume the reference sensors were processed in
# the order 01, 04, 08, 09 when X_data_training was built -- confirm against that cell.
features = [
    'moy_sin', 'moy_cos', 'dow_sin', 'dow_cos',
    'seconds_from_midnight_sin', 'seconds_from_midnight_cos',
    '01_ref_temp', '04_ref_temp', '08_ref_temp', '09_ref_temp',
    'X_coord', 'Y_coord',
#     '01_ref_angulardist', '08_ref_angulardist','02_ref_angulardist', '04_ref_angulardist',
#     '01_ref_euclideandist', '08_ref_euclideandist','02_ref_euclideandist', '04_ref_euclideandist',
#     '01_ref_manhattandist', '08_ref_manhattandist','02_ref_manhattandist', '04_ref_manhattandist',
#     '01_ref_chebyshevdist', '08_ref_chebyshevdist','02_ref_chebyshevdist', '04_ref_chebyshevdist',
    '01_ref_gpdist', '04_ref_gpdist', '08_ref_gpdist', '09_ref_gpdist',
]

# Absolute correlation coefficient between every pair of predictor columns
corr_matrix = pd.DataFrame(
    np.corrcoef(X_data_training, rowvar=False),
    index=features,
    columns=features,
).abs()

# Flatten the matrix into (feat_1, feat_2, corr) rows, strongest correlation first
list_corrs = corr_matrix.unstack().sort_values(ascending=False).to_frame().reset_index()
list_corrs.columns = ['feat_1', 'feat_2', 'corr']

# Keep each unordered pair once and drop the self-correlations on the diagonal
is_ordered_pair = list_corrs['feat_1'] < list_corrs['feat_2']
list_corrs = list_corrs[is_ordered_pair]

print(list_corrs.head(20))


# Features to discard because of high correlation (currently none)
break_corrs = []

# Column positions of the discarded features, and the labels that remain
break_corrs_idx = [features.index(label) for label in break_corrs]
features_no_corr = [label for label in features if label not in break_corrs]

In [None]:
# Drop the columns listed in break_corrs_idx from the predictor matrix
kept_columns = [idx for idx in range(len(features)) if idx not in break_corrs_idx]
X_data_training_no_corr = X_data_training[:, kept_columns]

# Absolute correlation coefficient between the remaining predictor columns
corr_matrix = pd.DataFrame(
    np.corrcoef(X_data_training_no_corr, rowvar=False),
    index=features_no_corr,
    columns=features_no_corr,
).abs()

# Flatten into (feat_1, feat_2, corr) rows, strongest correlation first,
# keeping each unordered pair once and dropping the diagonal
list_corrs = corr_matrix.unstack().sort_values(ascending=False).to_frame().reset_index()
list_corrs.columns = ['feat_1', 'feat_2', 'corr']
is_ordered_pair = list_corrs['feat_1'] < list_corrs['feat_2']
list_corrs = list_corrs[is_ordered_pair]

print(list_corrs.head(20))



In [None]:
# Feature selection with SHAP and XGBoost, in full-training over 9 sensors (all - ref)
# I predict just the single temperature value
# https://medium.com/@lucasramos_34338/visualizing-variable-importance-using-shap-and-cross-validation-bd5075e9063a

from xgboost import XGBRegressor
import shap

# Calculate the SHAP values
shap_vals = []
shap_insts = []

# Model explained by SHAP: XGBoost fitted on the de-correlated, scaled predictors
reg = XGBRegressor(n_jobs=31, random_state=42).fit(X_data_training_no_corr, dataframe_training['temperature'].values) #n_estimators=100 (default)

# Explain a fixed-seed subsample; the same subsample is used as the explainer's background data
subsampled = shap.sample(X_data_training_no_corr, 1500, random_state=42)
explainer = shap.TreeExplainer(reg, subsampled)
shap_values = explainer.shap_values(subsampled)
shap_vals.extend(shap_values)
shap_insts.extend(subsampled)

# Persist the SHAP values and the explained instances so the plot cell can be run on its own.
# `with` guarantees the file handle is closed even if pickling raises.
with open('shap_vals.pickle', "wb") as pickle_out:
    pickle.dump(shap_vals, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

with open('shap_insts.pickle', "wb") as pickle_out:
    pickle.dump(shap_insts, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
import shap

# Reload the SHAP values and the instances they explain from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files written by this notebook.
# `with` guarantees the file handles are closed even if unpickling raises.
with open('shap_vals.pickle', "rb") as pickle_in:
    shap_vals = np.asarray(pickle.load(pickle_in))

with open('shap_insts.pickle', "rb") as pickle_in:
    shap_insts = np.asarray(pickle.load(pickle_in))

# show=False keeps the figure open so that it can be written to disk afterwards
shap.summary_plot(shap_vals, shap_insts, feature_names=features_no_corr, show=False)
plt.savefig("shap.pdf", format='pdf', dpi=600, bbox_inches='tight')


In [None]:
# Number of feature labels in features_no_corr (the feature list after removing break_corrs)
len(features_no_corr)

In [None]:
# Linear Regression (reference sensors and some other data)
# EVALUATION

# Remove every row whose method name contains 'linreg', so that re-running this
# cell does not accumulate duplicate entries, then renumber the rows from 0
to_delete = [method for method in results['method'] if 'linreg' in method]
is_stale = results['method'].isin(to_delete)
results = results[~is_stale].reset_index(drop=True)


# selected features
# ['moy_sin', 'moy_cos', 'dow_sin', 'dow_cos', 'seconds_from_midnight_sin', 'seconds_from_midnight_cos', '02_ref_temp', '10_ref_temp', '05_ref_temp', '02_ref_dist', '10_ref_dist', '05_ref_dist', '02_mutual_center', '10_mutual_center', '05_mutual_center', 'min_window_dist', 'X_coord', 'Y_coord'


# Based on the selected features, I now try LinearRegression
# The evaluation is performed by means of leave-one-out cross-validation
# I predict just the single temperature value

#['raspihat02', 'raspihat10', 'raspihat11'] 1.4782938271044614
#['raspihat02', 'raspihat10', 'raspihat05'] 1.3759392664427377

ref_sensors_here = ['raspihat08','raspihat09','raspihat01','raspihat04']

# Every sensor that is not a reference sensor is evaluated
eval_sensors = sorted(list(set(all_sensors) - set(ref_sensors_here)))

# Temporal features stored in columns 0-5, in this order
temporal_columns = ['moy_sin', 'moy_cos', 'dow_sin', 'dow_cos',
                    'seconds_from_midnight_sin', 'seconds_from_midnight_cos']

# Data that is common for each fold (training)
common_data_training = np.full((training_data_ts, 10), -111.)
# Temporal information, read from the rows of raspihat01.
# The row filter is evaluated once instead of once per column.
# NOTE(review): presumably all sensors share the same timestamps, so any sensor would do -- confirm.
reference_rows_training = all_training_data[all_training_data['node'] == 'raspihat01']
for col_idx, col_name in enumerate(temporal_columns):
    common_data_training[:, col_idx] = reference_rows_training[col_name]
# Reference temperatures (columns 6-9, one per reference sensor)
for i, ref_sensor in enumerate(ref_sensors_here):
    common_data_training[:, i+6] = all_training_data[all_training_data['node'] == ref_sensor]['temperature']


# Data that is common for each fold (test)
common_data_test = np.full((test_data_ts, 10), -111.)
# Temporal information, read from the rows of raspihat01
reference_rows_test = all_test_data[all_test_data['node'] == 'raspihat01']
for col_idx, col_name in enumerate(temporal_columns):
    common_data_test[:, col_idx] = reference_rows_test[col_name]
# Reference temperatures (columns 6-9, one per reference sensor)
for i, ref_sensor in enumerate(ref_sensors_here):
    common_data_test[:, i+6] = all_test_data[all_test_data['node'] == ref_sensor]['temperature']

# Training rows: all evaluation sensors together (a single model is fitted)
fold_sensors = sorted(eval_sensors)
fold_data_training = all_training_data[all_training_data['node'].isin(fold_sensors)]

# 16 predictors: 10 common columns, 2 coordinates, 4 distances; -111. marks unfilled cells
X_train_data_fold = np.full((training_data_ts*len(fold_sensors), 16), -111.)

# Copy the common data
# NOTE(review): assumes fold_data_training rows are laid out sensor after sensor, in the
# same order in which array_rep repeats the common block -- confirm.
X_train_data_fold[:, 0:10] = array_rep(common_data_training, len(fold_sensors))

# X, Y coord
X_train_data_fold[:, 10] = fold_data_training['coord_x']
X_train_data_fold[:, 11] = fold_data_training['coord_y']


# Weighted distance
# The sensor positions do not depend on the reference sensor: build the list once.
# NOTE(review): assumes gp_distance does not modify its arguments -- confirm.
X_coord, Y_coord = fold_data_training['coord_x'].values, fold_data_training['coord_y'].values
sens_points = list(zip(X_coord, Y_coord))
for i, ref_sensor in enumerate(ref_sensors_here[0:4]):
    X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
    ref_points = [(X_coord_ref, Y_coord_ref)]
    X_train_data_fold[:, 12+i] = gp_distance(ref_points, sens_points)


# Every cell must have been overwritten: no placeholder value may survive
assert np.min(X_train_data_fold) > -111., "X_train_data_fold still contains the -111. placeholder"

# Normalizing predictors (the fitted scaler is reused on the test data)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_data_fold = scaler.fit_transform(X_train_data_fold)

# Train the model
from sklearn.linear_model import LinearRegression
reg = LinearRegression(n_jobs=31).fit(X_train_data_fold, fold_data_training['temperature'].values)


# Evaluate the model on the test data of each evaluation sensor.
# The model `reg` is fitted once before this loop and is not refitted per sensor,
# so each "fold" here is a per-sensor evaluation rather than a cross-validation fold.
# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=len(eval_sensors), desc="FOLDS done") as pbar:
    for sensor in eval_sensors:
        fold_data_test = all_test_data[all_test_data['node'] == sensor]

        X_test_data_fold = np.full((test_data_ts, 16), -111.)

        # Copy the common data
        X_test_data_fold[:, 0:10] = common_data_test

        # X, Y coord
        X_test_data_fold[:, 10] = fold_data_test['coord_x']
        X_test_data_fold[:, 11] = fold_data_test['coord_y']

        # Distance from each reference sensor; the sensor positions are the same for
        # every reference sensor, so the point list is built once per evaluated sensor.
        # NOTE(review): assumes gp_distance does not modify its arguments -- confirm.
        X_coord, Y_coord = fold_data_test['coord_x'].values, fold_data_test['coord_y'].values
        sens_points = list(zip(X_coord, Y_coord))
        for i, ref_sensor in enumerate(ref_sensors_here[0:4]):
            X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
            ref_points = [(X_coord_ref, Y_coord_ref)]
            X_test_data_fold[:, 12+i] = gp_distance(ref_points, sens_points)

        # Every cell must have been overwritten: no placeholder value may survive
        assert np.min(X_test_data_fold) > -111., "X_test_data_fold still contains the -111. placeholder"

        # Normalizing predictors with the scaler fitted on the training data
        X_test_data_fold = scaler.transform(X_test_data_fold)

        # Predict the values, and evaluate the errors
        fold_preds = reg.predict(X_test_data_fold)

        errors = np.abs(fold_data_test['temperature'].values - fold_preds)

        # One result row per sensor: median and 95th percentile of the absolute error
        median_error = np.percentile(errors, 50)
        p95_error = np.percentile(errors, 95)
        results.loc[len(results)] = ['linreg', sensor, median_error, p95_error]

        pbar.update(1)

results.sort_values(by=['method', 'fold'], inplace=True)

# Persist the results table; `with` closes the file even if pickling raises
with open('results.pickle', "wb") as pickle_out:
    pickle.dump(results, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show the coefficients and intercept of `reg`. When the notebook is run in order this is
# the LinearRegression fitted in the previous cell, whose predictors were MinMax-scaled,
# so the coefficients refer to the scaled features.
print(reg.coef_, reg.intercept_)

In [None]:
plot_results()
# Median over the folds of the 95th-percentile error for each method, best method first.
# Only the numeric column is selected: the grouping key 'method' already becomes the index,
# and including that string column in the selection makes .median() raise on pandas >= 2.0.
display(results.groupby('method')[['95_perc_error']].median().sort_values(by=['95_perc_error']))

In [None]:
# XGBoost Tuning, performed on fixed training-test split, since cross-validation would be too costly

#### XGBoost: TUNING
# Tuning is performed on a single training/validation split, since leave-one-out cross-validation
# would be too costly

# Generating the dataset

ref_sensors_here = ['raspihat08','raspihat09','raspihat01','raspihat04']

# Rows of every sensor that is not a reference sensor
eval_sensors = sorted(set(all_sensors) - set(ref_sensors_here))
is_eval_row = all_training_data['node'].isin(eval_sensors)
dataframe_training = all_training_data[is_eval_row]

# Predictor matrix, pre-filled with the -111. placeholder checked by the assert below
X_data = np.full((len(dataframe_training), 11), -111.)

# Temporal information -> columns 0-3 (moy_cos and dow_cos are not used here)
for col_idx, col_name in enumerate(['moy_sin', 'dow_sin',
                                    'seconds_from_midnight_sin', 'seconds_from_midnight_cos']):
    X_data[:, col_idx] = dataframe_training[col_name]

# Reference temperatures -> columns 4-7
# NOTE(review): array_rep presumably repeats the series once per evaluation sensor -- confirm.
for i, ref_sensor in enumerate(ref_sensors_here):
    ref_temperature = all_training_data[all_training_data['node'] == ref_sensor]['temperature']
    X_data[:, i+4] = array_rep(ref_temperature, len(eval_sensors))

# Sensor coordinates -> columns 8-9
X_coord = dataframe_training['coord_x'].values
Y_coord = dataframe_training['coord_y'].values
X_data[:, 8] = X_coord
X_data[:, 9] = Y_coord

# gp_distance from the first reference sensor only -> column 10
sens_points = list(zip(X_coord, Y_coord))
for i, ref_sensor in enumerate(ref_sensors_here[0:1]):
    X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
    ref_points = [(X_coord_ref, Y_coord_ref)]
    X_data[:, 10+i] = gp_distance(ref_points, sens_points)

assert np.min(X_data) > -111.

# Regression target
y_data = dataframe_training['temperature'].values

# Splitting into training and eval data (week based splitting)
# Whole weeks are assigned at random (fixed seed) to either training or validation
np.random.seed(42)
weeks_list = dataframe_training.year_week.unique().tolist()
np.random.shuffle(weeks_list)
train_weeks = sorted(weeks_list[0:int(len(weeks_list)*perc_training)])

# Different weeks between training and validation...
# The row mask is computed once and reused for both splits of X and y
in_train_weeks = dataframe_training['year_week'].isin(train_weeks).values

X_data_training = X_data[in_train_weeks]
y_data_training = y_data[in_train_weeks]

X_data_validation = X_data[~in_train_weeks]
y_data_validation = y_data[~in_train_weeks]

print(X_data_training.shape, X_data_validation.shape)

# Normalizing predictors: the scaler is fitted on the training split only and then
# applied unchanged to the validation split
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_data_training = scaler.fit_transform(X_data_training)
X_data_validation = scaler.transform(X_data_validation)

# Tuning with hyperopt

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor 


def hyperopt_train_test(params):
    """Fit an XGBoost regressor with `params` and score it on the validation split.

    The model is trained on the notebook-level X_data_training / y_data_training
    and evaluated on X_data_validation / y_data_validation.

    Returns the 95th percentile of the absolute validation errors (lower is better).
    """
    regressor = XGBRegressor(n_jobs=31, random_state=42, **params)
    regressor.fit(X_data_training, y_data_training)
    abs_errors = np.abs(y_data_validation - regressor.predict(X_data_validation))
    return np.percentile(abs_errors, 95)


# Hyper-parameter search space explored by hyperopt; each entry is passed to
# XGBRegressor as a keyword argument by hyperopt_train_test.
#   hp.choice   -> one of the listed discrete options
#   hp.uniform  -> a float drawn uniformly between the two bounds
#   hp.quniform -> a uniform draw rounded to a multiple of the last argument (returned as a float)
search_space = {
    'max_depth': hp.choice('max_depth', range(1, 200, 5)),
    'learning_rate': hp.choice('learning_rate', [0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175]),
    'n_estimators': hp.choice('n_estimators', range(250, 700, 50)),
    'reg_alpha' : hp.uniform('reg_alpha', 0, 100),
    'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
    'gamma': hp.uniform ('gamma', 0, 9),
    'subsample' : hp.uniform('subsample', 0.6, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.6, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1)
}


from hyperopt import space_eval

# (loss, params) for every evaluation that improved on the best loss so far
tuning_history = []

# Lowest validation loss seen so far. Kept under its own name so that it is not
# confused with `best`, which receives the result of fmin below.
best_loss = float('inf')

def f(params):
    """Objective minimised by hyperopt.

    Evaluates `params` with hyperopt_train_test; when the loss improves on the
    best one seen so far, the improvement is printed, appended to
    tuning_history and the history is checkpointed to 'tuning_history.pickle'.

    Returns the dict expected by hyperopt: the loss and STATUS_OK.
    """
    global best_loss
    loss = hyperopt_train_test(params)
    if loss < best_loss:
        best_loss = loss
        print('new best:', best_loss, params)
        tuning_history.append((best_loss, params))

        # Checkpoint after every improvement, so an interrupted search keeps its history;
        # `with` closes the file even if pickling raises
        with open('tuning_history.pickle', "wb") as pickle_file:
            pickle.dump(tuning_history, pickle_file)

    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, search_space, algo=tpe.suggest, max_evals=40, trials=trials) # minimizes the score
print('best:', best)

# For hp.choice parameters fmin reports the index of the chosen option, not the option
# itself; space_eval maps the result back to the actual hyper-parameter values.
best_params = space_eval(search_space, best)
print('best (resolved values):', best_params)

In [None]:
#### XGBoost: EVALUATION

ref_sensors_here = ['raspihat08','raspihat09','raspihat01','raspihat04']

# Remove every row whose method name contains 'xgboost', so that re-running this
# cell does not accumulate duplicate entries, then renumber the rows from 0
to_delete = [method for method in results['method'] if 'xgboost' in method]
is_stale = results['method'].isin(to_delete)
results = results[~is_stale].reset_index(drop=True)


# Every sensor that is not a reference sensor is evaluated
eval_sensors = sorted(set(all_sensors) - set(ref_sensors_here))


# Temporal features stored in columns 0-3, in this order
# (moy_cos and dow_cos are not used by this model)
temporal_columns = ['moy_sin', 'dow_sin',
                    'seconds_from_midnight_sin', 'seconds_from_midnight_cos']

# Data that is common for each fold (training)
common_data_training = np.full((training_data_ts, 8), -111.)
# Temporal information, read from the rows of raspihat01.
# The row filter is evaluated once instead of once per column.
# NOTE(review): presumably all sensors share the same timestamps, so any sensor would do -- confirm.
reference_rows_training = all_training_data[all_training_data['node'] == 'raspihat01']
for col_idx, col_name in enumerate(temporal_columns):
    common_data_training[:, col_idx] = reference_rows_training[col_name]
# Reference temperatures (columns 4-7, one per reference sensor)
for i, ref_sensor in enumerate(ref_sensors_here):
    common_data_training[:, i+4] = all_training_data[all_training_data['node'] == ref_sensor]['temperature']


# Data that is common for each fold (test)
common_data_test = np.full((test_data_ts, 8), -111.)
# Temporal information, read from the rows of raspihat01
reference_rows_test = all_test_data[all_test_data['node'] == 'raspihat01']
for col_idx, col_name in enumerate(temporal_columns):
    common_data_test[:, col_idx] = reference_rows_test[col_name]
# Reference temperatures (columns 4-7, one per reference sensor)
for i, ref_sensor in enumerate(ref_sensors_here):
    common_data_test[:, i+4] = all_test_data[all_test_data['node'] == ref_sensor]['temperature']
    
# Training rows: all evaluation sensors together (a single model is fitted)
fold_sensors = sorted(eval_sensors)
is_fold_row = all_training_data['node'].isin(fold_sensors)
fold_data_training = all_training_data[is_fold_row]

# 11 predictors: 8 common columns, 2 coordinates, 1 distance
n_training_rows = training_data_ts*len(fold_sensors)
X_train_data_fold = np.full((n_training_rows, 11), -111.)

# Copy the common data
# NOTE(review): assumes fold_data_training rows are laid out sensor after sensor, in the
# same order in which array_rep repeats the common block -- confirm.
X_train_data_fold[:, 0:8] = array_rep(common_data_training, len(fold_sensors))

# X and Y coord ['raspihat01','raspihat04','raspihat08','raspihat09']
X_coord = fold_data_training['coord_x'].values
Y_coord = fold_data_training['coord_y'].values
X_train_data_fold[:, 8] = X_coord
X_train_data_fold[:, 9] = Y_coord


# Weighted distance (from the first reference sensor only)
sens_points = list(zip(X_coord, Y_coord))
for i, ref_sensor in enumerate(ref_sensors_here[0:1]):
    X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
    ref_points = [(X_coord_ref, Y_coord_ref)]
    X_train_data_fold[:, 10+i] = gp_distance(ref_points, sens_points)

assert np.min(X_train_data_fold) > -111.


# Normalizing predictors (the fitted scaler is reused on the test data)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_data_fold = scaler.fit_transform(X_train_data_fold)

# Train the model
# NOTE(review): the regressor uses library defaults; the hyper-parameters found by the
# tuning cell are not applied here -- confirm that this is intended.
from xgboost import XGBRegressor

y_train_fold = fold_data_training['temperature'].values
reg = XGBRegressor(n_jobs=31, random_state=42).fit(X_train_data_fold, y_train_fold)

print("Model trained.")

# Now I perform the evaluation on eval sensors
# The model `reg` is fitted once before this loop; here it only predicts.
# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=len(eval_sensors)) as pbar:
    for sensor in eval_sensors:

        fold_data_test = all_test_data[all_test_data['node'] == sensor]
        X_test_data_fold = np.full((test_data_ts, 11), -111.)

        # Copy the common data
        X_test_data_fold[:, 0:8] = common_data_test

        # X and Y coord
        X_test_data_fold[:, 8] = fold_data_test['coord_x']
        X_test_data_fold[:, 9] = fold_data_test['coord_y']

        # Weighted distance (from the first reference sensor only)
        X_coord, Y_coord = fold_data_test['coord_x'].values, fold_data_test['coord_y'].values
        sens_points = list(zip(X_coord, Y_coord))
        for i, ref_sensor in enumerate(ref_sensors_here[0:1]):
            X_coord_ref, Y_coord_ref = node_map_coords[ref_sensor]
            ref_points = [(X_coord_ref, Y_coord_ref)]
            X_test_data_fold[:, 10+i] = gp_distance(ref_points, sens_points)

        # Every cell must have been overwritten: no placeholder value may survive
        assert np.min(X_test_data_fold) > -111., "X_test_data_fold still contains the -111. placeholder"

        # Normalizing predictors with the scaler fitted on the training data
        X_test_data_fold = scaler.transform(X_test_data_fold)

        # Predict the values, and evaluate the errors
        fold_preds = reg.predict(X_test_data_fold)

        errors = np.abs(fold_data_test['temperature'].values - fold_preds)

        # One result row per sensor: median and 95th percentile of the absolute error
        median_error = np.percentile(errors, 50)
        p95_error = np.percentile(errors, 95)
        results.loc[len(results)] = ['xgboost', sensor, median_error, p95_error]
        pbar.update(1)

results.sort_values(by=['method', 'fold'], inplace=True)

# Persist the results table; `with` closes the file even if pickling raises
with open('results.pickle', "wb") as pickle_out:
    pickle.dump(results, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)