In [21]:
import numpy as np
import pandas as pd
import torch
import utm
import dgl
from sklearn.preprocessing import StandardScaler
import pickle

In [57]:
class WeatherDataset:
    """
    Weather-station dataset packaged as a DGL graph.

    Every distinct ``station_id`` of the input CSV becomes one graph node. Each node is linked to its
    ``n_nearest`` closest stations and carries a fixed-length block of scaled feature rows
    (``graph.ndata['x']``) together with the matching scaled observations (``graph.ndata['y']``).
    """

    def __init__(self, name):
        """
        :param name: identifier of the dataset; ``save`` writes the object to ``'<name>.pkl'``
        """
        self.graph = None
        self.features_columns = ['lat', 'long', 'year', 'month', 'day', 'hour', 'forecast', 'gridpp']
        self.label_column = ['observation']
        self.stations_dict = None
        self.edges = None
        self.distances = None
        self.stations = None
        self.n_nearest = None
        # Number of rows stored per node; overwritten by ``create``.
        self.max_rows = 1400
        self.dataframe = None
        # Fitted by ``__scale_data``; declared here so the attributes always exist.
        self.scaler_x = None
        self.scaler_y = None
        self.name = name

    def create(self, path=None, n_nearest: int = 5, max_rows: int = 1400):
        """
        Load the CSV and build every derived structure (stations, distances, edges, graph).

        :param path: path of the CSV file; it must provide ``station_id``, the feature columns and the label column
        :param n_nearest: number of nearest stations every station is linked to
        :param max_rows: number of rows stored per node; shorter stations are padded by cycling through
                         their rows, longer ones are truncated
        :raises ValueError: if ``path`` is missing or ``max_rows`` is not positive
        """
        if path is None:
            raise ValueError('path to the CSV file is required')
        if max_rows < 1:
            raise ValueError('max_rows must be a positive integer')

        self.dataframe = pd.read_csv(path)
        self.n_nearest = n_nearest
        self.max_rows = max_rows
        # One row per station, in order of first appearance; that order defines the node ids.
        self.stations = self.dataframe.drop_duplicates(subset=['station_id']).copy()
        self.__calculate_utm()
        self.distances = self.calculate_stations_distances(np.array(self.stations.utm_x), np.array(self.stations.utm_y))
        self.edges = self.__calculate_graph_structure_dataframe()
        self.stations_dict = self.__create_stations_dict()

        self.graph = self.__create_graph()

    def to_utm(self, x, force_zone_number=None):
        """
        Transform lat/long coordinates in UTM
        :param x: [lat, long]
        :param force_zone_number: optional UTM zone number to project into instead of the zone the
                                  point naturally falls in
        :return: utm tuple as returned by ``utm.from_latlon``
        """
        return utm.from_latlon(x[0], x[1], force_zone_number=force_zone_number)

    def __calculate_utm(self):
        """
        Add the ``utm``, ``utm_x`` and ``utm_y`` columns to ``self.stations``.

        All stations are projected into ONE zone (the zone of the median station position). Eastings
        coming from different UTM zones are not comparable, so projecting each station in its own zone
        would corrupt the Euclidean distances computed afterwards. Forcing a zone is an approximation
        that degrades for stations far from that zone.
        """
        lats = self.stations['lat'].to_numpy()
        longs = self.stations['long'].to_numpy()

        zone_number = None
        if len(lats):
            zone_number = utm.from_latlon(float(np.median(lats)), float(np.median(longs)))[2]

        # Plain tuples are passed so that to_utm indexes positionally, not by Series label.
        coordinates = [self.to_utm((lat, long), force_zone_number=zone_number) for lat, long in zip(lats, longs)]

        self.stations['utm'] = pd.Series(coordinates, index=self.stations.index, dtype=object)
        self.stations['utm_x'] = [coordinate[0] for coordinate in coordinates]
        self.stations['utm_y'] = [coordinate[1] for coordinate in coordinates]

    def calculate_stations_distances(self, x, y):
        """
        Get arrays of coordinates and calculate the distance between each other
        :param x: np.array
        :param y: np.array
        :return: array of shape (n, n - 1). Row ``j`` holds the distances from station ``j`` to every
                 OTHER station, stored by offset: column ``c`` is the distance to station
                 ``(j + c + 1) % n``. The column index is therefore NOT a station index.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        n_stations = x.shape[0]
        if n_stations == 0:
            return np.empty((0, 0))

        distances = np.empty((n_stations, n_stations - 1))
        for offset in range(1, n_stations):
            # np.roll(a, -offset)[j] == a[(j + offset) % n]
            diff_x = x - np.roll(x, -offset)
            diff_y = y - np.roll(y, -offset)
            distances[:, offset - 1] = np.sqrt(diff_x ** 2 + diff_y ** 2)

        return distances

    def __calculate_graph_structure_dataframe(self):
        """
        Get a distance array and the n_nearest param and return a DataFrame with the graph structure with edges
        linking the n_nearest stations from each station

        :return: DataFrame with columns ``src``, ``dst`` (node ids) and ``weight`` (distance / 1000)
        """
        distances = self.distances
        n_stations = distances.shape[0]

        # Column positions (offsets) of the closest stations of every row.
        nearest_offsets = np.argsort(distances, axis=1)[:, :self.n_nearest]

        src = np.repeat(np.arange(n_stations), nearest_offsets.shape[1])
        offsets = nearest_offsets.ravel()
        # Translate the offset layout of the distance matrix back into a station index.
        dst = (src + offsets + 1) % n_stations
        weight = distances[src, offsets] / 1000

        return pd.DataFrame({'src': src, 'dst': dst, 'weight': weight})

    def __create_stations_dict(self):
        """
        Map every station id to its node id and add that id as the ``node`` column of ``self.dataframe``.

        :return: dict ``station_id -> node id`` (position of the station in ``self.stations``)
        """
        station_ids = self.stations['station_id'].reset_index(drop=True)
        node_by_station = {station_id: node for node, station_id in station_ids.items()}
        self.dataframe['node'] = self.dataframe['station_id'].map(node_by_station)

        return node_by_station

    def create_graph_structure(self, edges_data):
        """
        Build the DGL graph from the ``src`` and ``dst`` columns of ``edges_data``.

        :param edges_data: DataFrame with ``src`` and ``dst`` columns
        :return: dgl graph
        """
        # NOTE(review): the 'weight' column is computed but never attached to the graph - confirm intent.
        src = edges_data['src'].to_numpy()
        dst = edges_data['dst'].to_numpy()

        g = dgl.graph((src, dst))
        return g

    def __scale_data(self, features, labels):
        """
        Fit ``self.scaler_x`` / ``self.scaler_y`` and return the standardized features and labels.

        :param features: array of shape (nodes, rows, n_features); flattened to (nodes * rows, n_features) for scaling
        :param labels: array of shape (nodes, rows)
        :return: (scaled features, scaled labels) with the input shapes
        """
        self.scaler_x = StandardScaler()
        self.scaler_y = StandardScaler()

        flat_features = features.reshape(-1, features.shape[-1])
        self.scaler_x.fit(flat_features)
        # NOTE(review): labels are 2-D, so every row position is standardized as a separate column.
        # Presumably a single mean/std over labels.reshape(-1, 1) was intended - confirm before changing,
        # since callers may already rely on scaler_y.
        self.scaler_y.fit(labels)

        return self.scaler_x.transform(flat_features).reshape(features.shape), self.scaler_y.transform(labels)

    def __calculate_features_and_labels(self):
        """
        Build fixed-size feature and label blocks for every node.

        Each node gets exactly ``self.max_rows`` rows: its own rows in file order, cycled from the start
        when there are fewer than ``max_rows`` and truncated when there are more.

        :return: (features of shape (nodes, max_rows, n_features), labels of shape (nodes, max_rows))
        """
        n_nodes = len(self.stations)
        max_rows = self.max_rows

        features = np.empty((n_nodes, max_rows, len(self.features_columns)))
        labels = np.empty((n_nodes, max_rows))

        # A single groupby pass replaces two boolean filters of the full frame per node.
        for node, node_rows in self.dataframe.groupby('node', sort=False):
            node_feature = node_rows[self.features_columns].to_numpy()
            node_label = node_rows[self.label_column].to_numpy().reshape(-1)

            # 0, 1, ..., k-1, 0, 1, ... : pads by cycling and truncates when k > max_rows.
            row_cycle = np.arange(max_rows) % node_feature.shape[0]

            features[node] = node_feature[row_cycle]
            labels[node] = node_label[row_cycle]

        return features, labels

    def __create_graph(self):
        """
        Assemble the graph: edges from ``self.edges``, scaled features in ``ndata['x']`` and scaled
        labels in ``ndata['y']``.
        """
        features, labels = self.__calculate_features_and_labels()
        features, labels = self.__scale_data(features, labels)
        graph = self.create_graph_structure(self.edges)
        graph.ndata['x'] = torch.from_numpy(features)
        graph.ndata['y'] = torch.from_numpy(labels)

        return graph

    def save(self):
        """
        Pickle the whole object to ``'<name>.pkl'`` in the current working directory.
        """
        # The context manager closes the handle even if pickling fails.
        with open(f'{self.name}.pkl', 'wb') as pickle_file:
            pickle.dump(self, pickle_file)


def read_weather_dataset(path):
    """
    Load an object previously written with ``pickle`` (e.g. by ``WeatherDataset.save``).

    :param path: path of the pickle file
    :return: the unpickled object
    """
    # SECURITY: unpickling executes code embedded in the file - only load files you created yourself.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [54]:
# Create an empty dataset container; the name 'test1' is what save() uses to build the file name 'test1.pkl'.
z = WeatherDataset('test1')

In [55]:
# Read the preprocessed CSV and build stations, distances, edges and the graph (default n_nearest=5).
z.create('../data/data_initial_preprocessing.csv')

In [56]:
z.dataframe

Unnamed: 0,station_id,lat,long,year,month,day,hour,forecast,gridpp,observation,observation_quality,gridpp_error,node
0,SN18700,59.9423,10.7200,2019,5,13,12,12.47900,11.25000,11.1,0,0.15000,0
1,SN80740,66.9035,13.6460,2019,5,13,12,4.09424,6.15000,6.0,0,0.15000,1
2,SN61630,62.2583,8.2000,2019,5,13,12,1.81200,4.45000,4.3,0,0.15000,2
3,SN8140,61.4255,11.0803,2019,5,13,12,8.86768,9.65000,9.5,0,0.15000,3
4,SN23500,61.1220,9.0630,2019,5,13,12,7.61084,9.65860,8.6,0,1.05860,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
843910,SN61580,62.2943,8.1255,2019,5,13,18,2.25586,5.17270,5.8,0,0.62730,628
843911,SN24670,60.2684,9.6919,2019,5,13,18,9.25342,11.92026,11.0,0,0.92026,629
843912,SN90490,69.6767,18.9133,2019,5,13,18,3.75146,3.25000,3.1,0,0.15000,630
843913,SN77490,65.3143,13.4483,2019,5,13,18,-0.56055,-2.15744,-2.6,0,0.44256,631


In [47]:
# Pickle the whole dataset object to '<name>.pkl' in the working directory, i.e. 'test1.pkl' here.
z.save()

In [60]:
# Round-trip check: reload the object that z.save() pickled to 'test1.pkl'.
zzz = read_weather_dataset('test1.pkl')

In [61]:
zzz.dataframe

Unnamed: 0,station_id,lat,long,year,month,day,hour,forecast,gridpp,observation,observation_quality,gridpp_error,node
0,SN18700,59.9423,10.7200,2019,5,13,12,12.47900,11.25000,11.1,0,0.15000,0
1,SN80740,66.9035,13.6460,2019,5,13,12,4.09424,6.15000,6.0,0,0.15000,1
2,SN61630,62.2583,8.2000,2019,5,13,12,1.81200,4.45000,4.3,0,0.15000,2
3,SN8140,61.4255,11.0803,2019,5,13,12,8.86768,9.65000,9.5,0,0.15000,3
4,SN23500,61.1220,9.0630,2019,5,13,12,7.61084,9.65860,8.6,0,1.05860,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
843910,SN61580,62.2943,8.1255,2019,5,13,18,2.25586,5.17270,5.8,0,0.62730,628
843911,SN24670,60.2684,9.6919,2019,5,13,18,9.25342,11.92026,11.0,0,0.92026,629
843912,SN90490,69.6767,18.9133,2019,5,13,18,3.75146,3.25000,3.1,0,0.15000,630
843913,SN77490,65.3143,13.4483,2019,5,13,18,-0.56055,-2.15744,-2.6,0,0.44256,631


In [63]:
zzz.name

'test1'

In [25]:
# WeatherDataset.__init__ only stores a name; the CSV is loaded and the graph built by create().
# Passing the CSV path to the constructor leaves teste.dataframe and teste.graph as None on a fresh run.
teste = WeatherDataset('teste')
teste.create('../data/data_initial_preprocessing.csv')

In [26]:
teste.dataframe

Unnamed: 0,station_id,lat,long,year,month,day,hour,forecast,gridpp,observation,observation_quality,gridpp_error,node
0,SN18700,59.9423,10.7200,2019,5,13,12,12.47900,11.25000,11.1,0,0.15000,0
1,SN80740,66.9035,13.6460,2019,5,13,12,4.09424,6.15000,6.0,0,0.15000,1
2,SN61630,62.2583,8.2000,2019,5,13,12,1.81200,4.45000,4.3,0,0.15000,2
3,SN8140,61.4255,11.0803,2019,5,13,12,8.86768,9.65000,9.5,0,0.15000,3
4,SN23500,61.1220,9.0630,2019,5,13,12,7.61084,9.65860,8.6,0,1.05860,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
843910,SN61580,62.2943,8.1255,2019,5,13,18,2.25586,5.17270,5.8,0,0.62730,628
843911,SN24670,60.2684,9.6919,2019,5,13,18,9.25342,11.92026,11.0,0,0.92026,629
843912,SN90490,69.6767,18.9133,2019,5,13,18,3.75146,3.25000,3.1,0,0.15000,630
843913,SN77490,65.3143,13.4483,2019,5,13,18,-0.56055,-2.15744,-2.6,0,0.44256,631


In [27]:
teste.graph.ndata['y']

tensor([[ 0.9972,  1.2596,  0.1644,  ..., -0.4117, -0.3488, -0.0306],
        [-0.2573,  0.2713,  0.7956,  ..., -0.7745, -0.6684, -0.1543],
        [-0.6754, -0.7470, -0.7251,  ..., -0.3209, -0.8663, -2.0702],
        ...,
        [-3.7009, -3.3526, -2.3320,  ..., -2.6039, -2.6772, -2.5337],
        [-3.2089, -3.5922, -3.6806,  ..., -1.4246, -1.5967, -2.0084],
        [-2.2988, -1.7653, -1.2416,  ..., -1.5758, -1.5815, -1.6066]],
       dtype=torch.float64)

In [28]:
teste.graph

Graph(num_nodes=667, num_edges=3335,
      ndata_schemes={'x': Scheme(shape=(1400, 8), dtype=torch.float64), 'y': Scheme(shape=(1400,), dtype=torch.float64)}
      edata_schemes={})

In [29]:
g = teste.graph

In [30]:
g.ndata['x'][0]

tensor([[-0.6825,  0.0338,  0.0000,  ...,  0.4375,  0.9626,  0.7316],
        [-0.6825,  0.0338,  0.0000,  ..., -0.4589,  0.1257,  0.3143],
        [-0.6825,  0.0338,  0.0000,  ..., -1.3554,  0.0291, -0.3434],
        ...,
        [-0.6825,  0.0338,  0.0000,  ..., -0.4589,  0.0096,  0.0108],
        [-0.6825,  0.0338,  0.0000,  ..., -1.3554, -0.0342,  0.0613],
        [-0.6825,  0.0338,  0.0000,  ...,  1.3340,  0.2528,  0.3269]],
       dtype=torch.float64)

In [31]:
teste

<__main__.WeatherDataset at 0x7f5b5c6908b0>

In [35]:
# Use a context manager so the file is flushed and closed before it is read back.
with open('asd.pkl', 'wb') as pickle_file:
    pickle.dump(teste, pickle_file)

In [37]:
# Only unpickle files produced by this notebook; the context manager closes the handle afterwards.
with open('asd.pkl', 'rb') as pickle_file:
    az = pickle.load(pickle_file)

In [38]:
az

<__main__.WeatherDataset at 0x7f5b5e184ee0>

In [39]:
az.dataframe

Unnamed: 0,station_id,lat,long,year,month,day,hour,forecast,gridpp,observation,observation_quality,gridpp_error,node
0,SN18700,59.9423,10.7200,2019,5,13,12,12.47900,11.25000,11.1,0,0.15000,0
1,SN80740,66.9035,13.6460,2019,5,13,12,4.09424,6.15000,6.0,0,0.15000,1
2,SN61630,62.2583,8.2000,2019,5,13,12,1.81200,4.45000,4.3,0,0.15000,2
3,SN8140,61.4255,11.0803,2019,5,13,12,8.86768,9.65000,9.5,0,0.15000,3
4,SN23500,61.1220,9.0630,2019,5,13,12,7.61084,9.65860,8.6,0,1.05860,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
843910,SN61580,62.2943,8.1255,2019,5,13,18,2.25586,5.17270,5.8,0,0.62730,628
843911,SN24670,60.2684,9.6919,2019,5,13,18,9.25342,11.92026,11.0,0,0.92026,629
843912,SN90490,69.6767,18.9133,2019,5,13,18,3.75146,3.25000,3.1,0,0.15000,630
843913,SN77490,65.3143,13.4483,2019,5,13,18,-0.56055,-2.15744,-2.6,0,0.44256,631


In [40]:
az.graph

Graph(num_nodes=667, num_edges=3335,
      ndata_schemes={'x': Scheme(shape=(1400, 8), dtype=torch.float64), 'y': Scheme(shape=(1400,), dtype=torch.float64)}
      edata_schemes={})

In [41]:
az.graph.ndata['x']

tensor([[[-0.6825,  0.0338,  0.0000,  ...,  0.4375,  0.9626,  0.7316],
         [-0.6825,  0.0338,  0.0000,  ..., -0.4589,  0.1257,  0.3143],
         [-0.6825,  0.0338,  0.0000,  ..., -1.3554,  0.0291, -0.3434],
         ...,
         [-0.6825,  0.0338,  0.0000,  ..., -0.4589,  0.0096,  0.0108],
         [-0.6825,  0.0338,  0.0000,  ..., -1.3554, -0.0342,  0.0613],
         [-0.6825,  0.0338,  0.0000,  ...,  1.3340,  0.2528,  0.3269]],

        [[ 1.2653,  0.5924,  0.0000,  ...,  0.4375, -0.1192,  0.0866],
         [ 1.2653,  0.5924,  0.0000,  ..., -0.4589, -0.1317, -0.1031],
         [ 1.2653,  0.5924,  0.0000,  ..., -1.3554, -0.1765, -0.0651],
         ...,
         [ 1.2653,  0.5924,  0.0000,  ..., -0.4589, -0.1996, -0.2928],
         [ 1.2653,  0.5924,  0.0000,  ..., -1.3554, -0.1468, -0.2043],
         [ 1.2653,  0.5924,  0.0000,  ...,  1.3340,  0.1298,  0.2258]],

        [[-0.0345, -0.4473,  0.0000,  ...,  0.4375, -0.4137, -0.1284],
         [-0.0345, -0.4473,  0.0000,  ..., -0