# First Start

In [30]:
from datetime import datetime
from math import cos, sin
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import tensorflow
from tensorflow import keras
from tensorflow.keras import layers

In [31]:
station_metadata = pd.read_csv('data/ground_measures_metadata.csv')
grid_geodata = gpd.read_file('data/grid_cells.geojson')
submission_format = pd.read_csv('data/submission_format.csv')
train_labels = pd.read_csv('data/train_labels.csv')

measures_test = pd.read_csv('data/ground_measures_test.csv')
measures_train = pd.read_csv('data/ground_measures_train.csv')

In [32]:
station_meta = {}
for oi, r in station_metadata.iterrows():
    station_meta[r[0]] = {
        'elev': r['elevation_m'],
        'lat': r['latitude'],
        'long': r['longitude']
    }

In [34]:
from lib.date_filler import date_filler

measures_train = date_filler(measures_train, dataset='train')
measures_test = date_filler(measures_test, dataset='test')

In [35]:
measures_train

Unnamed: 0.1,Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,...,2019-12-22,2019-12-23,2019-12-24,2019-12-25,2019-12-26,2019-12-27,2019-12-28,2019-12-29,2019-12-30,2019-12-31
0,CDEC:ADM,5.90,,,,,,,5.90,,...,,,3.70,,,,,,,3.40
1,CDEC:AGP,17.52,,,,,,,17.54,,...,,,,,,,,,,
2,CDEC:ALP,12.75,,,,,,,13.32,,...,,,12.67,,,,,,,12.57
3,CDEC:BCB,4.30,,,,,,,4.42,,...,,,,,,,,,,
4,CDEC:BCH,2.88,,,,,,,3.00,,...,,,5.04,,,,,,,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,SNOTEL:989_ID_SNTL,9.00,,,,,,,10.20,,...,,,2.80,,,,,,,3.00
696,SNOTEL:990_WA_SNTL,27.50,,,,,,,29.10,,...,,,8.70,,,,,,,8.60
697,SNOTEL:992_UT_SNTL,4.10,,,,,,,4.10,,...,,,3.60,,,,,,,3.80
698,SNOTEL:998_WA_SNTL,48.40,,,,,,,55.50,,...,,,23.70,,,,,,,25.00


In [36]:
def df_to_xy(dataframe: pd.DataFrame):
    x = []
    y = []
    for oi, j in dataframe.iterrows():
        lat = station_meta[j[0]]['lat']
        long = station_meta[j[0]]['long']
        elev = station_meta[j[0]]['elev']
        for k, e in j.items():
            if k == 'Unnamed: 0':
                continue
            dt = datetime.strptime(k, '%Y-%m-%d')
            date = dt.date()
            x.append(np.array([elev, lat, long, date.year - 2000, date.month, date.day]))
            y.append(e)

    return np.array(x), np.array(y)

In [37]:
x_train_pre, y_train_pre = df_to_xy(measures_train)
x_test_pre, y_test_pre = df_to_xy(measures_test)

In [38]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

In [39]:
def minmaxscaler(xs):
    scaler = MinMaxScaler()
    scaler.fit(xs)
    xs = scaler.transform(xs)
    return xs

In [48]:
def impute_data(x_train_pre_imp, y_train_pre_imp, x_test_pre_imp, y_test_pre_imp):

    # Get indices of nan-values in y_train_pre_imp and y_test_pre_imp
    l_train = np.argwhere(np.isnan(y_train_pre_imp))
    l_test = np.argwhere(np.isnan(y_test_pre_imp))
    ind_train = []
    ind_test = []

    for index in l_train:
        ind_train.append(index[0])
    for index in l_test:
        ind_test.append(index[0])

    # create concatenated x/y dataset
    x_1 = np.delete(x_train_pre_imp, ind_train, 0)
    x_2 = np.delete(x_test_pre_imp, ind_test, 0)
    y_1 = np.delete(y_train_pre_imp, ind_train, 0)
    y_2 = np.delete(y_test_pre_imp, ind_test, 0)
    x_train_concatenated = np.concatenate((x_1, x_2), axis=0)
    y_train_concatenated = np.concatenate((y_1, y_2), axis=0)

    # create dataset for x-values with missing y
    x_pred = []
    for x in ind_train:
        x_pred.append(x_train_pre_imp[x])
    for x in ind_test:
        x_pred.append(x_test_pre_imp[x])
    x_pred = np.asarray(x_pred)

    # scale the dataset
    x_train_concatenated = minmaxscaler(x_train_concatenated)
    x_pred = minmaxscaler(x_pred)

    # Create validation / train sets for model
    x_train, x_val, y_train, y_val = train_test_split(x_train_concatenated, y_train_concatenated, test_size=0.05, random_state=42)

    # create kNN-Regression model for predictions
    knn_reg = KNeighborsRegressor(n_neighbors=5, algorithm='auto')
    knn_reg.fit(x_train_concatenated, y_train_concatenated)



    # Predict missing values
    y_pred_real = knn_reg.predict(x_pred)

    # put the predictions back into datasets from the beginning
    # indices from missing values in y_train and y_test where stored in ind_train, ind_test
    # y_pred_real has values for these indices in ascending order

    # split predictions
    values_for_y_train = y_pred_real[:len(ind_train)]
    values_for_y_test = y_pred_real[len(ind_train):]

    y_train = np.copy(y_train_pre_imp)
    y_test = np.copy(y_test_pre_imp)

    for i in range(len(ind_train)):
        y_train[ind_train[i]] = values_for_y_train[i]

    for i in range(len(ind_test)):
        y_test[ind_test[i]] = values_for_y_test[i]

    return x_train_pre_imp, y_train, x_test_pre_imp, y_test

In [49]:
x_train, y_train, x_test, y_test = impute_data(x_train_pre, y_train_pre, x_test_pre, y_test_pre)
x = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [50]:
x = minmaxscaler(x)

### First Model for Station Data Forcasting

In [54]:
model = keras.Sequential([
    layers.Input(shape=(6,)),
    layers.Dense(10, activation='tanh'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='linear')
])
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='mean_squared_error',
              optimizer=opt)

In [55]:
model.fit(x, y, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9e966bed60>

In [56]:
y_pred = model.predict(x_test)
print('RMSE: ')
print(mean_squared_error(y_test, y_pred, squared=False))

print('\nR2:')
print(r2_score(y_test, y_pred))

RMSE: 
48.48484476752558

R2:
-37.09320688610651


In [134]:
geodata = gpd.read_file('data/grid_cells.geojson')
ground_measures = pd.read_csv("data/ground_measures_metadata.csv")
submission = pd.read_csv("data/submission_format.csv")

In [135]:
submission["location"] = np.nan
submission["elev"] = np.nan
ids_geo = geodata["cell_id"]
ids_sub = submission["cell_id"]
ground_measures["coord"] = np.empty((len(ground_measures), 0)).tolist()

In [136]:
submission["location"] = geodata[ids_geo.isin(ids_sub)].reset_index()["geometry"]

def get_middle(pg):
    xy = pg.exterior.coords.xy
    x, y = xy[0], xy[1]
    x1, x2 = x[0], x[1]
    y1, y2 = y[0], y[1]
    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2
    assert x1 <= mid_x <= x2, "Something is off: x"
    assert y1 <= mid_y <= y2, "Something is off: y"
    return mid_x, mid_y

submission["location"] = submission["location"].apply(get_middle)

In [137]:
for i, row in ground_measures.iterrows():
    ground_measures.at[i, "coord"] = (row[4], row[3])

In [138]:
def find_closest_station(location, station_locations):
    _min = np.inf
    _min_idx = None
    location = np.array(location)
    for i in range(len(station_locations)):
        curr_loc = np.array(station_locations[i])
        distance = np.sum((location - curr_loc)**2)
        if distance < _min:
            _min = distance
            _min_idx = i
    return _min_idx, _min

In [139]:
for i, row in submission.iterrows():
    location = row[-2]
    station_locations = ground_measures["coord"]
    idx, _ = find_closest_station(location, station_locations)
    closest_elev = ground_measures.at[idx, "elevation_m"]
    submission.at[i, "elev"] = closest_elev

In [140]:
x_test = []
dates = list(submission.columns)[1:-2]
for i, row in submission.iterrows():

    elev = row[-1]
    location = row[-2]
    batched = []
    for d in dates:
        date = datetime.strptime(d, '%Y-%m-%d')
        feature = np.array([elev, location[1], location[0], date.year, date.month, date.day]).reshape(1,-1)
        scaled_feature = sc.min_max_scaling(feature)
        batched.append(scaled_feature)


In [141]:
prediction = model.predict(np.array(batched).reshape(-1,6))
for pred, d in zip(prediction, dates):
    submission.at[i, d] = pred

In [143]:
submission.drop(["elev", "location"], axis=1).to_csv('second_submission.csv', index=False)