In [36]:
from datetime import datetime
from math import cos, sin
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import tensorflow
from tensorflow import keras
from tensorflow.keras import layers
import torch
import torch.nn as nn
from torch.autograd import Variable

## Initiating Preprocessor

In [2]:
from lib.preprocessor import PreProcessor

pp = PreProcessor(data_path='data/')

Loading Data Files...
Calculating and Adding Cell Polygon Center...
Build Station Meta Dict...
Done with initial Loading.


In [3]:
pp.station_knn_impute()

Imputing Station Test Data with KNN...
NaNs in station_test: 5085
New NaN Count in station_test: 0


In [92]:
x_train, y_train, x_test, y_test = pp.get_station_x_y(scale=True)

In [20]:
def sliding_windows(X, Y, seq_length):
    x = []
    y = []

    for i in range(len(X)-seq_length-1):
        _x = X[i:(i+seq_length), :]
        _y = Y[i+seq_length]
        x.append(_x)
        y.append(_y)

    return np.array(x), np.array(y)

In [27]:
print(x.shape)
print(y.shape)

(149094, 5, 6)
(149094,)


In [85]:
from torch.utils.data import DataLoader, TensorDataset

x_train, y_train = sliding_windows(x_train, y_train, 5)
x_test, y_test = sliding_windows(x_test, y_test, 5)

x_train = torch.Tensor(np.array(x_train))
y_train = torch.Tensor(np.array(y_train))

x_test = torch.Tensor(np.array(x_test))
y_test = torch.Tensor(np.array(y_test))

train_data = TensorDataset(x_train, y_train)
train_data = DataLoader(train_data, batch_size=10)

test_data = TensorDataset(x_test, y_test)
test_data = DataLoader(test_data, batch_size=10)

In [57]:
class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = 5
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size)).to("cuda")
        
        c_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size)).to("cuda")
        
        # Propagate input through LSTM
        ula, (h_out, _) = self.lstm(x, (h_0, c_0))
        
        h_out = h_out.view(-1, self.hidden_size)
        
        out = self.fc(h_out)
        
        return out

In [None]:
x = x.to("cuda")
y = y.to("cuda")
lstm = LSTM(1, 6, 100, 1).to("cuda")
loss_fn = torch.nn.MSELoss().to("cuda")
optim = torch.optim.Adam(lstm.parameters(), 0.1)
epochs = 2000
for epoch in range(epochs):
    for xs, ys in train_data:
        xs = xs.to("cuda")
        ys = ys.to("cuda")
        preds = lstm(xs).squeeze(1)
        optim.zero_grad()
        loss = loss_fn(preds, ys)
        loss.backward()
        optim.step()
    if epoch % 1 == 0:
        print(f"Train Loss at epoch: {epoch} = {loss.item()}")
        with torch.no_grad():
            test_losses = []
            for xstest, ystest in test_data:
                xstest = xstest.to("cuda")
                ystest = ystest.to("cuda")
                preds_test = lstm(xstest).squeeze(1)
                loss_test = loss_fn(preds_test, ystest)
                test_losses.append(loss_test.item())
            test_losses = np.array(test_losses)
            print(f"Test Loss at epoch: {epoch} = {np.mean(test_losses[~np.isnan(test_losses)])}")

Train Loss at epoch: 0 = 264.6009521484375
Test Loss at epoch: 0 = 191.1125036301818
Train Loss at epoch: 1 = 95.82516479492188
Test Loss at epoch: 1 = 214.64412251708654
Train Loss at epoch: 2 = 62.277610778808594
Test Loss at epoch: 2 = 171.79561360542817
Train Loss at epoch: 3 = 69.77548217773438
Test Loss at epoch: 3 = 164.6762024110653
Train Loss at epoch: 4 = 72.7020263671875
Test Loss at epoch: 4 = 178.02245371243882
Train Loss at epoch: 5 = 74.71651458740234
Test Loss at epoch: 5 = 182.2501375125349
Train Loss at epoch: 6 = 76.45557403564453
Test Loss at epoch: 6 = 184.82338377364235
Train Loss at epoch: 7 = 66.67644500732422
Test Loss at epoch: 7 = 163.94703107354596
Train Loss at epoch: 8 = 76.87396240234375
Test Loss at epoch: 8 = 208.65408441210982
Train Loss at epoch: 9 = 75.22152709960938
Test Loss at epoch: 9 = 207.46543080249208
Train Loss at epoch: 10 = 71.62803649902344
Test Loss at epoch: 10 = 229.03352516143116
Train Loss at epoch: 11 = 85.7958755493164
Test Loss at

In [5]:
model = keras.Sequential([
    layers.Input(shape=(6,)),
    layers.Dense(20, activation='relu'),
    layers.Dense(20, activation='relu'),
    layers.Dense(1, activation='linear')
])
opt = keras.optimizers.SGD(learning_rate=0.01)
model.compile(loss='mean_squared_error',
              optimizer=opt)

In [6]:
model.fit(x_train, y_train, epochs=5)

Train on 149100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1e04b42dc88>

In [7]:
x_cell_train, y_cell_train = pp.get_cell_x_y(model, neighbor_n=3)

Building Neighbor Dict...
Done................
Predicting Station Data and building x y train...
Make sure the scaler was used already.
Done.......


In [9]:
pd.DataFrame(y_cell_train).isnull().any()

0    False
dtype: bool

In [13]:
df = pp.submission_format.copy(deep=True)
station_x, station_ids = pp.get_station_x_pre()

ci = 1
for c in df.columns:
    if c == 'cell_id':
        continue
    date = datetime.strptime(c, '%Y-%m-%d')
    station_x[:, 3] = date.year
    station_x[:, 4] = date.month
    station_x[:, 5] = date.day
    x_to_predict = pp.min_max_scaling(station_x)
    station_prediction = model.predict(x_to_predict)

    x = []
    for ii, r in df.iterrows():
        cell_id = r['cell_id']
        this_x = []
        for n in pp.neighbor_map[cell_id]:
            this_x.append(n[1])
            this_x.append(n[2])
            this_x.append(n[3])
            this_x.append(station_prediction[station_ids.index(n[0])][0])

        this_x.append(date.month)
        this_x.append(date.day)
        x.append(np.array(this_x, dtype='float64'))
    x = pp.cell_scaling(np.array(x, dtype='float64'))
    pred = model2.predict(x)
    pred = [i[0] for i in pred]
    pred = np.array(pred, dtype='float32')
    df.drop(c, axis=1, inplace=True)
    df[c] = pred
    ci += 1
    print('%s/%s' % (ci, pp.submission_format.shape[1]), end='\r')



ERROR! Session/line number was not unique in database. History logging moved to new session 143
58/58

In [15]:
df.to_csv('first_submission.csv', index=False)