In [1]:
from datetime import datetime
from math import cos, sin
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import tensorflow
from tensorflow import keras
from tensorflow.keras import layers

## Initializing the Preprocessor

In [2]:
from lib.preprocessor import PreProcessor

# Folder holding the input files, named once so it is easy to repoint.
DATA_DIR = 'data/'

# Constructing the preprocessor performs the initial loading steps that are
# logged directly below this cell.
pp = PreProcessor(data_path=DATA_DIR)

Loading Data Files...
Calculating and Adding Cell Polygon Center...
Build Station Meta Dict...
Done with initial Loading.


In [3]:
# Fill the missing values in the station test data with KNN imputation; the log
# printed below reports the NaN count in station_test before and after.
pp.station_knn_impute()

Imputing Station Test Data with KNN...
NaNs in station_test: 5085
New NaN Count in station_test: 0


In [4]:
# Station-level feature/target arrays for training and for the held-out part.
# scale=True requests scaled output from the preprocessor (presumably the same
# scaling it applies at prediction time -- confirm in lib.preprocessor).
station_split = pp.get_station_x_y(scale=True)
x_train, y_train, x_test, y_test = station_split

In [5]:
# Station-level regressor: 6 input features -> two hidden ReLU layers with
# 20 units each -> a single linear output, optimised with plain SGD on
# mean squared error.
station_layers = [
    layers.Input(shape=(6,)),
    layers.Dense(units=20, activation='relu'),
    layers.Dense(units=20, activation='relu'),
    layers.Dense(units=1, activation='linear'),
]
model = keras.Sequential(station_layers)

opt = keras.optimizers.SGD(learning_rate=0.01)
model.compile(optimizer=opt, loss='mean_squared_error')

In [6]:
# Fit the station model for 5 epochs. The held-out split returned by
# get_station_x_y is passed as validation_data so each epoch also reports the
# loss on data the model is not trained on; it does not take part in the
# weight updates.
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

Train on 149100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1e04b42dc88>

In [7]:
# Number of neighbouring stations requested for every grid cell.
NEIGHBOR_N = 3

# The trained station model is handed to the preprocessor; according to the log
# below it is used to predict station data while the cell-level x/y training
# arrays are built.
cell_xy = pp.get_cell_x_y(model, neighbor_n=NEIGHBOR_N)
x_cell_train, y_cell_train = cell_xy

Building Neighbor Dict...
Done................
Predicting Station Data and building x y train...
Make sure the scaler was used already.
Done.......


In [8]:
# Cell-level regressor. A sample has 14 features, matching the rows assembled in
# the submission cell: for each of the 3 neighbouring stations three
# neighbor_map values plus the station model's prediction (3 * 4 = 12),
# followed by month and day. get_cell_x_y is assumed to emit the same layout
# for training -- confirm in lib.preprocessor.
#
# The input is declared explicitly, as in the station model. The previous first
# layer, Dense(14) without an activation, was a purely linear map placed
# directly in front of another Dense layer, so it added weights without adding
# expressive power and left the expected feature count unchecked.
model2 = keras.Sequential([
    layers.Input(shape=(14,)),
    layers.Dense(20, activation='relu'),
    layers.Dropout(0.5),  # active during training only
    layers.Dense(20, activation='relu'),
    layers.Dense(1)       # linear output for regression
])
opt = keras.optimizers.Adam(learning_rate=0.01)
model2.compile(loss='mean_squared_error',
               optimizer=opt)

In [9]:
# Sanity check before training: a True entry would mean the cell targets
# contain at least one missing value.
cell_targets = pd.DataFrame(y_cell_train)
cell_targets.isna().any()

0    False
dtype: bool

In [10]:
# Train the cell-level model for 20 epochs on the cell training arrays.
model2.fit(x=x_cell_train, y=y_cell_train, epochs=20)

Train on 91490 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1e207f52390>

{'0003f387-71c4-48f6-b2b0-d853bd4f0aba': [['CDEC:BCB',
   0.0651475970459856,
   0.05405682718021865,
   0.011090769865766958],
  ['CDEC:SWM', 0.24054940295404492, 0.15636117281979978, 0.08418823013424515],
  ['CDEC:WWC', 0.24682259704596987, 0.1990458271801998, 0.04777676986577006]],
 '000617d8-8c14-43e2-b708-7e3a69fe3cc3': [['SNOTEL:839_CO_SNTL',
   0.2453942511977658,
   0.1833600040957606,
   0.062034247102005224],
  ['SNOTEL:762_CO_SNTL',
   0.3333084840445366,
   0.12648286786529184,
   0.20682561617924478],
  ['SNOTEL:327_CO_SNTL',
   0.5052209113540158,
   0.43533601727935434,
   0.06988489407466147]],
 '000863e7-21e6-477d-b799-f5675c348627': [['CDEC:TMR',
   0.3373010512464276,
   0.20114178875058997,
   0.13615926249583765],
  ['CDEC:HNT', 0.38117005124643555, 0.18119078875059813, 0.19997926249583742],
  ['CDEC:CHM', 0.47131447374522395, 0.09051521124938233, 0.3807992624958416]],
 '000ba8d9-d6d5-48da-84a2-1fa54951fae1': [['CDEC:GRV',
   0.05609847557190761,
   0.0279324131799

In [13]:
# Build the submission table: for every date column, predict all stations with
# `model`, assemble the per-cell neighbour features from those predictions and
# predict the cell value with `model2`.
df = pp.submission_format.copy(deep=True)
station_x, station_ids = pp.get_station_x_pre()

# Position of each station id inside `station_ids`. setdefault keeps the first
# occurrence, which is what list.index() returned, but avoids a linear scan for
# every single lookup. An unknown station id now raises KeyError (was ValueError).
station_pos = {}
for pos, station_id in enumerate(station_ids):
    station_pos.setdefault(station_id, pos)

# The neighbour layout of a cell does not depend on the date, so it is gathered
# once instead of walking the frame with iterrows() for every date column.
# Each neighbor_map entry is read as [station_id, v1, v2, v3]; every cell must
# list the same number of neighbours (the original row stacking needed that too).
cell_neighbors = [pp.neighbor_map[cell_id] for cell_id in df['cell_id']]
neighbor_static = np.array(
    [[(nb[1], nb[2], nb[3]) for nb in neighbors] for neighbors in cell_neighbors],
    dtype='float64')                      # (n_cells, n_neighbors, 3)
neighbor_station_pos = np.array(
    [[station_pos[nb[0]] for nb in neighbors] for neighbors in cell_neighbors],
    dtype=np.intp)                        # (n_cells, n_neighbors)
n_cells = neighbor_static.shape[0]

date_columns = [c for c in df.columns if c != 'cell_id']
total_columns = pp.submission_format.shape[1]

# The counter starts at 2 because the 'cell_id' column counts as already done,
# so the progress output ends at total/total exactly as before.
for ci, c in enumerate(date_columns, start=2):
    date = datetime.strptime(c, '%Y-%m-%d')

    # Work on a copy: the template array stays untouched and the result no
    # longer depends on whether min_max_scaling modifies its argument.
    station_features = station_x.copy()
    station_features[:, 3] = date.year
    station_features[:, 4] = date.month
    station_features[:, 5] = date.day
    x_to_predict = pp.min_max_scaling(station_features)
    station_prediction = np.asarray(model.predict(x_to_predict))[:, 0]

    # Per neighbour: the three static values followed by that station's
    # prediction; month and day are appended once per cell. This is the same
    # column order the row-by-row construction produced.
    neighbor_pred = station_prediction[neighbor_station_pos]
    per_neighbor = np.concatenate(
        [neighbor_static, neighbor_pred[:, :, np.newaxis]], axis=2)
    date_part = np.tile([date.month, date.day], (n_cells, 1))
    x = np.hstack([per_neighbor.reshape(n_cells, -1), date_part]).astype('float64')

    x = pp.cell_scaling(x)
    pred = np.asarray(model2.predict(x))[:, 0].astype('float32')

    # Plain assignment replaces the column where it stands; dropping and
    # re-appending it is not needed.
    df[c] = pred
    print('%s/%s' % (ci, total_columns), end='\r')



ERROR! Session/line number was not unique in database. History logging moved to new session 143
58/58

In [15]:
# Write the finished submission table; index=False keeps the row index out of
# the file.
SUBMISSION_PATH = 'first_submission.csv'
df.to_csv(path_or_buf=SUBMISSION_PATH, index=False)