## Imports and Setup - Data prep

In [None]:
import tensorflow as tf
from tensorflow import keras
import IPython, IPython.display, os, datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide plotting defaults: wide, short figures with a grid.
mpl.rcParams.update({
    'figure.figsize': (14, 4),
    'axes.grid': True,
})

In [None]:
# Load the raw dataset and parse its timestamp column.
# Preparation could also be done at training time, once the statistics of
# the data are known; this notebook does it up front and saves the result.

# pandas can read the zipped CSV directly, so nothing is extracted by hand.
file = '../data/jena_climate_2009_2016.csv.zip'
df = pd.read_csv(file)

# Timestamps are stored as day.month.year hour:minute:second strings.
timestamp_format = '%d.%m.%Y %H:%M:%S'
df['Date Time'] = pd.to_datetime(df['Date Time'], format=timestamp_format)

print("Data Collected every 10 minutes")
display(df)

In [None]:
# Plot every column except the first against time, one subplot per column.
# Column 0 is skipped because it is expected to be 'Date Time' (the x axis).
colors = "b g r c m y k ".split()
value_columns = df.columns[1:]

# Derive the grid height from the number of columns instead of hard-coding
# five rows, so the cell still works if more columns have been added.
grid_cols = 3
grid_rows = max(1, -(-len(value_columns) // grid_cols))  # ceiling division

plt.figure(figsize=(20, 20))
for i, col in enumerate(value_columns, start=1):
    plt.subplot(grid_rows, grid_cols, i)
    # Cycle through the colour list so neighbouring panels differ.
    plt.plot(df['Date Time'], df[col], color=colors[i % len(colors)])
    plt.title(col)

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T

In [None]:
# Wind velocity contains a -9999 placeholder value; a negative speed is not
# meaningful, so every negative entry in both wind columns is replaced by 0.
for wind_col in ('wv (m/s)', 'max. wv (m/s)'):
    negative_rows = df[wind_col] < 0
    df.loc[negative_rows, wind_col] = 0

# Re-display the statistics to confirm the minimum is now 0.
df.describe().T

In [None]:
# Re-plot every column except the first against time, one subplot per column.
# Column 0 is skipped because it is expected to be 'Date Time' (the x axis).
colors = "b g r c m y k ".split()
value_columns = df.columns[1:]

# Derive the grid height from the number of columns instead of hard-coding
# five rows; otherwise re-running this cell after extra columns have been
# added to df would ask for a subplot index beyond the 5x3 grid and fail.
grid_cols = 3
grid_rows = max(1, -(-len(value_columns) // grid_cols))  # ceiling division

plt.figure(figsize=(20, 20))
for i, col in enumerate(value_columns, start=1):
    plt.subplot(grid_rows, grid_cols, i)
    # Cycle through the colour list so neighbouring panels differ.
    plt.plot(df['Date Time'], df[col], color=colors[i % len(colors)])
    plt.title(col)

In [None]:
# Keep hourly data only: the readings are 10 minutes apart, so starting at
# row 5 and taking every 6th row leaves one reading per hour.
df = df.iloc[5::6]
df

### Add features


In [None]:
# A raw timestamp is not a very useful feature by itself. What matters is
# where it falls inside a cycle: weekday vs. weekend says more about traffic,
# and day vs. night says more about temperature.
# Here the time is encoded as sine/cosine pairs with a daily and a yearly
# period, so the end of each cycle joins smoothly onto the start of the next.

# Timestamp of every row, in seconds.
timestamp_s = df['Date Time'].map(pd.Timestamp.timestamp)

day = 24*60*60                # seconds per day
year = (365.2425)*day         # seconds per (average) year

for sin_name, cos_name, period in (
    ('Day sin', 'Day cos', day),
    ('Year sin', 'Year cos', year),
):
    df[sin_name] = np.sin(timestamp_s * (2 * np.pi / period))
    df[cos_name] = np.cos(timestamp_s * (2 * np.pi / period))

In [None]:
# Visual check of the periodic time features.

# Daily signal over the first 25 rows.
day_window = slice(None, 25)
plt.figure(figsize=(16,5))
plt.plot(df['Date Time'][day_window], df['Day sin'][day_window], c="r", label="Day Sin")
plt.plot(df['Date Time'][day_window], df['Day cos'][day_window], c="b", label="Day Cos")
plt.title('Time of day signal')
plt.xlabel('Time [h]')
plt.legend()
plt.show()

# Yearly signal over the first 365*24 rows.
year_window = slice(None, 365*24)
plt.figure(figsize=(16,5))
plt.title('Time of Year signal')
plt.xlabel('Time [h]')
plt.plot(df['Date Time'][year_window], df['Year sin'][year_window], c="k", label="Year Sin")
plt.plot(df['Date Time'][year_window], df['Year cos'][year_window], c="g", label="Year Cos")
plt.legend();

### Save data to file

Save the prepared data to disk. You may also consider applying other transformations, such as one-hot encoding, before saving.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle

# Standardise the features and write everything to disk.
# The split is positional: the first 80% of rows form the training set and
# the remaining 20% the test set. Column 0 (expected to be 'Date Time') is
# left out of both because it is not a numeric feature.
scaler = StandardScaler()
split  = int(len(df) * .8)
df_trn  = df[df.columns[1:]][:split]
df_tst  = df[df.columns[1:]][split:]

# Fit on the training rows only, so that no test-set statistics leak into
# the scaling; the same fitted scaler is then applied to both sets.
scaler.fit(df_trn)

df_scaled_trn = pd.DataFrame(scaler.transform(df_trn), columns=df_trn.columns)
df_scaled_tst = pd.DataFrame(scaler.transform(df_tst), columns=df_trn.columns)

# Save the scaler to a pickle file. The `with` block guarantees the file
# handle is flushed and closed even if pickling fails.
with open(f'{file}.scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

##=> You can load the scaler in future
# with open(f'{file}.scaler.pkl', 'rb') as scaler_file:
#     scaler = pickle.load(scaler_file)
# scaler.transform(df_trn)

# You can inverse transform predicted values to get the original values
# pd.DataFrame(scaler.inverse_transform(scaler.transform(df_trn)))

# index=False keeps the DataFrame index out of the CSV files
df.to_csv(file+".csv", index=False)
df_scaled_trn.to_csv(file+".trn.csv", index=False)
df_scaled_tst.to_csv(file+".tst.csv", index=False)

df_scaled_trn[0:10]