In [1]:
from google.cloud import bigquery
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import datetime 
import requests
import json
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
# nvcc --version  ###CUDA version

import pathlib
import shutil
from numpy.random import seed
seed(2)
tf.random.set_seed(2)


random_state = 42
import mlflow
exp_id = 'weather_dataset'
mlflow.set_experiment(exp_id)
mlflow.set_tracking_uri('file:///C:/Users/gabri/VSCode%20Projects/Weather%20Prediction/mlruns')
mlflow.autolog()
mlflow.log_param('random_state', random_state)
# export MLFLOW_TRACKING_URI=http://192.168.0.1:5000



* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:48:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2023/09/30 11:48:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/09/30 11:48:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


42

In [2]:
# 1) define training, testing and prediction daterange 
# 2) define parameters, feature engineering and transformations, create dataframe 
# 3) convert to numpy and reshape 
# 4) define train, test and val split
# 5) normalize
# 6) define model and log in MLFlow
# 7) model training

In [3]:
# project details and gcp connection
project_id = "galvanic-smoke-394310"
key_path = r'C:\Users\gabri\GCP\galvanic-smoke-394310-c595ae82c676_bigquery.json'
client = bigquery.Client.from_service_account_json(key_path)

In [4]:
# get data query
training_start = '2021-08-01'
training_end = '2021-08-11'
date_obj_start = datetime.datetime.strptime(training_start, '%Y-%m-%d')
date_obj_end = datetime.datetime.strptime(training_end, '%Y-%m-%d')
unix_start = int(date_obj_start.timestamp())
unix_end = int(date_obj_end.timestamp())

query = f"""
select dt, temp - 273.15 as temp, pressure, humidity
from galvanic-smoke-394310.weather_training.weather_city_of_london_training
where dt > {unix_start} and dt < {unix_end}
order by dt
"""
query_job = client.query(query)
df_filtered_cl = query_job.to_dataframe()
df_filtered_cl.head(2)

Unnamed: 0,dt,temp,pressure,humidity
0,1627776000,15.49,1010,92
1,1627779600,15.43,1010,92


In [5]:
# get data query
query = f"""
select dt, temp - 273.15 as temp, pressure, humidity
from galvanic-smoke-394310.weather_training.weather_cockfosters_historical
where dt > {unix_start} and dt < {unix_end}
order by dt
"""
query_job = client.query(query)
df_filtered_cf = query_job.to_dataframe()
df_filtered_cf.head(2)

Unnamed: 0,dt,temp,pressure,humidity
0,1627776000,15.22,1018,93
1,1627779600,15.0,1018,93


In [6]:
df_filtered_cl.rename(columns={'dt': 'dt_cl', 'temp': 'cl_temp', 'pressure': 'cl_pressure', 'humidity': 'cl_humidity'}, inplace=True)
df_filtered_cf.rename(columns={'dt': 'dt_cf', 'temp': 'cf_temp', 'pressure': 'cf_pressure', 'humidity': 'cf_humidity'}, inplace=True)
merged_df = pd.merge(df_filtered_cl, df_filtered_cf, how='left', left_on='dt_cl', right_on='dt_cf')
final_df = merged_df[['dt_cl', 'cl_temp', 'cl_pressure', 'cl_humidity', 'cf_temp', 'cf_pressure', 'cf_humidity']]
final_df.rename(columns={'dt_cl':'dt'}, inplace=True)
# Feature selection
features_list = ['cl_temp']
final_df = final_df[['dt']+features_list]
final_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'dt_cl':'dt'}, inplace=True)


Unnamed: 0,dt,cl_temp
0,1627776000,15.49
1,1627779600,15.43


In [7]:
index_list=[]
timestamp_list=[]
for idx, timestamp in enumerate(final_df['dt'].to_numpy()):
    index_list.append(idx)
    timestamp_list.append(timestamp)

datetime_list = [datetime.datetime.utcfromtimestamp(event) for event in timestamp_list]
final_df = final_df.drop(columns='dt', axis=0)

In [8]:
# define data windows
input_days = 1
window_size_hours = input_days * 24
prediction_length_hours = 24
print('input_days:', input_days)
print('window_size_hours:', window_size_hours)
print('prediction_length_hours:', prediction_length_hours)

input_days: 1
window_size_hours: 24
prediction_length_hours: 24


In [9]:
# 7 Define train, val and test ratios
n = final_df.shape[0]
train = int(n*0.7)
val = int(n*0.85)
test = len(final_df)
train_timestamp = datetime_list[:train]
val_timestamp = datetime_list[train:val]
test_timestamp = datetime_list[val:test]

n, train, val, test, val-train, test-val

(240, 168, 204, 240, 36, 36)

In [15]:
import warnings

warnings.filterwarnings("ignore", message="The behavior of DatetimeProperties.to_pydatetime is deprecated*")
fig = go.Figure()

all_trace = px.scatter(x=datetime_list, y=final_df['cl_temp'].to_list()).data[0]
all_trace.update(line=dict(color='yellow'), marker=dict(color='yellow'), name=f'All Data: {n}', showlegend=True, mode='lines')
train_trace = px.scatter(x=train_timestamp, y=final_df['cl_temp'].to_list()[:train]).data[0]
train_trace.update(line=dict(color='blue'), marker=dict(color='blue'), name=f'Train Data: {train}', showlegend=True)
val_trace = px.scatter(x=val_timestamp, y=final_df['cl_temp'].to_list()[train:val]).data[0]
val_trace.update(line=dict(color='green'), marker=dict(color='green'), name=f'Validation Data: {val-train}', showlegend=True)
test_trace = px.scatter(x=test_timestamp, y=final_df['cl_temp'].to_list()[val:test]).data[0]
test_trace.update(line=dict(color='red'), marker=dict(color='red'), name=f'Test Data: {test-val}', showlegend=True)
fig.add_trace(train_trace)
fig.add_trace(val_trace)
fig.add_trace(test_trace)
fig.add_trace(all_trace)
fig.update_layout(showlegend=True)
fig.show()

In [16]:
# 8 Normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(final_df.iloc[:train, :])
df_scaled = pd.DataFrame(scaler.transform(final_df))
df_scaled.columns = final_df.columns
# df_scaled.index = final_df.index

In [17]:
def df_to_X_y(df, window_size_hours):
  df_as_np = df.to_numpy()
  X = []
  y = []
  for i in range(len(df_as_np)-window_size_hours):
    row = [r for r in df_as_np[i:i+window_size_hours]]
    X.append(row)
    
    label = [r for r in df_as_np[i+window_size_hours]]
    y.append(label)
  return np.array(X), np.array(y)

In [None]:
# 11 Split train, val and test 
X, y = df_to_X_y(final_df, window_size_hours)

X_train, y_train = X[:train], y[:train]
X_val, y_val = X[train:val], y[train:val]
X_test, y_test = X[val:test], y[val:test]
print('Total timesteps, features:', final_df.shape), 
print('Training (ts, window, feat) (ts, feat):', X_train.shape, y_train.shape), 
print('Validation (ts, window, feat) (ts, feat):', X_val.shape, y_val.shape, ), 
print('Test (ts, window, feat) (ts, feat):', X_test.shape, y_test.shape, )

In [None]:
# fig = go.Figure()
# fig.add_traces(px.line(x=datetime_list[0:window_size_hours], y=X[0, :, 0]).data)
# fig.update_layout(width=800, height=400)
# fig.show()

In [None]:
# # 12 Inverse y_test
# y_test = scaler.inverse_transform(y_test)

In [None]:
# # 13 Define architecture

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import *
# from tensorflow.keras.callbacks import ModelCheckpoint
# from tensorflow.keras.losses import MeanSquaredError
# from tensorflow.keras.metrics import RootMeanSquaredError
# from tensorflow.keras.optimizers import Adam
import mlflow
import numpy as np
import pathlib
import shutil
from tensorflow import keras
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, InputLayer

feature_count = y_train.shape[1]
model = keras.Sequential()

model.add(InputLayer((window_size_hours, feature_count)))
model.add(Bidirectional(LSTM(4, activation='tanh')))
model.add(Dropout(0.25))
model.add(Dense(feature_count, 'linear'))

model.summary()

In [None]:
from keras.metrics import RootMeanSquaredError
from keras.optimizers import Adam

learning_rate = 0.1
epochs = 2
loss = 'mse'
batch_size = 32
model.compile(loss=loss, optimizer=Adam(learning_rate = learning_rate), metrics=[RootMeanSquaredError()])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

In [None]:
# model.save('cf_cl.h5')
# model.save('cf_cl.keras')
# # loaded_model = tf.keras.models.load_model('heath.h5')
# loaded_model = tf.keras.models.load_model('cf_cl.keras')
# loaded_model

In [None]:
# # 17 Predict y_predict from entire X_test dataset 
# y_predict = scaler.inverse_transform(model.predict(x=X_test))
# y_val_inverse = scaler.inverse_transform(y_val)

# print('Number of predicted values from X_test:', y_predict.shape[0])
y_predict = scaler.inverse_transform(model.predict(x=X_test))
y_true = final_df.iloc[val:test]['cl_temp']
y_predict = y_predict[:, 0]

In [None]:
X_test.shape, y_test.shape

In [None]:
fig = go.Figure()
train_trace = px.scatter(x=test_timestamp, y=y_predict).data[0]
# train_trace.update(line=dict(color='blue'), marker=dict(color='blue'), name='Train Data', showlegend=True)
# val_trace = px.scatter(x=val_timestamp, y=final_df['cl_temp'].to_list()[train:val]).data[0]
# val_trace.update(line=dict(color='green'), marker=dict(color='green'), name='Validation Data', showlegend=True)
# fig.add_trace(train_trace)
# fig.add_trace(val_trace)
# fig.update_layout(showlegend=True)
fig.show()

In [None]:
# 19 Plot single point predictions for prediction window
plt.rcParams["figure.figsize"] = (6,2)

feat_pos = 0
start = 0 
window = 24
offset = 0

plt.plot(y_predict[offset:window+offset, feat_pos], label='Hourly (24 steps)')
plt.plot(y_test[:window, feat_pos], label='True')
#plt.plot(y_pred[:,0], label='24 hourly (single step)')
plt.legend()

# RMSE = round(sqrt(mean_squared_error(y_predict[:window,feat_pos], y_test[:window,0])), 2)
# MAE = round(mean_absolute_error(y_predict[:window,feat_pos], y_test[:window,0]), 2)
# l_inf = round(max(np.abs(y_predict[:window,feat_pos] - y_test[:window,0])), 2)

RMSE = round(sqrt(mean_squared_error(y_predict[offset:window + offset,feat_pos], y_test[:window,0])), 2)
MAE = round(mean_absolute_error(y_predict[offset:window+offset,feat_pos], y_test[:window,0]), 2)
l_inf = round(max(np.abs(y_predict[offset:window+offset,feat_pos] - y_test[:window,0])), 2)


#plt.title('RMSE:' + str(RMSE) + ' MAE:' + str(MAE) + ' L_inf:' + str(l_inf))
plt.title('Air Temperature Kew Gardens',fontsize=20)
plt.xlabel('Timesteps (hours)', fontsize=14)
plt.ylabel('Temperature (C)', fontsize=14)


In [None]:
from numpy.core.fromnumeric import size
# 19 Rolling prediction

plt.rcParams["figure.figsize"] = (12,4)

y_pred_scaled = [] 
out_steps = 24
start = 0
current_batch = X_test[start:start+1,:,:] #takes first sample, all windows and all features

for i in range(out_steps):
  
  current_pred = model.predict(current_batch).flatten()
  y_pred_scaled.append(current_pred)
  current_batch = np.append(current_batch[:,1:,:],[[current_pred]],axis=1)
 
y_pred_scaled = np.array(y_pred_scaled)
y_pred = scaler.inverse_transform(y_pred_scaled)

plt.plot(y_test[:out_steps, 0], label='Measured Temperature', c='orange', alpha=1)
plt.plot(y_pred[:,0], label='24-Hour Multistep Temperature Prediction', c='green', alpha=1)
plt.plot(y_predict[offset:window+offset, feat_pos], label='1-Hour Singleshot Temperature Prediction', c='black', alpha=1)


RMSE_24 = round(sqrt(mean_squared_error(y_pred[offset:window + offset,feat_pos], y_test[:out_steps,0])), 2)
MAE_24 = round(mean_absolute_error(y_pred[offset:window+offset,feat_pos], y_test[:window,0]), 2)
l_inf_24 = round(max(np.abs(y_pred[offset:window+offset,feat_pos] - y_test[:window,0])), 2)


RMSE_1 = round(sqrt(mean_squared_error(y_predict[offset:window+offset, feat_pos], y_test[:window,0])), 2)
MAE_1 = round(mean_absolute_error(y_predict[offset:window+offset, feat_pos], y_test[:window,0]), 2)
l_inf_1 = round(max(np.abs(y_predict[offset:window+offset, feat_pos] - y_test[:window,0])), 2)

plt.ylabel('Temperature (C)', size=16)
plt.xlabel('Time Step (Hour)', size=16)


# plt.title("Singleshot and Multistep Prediction Comparison", size = 22)
plt.legend()

print('total size is',n ,': validation begins at', train, 'and test begins at', val)
'RMSE:' + str(RMSE_24) + ' MAE:' + str(MAE_24) + ' L_inf:' + str(l_inf_24) + 'RMSE:' + str(RMSE_1) + ' MAE:' + str(MAE_1) + ' L_inf:' + str(l_inf_1)

In [None]:
# 20) Input and output single shot prediction
plt.rcParams["figure.figsize"] = (18, 4)
feat_pos = 0

# X_train, y_train = X[:train], y[:train]
# X_val, y_val = X[train:val], y[train:val]
# X_test, y_test = X[val:], y[val:]

# true data and indicies for plotting input window
#window_in = df.iloc[val-window_size:val, 0]
window_in = y_val_inverse[-window_size_hours:, 0]
indices_in = final_df.index[val - window_size_hours:val] 

# indicies for plotting input window
indices_out = final_df.index[val: val + out_steps] 


fig, ax = plt.subplots()

# input
ax.scatter(x=indices_in, y=window_in, label='Context', alpha=0.5, edgecolors='none', s=80) 

# true output
ax.scatter(x=indices_out, y=y_test[:out_steps, feat_pos], label='True',c='orange', alpha=0.5, edgecolors='none', s=80) 

#predicted output
ax.scatter(x=indices_out, y=y_pred[:,feat_pos], label='24-Hour Prediction', c='green', alpha=0.8, edgecolors='none', s=80, marker='x') 

#predicted output
#ax.scatter(x=indices_out, y=naive, label='Baseline Prediction', c='black', alpha=0.4, edgecolors='none', s=80, marker='h') 

RMSE = round(sqrt(mean_squared_error(y_pred[:,feat_pos], y_test[:out_steps, feat_pos])), 2)
MAE = round(mean_absolute_error(y_pred[:, feat_pos], y_test[:out_steps, feat_pos]), 2)
l_inf = round(max(np.abs(y_pred[:,feat_pos] - y_test[:out_steps, feat_pos])), 2)


#plt.title('RMSE:' + str(RMSE) + ' , MAE:' + str(MAE) + ' , L_inf:' + str(l_inf))
plt.title('Air Temperature Kew Gardens', fontsize=26)
plt.xlabel('Date', fontsize=20)
plt.ylabel('Temperature (C)', fontsize=20)
plt.xticks(rotation=45, )
import matplotlib.dates as mdates
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H %d/%m/%Y'))


ax.legend(fontsize=12)
plt.show()