In [1]:
from google.cloud import bigquery
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
import requests
import json
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
# nvcc --version  ###CUDA version

import pathlib
import shutil
from numpy.random import seed
seed(2)
tf.random.set_seed(2)
from math import sqrt
import os

random_state = 42
# import mlflow
# exp_id = 'weather_dataset'
# mlflow.set_experiment(exp_id)
# mlflow.set_tracking_uri('file:///C:/Users/gabri/VSCode%20Projects/Weather%20Prediction/mlruns')
# mlflow.autolog()
# mlflow.log_param('random_state', random_state)
# export MLFLOW_TRACKING_URI=http://192.168.0.1:5000


In [2]:
# 1) define training, testing and prediction daterange 
# 2) define parameters, feature engineering and transformations, create dataframe 
# 3) convert to numpy and reshape 
# 4) define train, test and val split
# 5) normalize
# 6) define model and log in MLFlow
# 7) model training

In [2]:
# specify project dataset and table
client = bigquery.Client()
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT')
dataset_id = 'weather_api'
table_id = 'union_weather_api_training'

In [3]:
# specify date range
training_start = '2021-08-01 00:00:00'  # Specify the start time with hours, minutes, and seconds
training_end = '2022-08-11 23:01:00'  # Specify the end time with hours, minutes, and seconds
hours_difference = (datetime.datetime.strptime(training_end, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(training_start, '%Y-%m-%d %H:%M:%S')).total_seconds() / 3600
print(f'The expected number of hours between the two dates is: {round(hours_difference)} hours')

date_obj_start = datetime.datetime.strptime(training_start, '%Y-%m-%d %H:%M:%S')  # Include '%H:%M:%S'
date_obj_end = datetime.datetime.strptime(training_end, '%Y-%m-%d %H:%M:%S')  # Include '%H:%M:%S'
unix_start = int(date_obj_start.timestamp())
unix_end = int(date_obj_end.timestamp())

# specify locations
location_1 = 'clerkenwell'
location_2 = 'hadley_wood'
print('Locations: ', location_1, location_2)

The expected number of hours between the two dates is: 9023 hours
Locations:  clerkenwell hadley_wood


In [15]:
query = f"""
SELECT 
    dt,
    city_name,
    temp_c
FROM `{project_id}.{dataset_id}.{table_id}`
where dt > {unix_start} and dt < {unix_end}
order by dt
"""
query_job = client.query(query)
df = query_job.to_dataframe()
print(f'Gets data from {training_start} to {training_end} from {location_1} and {location_2} and merges the two tables')

Gets data from 2021-08-01 00:00:00 to 2022-08-11 23:01:00 from clerkenwell and hadley_wood and merges the two tables


In [16]:
pivot_table = df.pivot(index='dt', columns='city_name', values=['temp_c']).reset_index()
pivot_table.columns.name = None  # Remove the name from the columns
pivot_table.columns = ['dt', 'temp_clerkenwell', 'temp_hadley_wood']
pivot_table.head(2)

Unnamed: 0,dt,temp_clerkenwell,temp_hadley_wood
0,1627776000,16.0,15.0
1,1627779600,16.0,15.0


In [17]:
missing_timestamps = np.diff(pivot_table['dt'])
result_indices = np.where(missing_timestamps != 3600)[0]
result_values = missing_timestamps[result_indices]
print("Indices with non-3600 differences:", result_indices)
print("Values of differences at those indices:", result_values)

Indices with non-3600 differences: []
Values of differences at those indices: []


In [18]:
# create timestamps and data
timestamps_array = pivot_table['dt'].to_numpy()
data_array = pivot_table[['temp_clerkenwell', 'temp_hadley_wood', 'pressure_clerkenwell', 'pressure_hadley_wood']].to_numpy()

KeyError: "['pressure_clerkenwell', 'pressure_hadley_wood'] not in index"

In [19]:
start_date = datetime.datetime.utcfromtimestamp(timestamps_array[0])
end_date = datetime.datetime.utcfromtimestamp(timestamps_array[-1]),

context_hours = 24
prediction_length = 24
dataset_length = len(data_array)
eliminated_timesteps = context_hours - 1

print(f'Dataset starts on {start_date}')
print(f'Dataset ends on {end_date}')
print(f'Original length of dataset: {dataset_length} hours')


print(f'Length of context: {context_hours} hours')
print(f'Number of hours to predict : {prediction_length} hours')
print(f'Given temperature from the last {context_hours} hours, we want to predict temperature over the next {prediction_length} hours')
print(f'In order to create windows, we lose {eliminated_timesteps} hours to create the first window')

new_dataset_length = dataset_length - eliminated_timesteps
window_count = new_dataset_length
print(f'Number of datapoints for training, validation and testing: {new_dataset_length} hours')

Dataset starts on 2021-08-01 00:00:00
Dataset ends on (datetime.datetime(2022, 8, 11, 22, 0),)
Original length of dataset: 9023 hours
Length of context: 24 hours
Number of hours to predict : 24 hours
Given temperature from the last 24 hours, we want to predict temperature over the next 24 hours
In order to create windows, we lose 23 hours to create the first window
Number of datapoints for training, validation and testing: 9000 hours


In [20]:
# 7 Define train, val and test ratios
print(f'Number of indices available to create window, train, validate and test: {dataset_length}')
train = 0.8
val = 0.2
window_start = 0
print(f'Train and validation split: {train} - {val}')
print(f'Eliminated datapoints: 0 - {eliminated_timesteps}')
train_start = eliminated_timesteps
val_start = int((dataset_length-prediction_length)*train)
print(f'Train: {train_start}-{val_start}')
test_start = dataset_length-prediction_length
print(f'Validate: {val_start}-{test_start}')
print(f'Test: {val_start} - {dataset_length}')

Number of indices available to create window, train, validate and test: 9023
Train and validation split: 0.8 - 0.2
Eliminated datapoints: 0 - 23
Train: 23-7199
Validate: 7199-8999
Test: 7199 - 9023


In [None]:
import warnings
warnings.filterwarnings("ignore", message="The behavior of DatetimeProperties.to_pydatetime is deprecated*")

fig = go.Figure()
window_trace = px.scatter(x=datetime_list[window_start:train_start], y=final_df['cl_temp'].iloc[window_start:train_start]).data[0]
window_trace.update(line=dict(color='purple'), marker=dict(color='purple'), name=f'Eliminated indices: {window_start}-{train_start}. Count:{train_start-window_start}', showlegend=True)

# Plot the training data
train_trace = px.scatter(x=datetime_list[train_start:val_start], y=final_df['cl_temp'][train_start:val_start]).data[0]
train_trace.update(line=dict(color='blue'), marker=dict(color='blue'), name=f'Train Data: {train_start}-{val_start}. Count:{val_start-train_start}', showlegend=True)

# Plot the validation data
val_trace = px.scatter(x=datetime_list[val_start:test_start], y=final_df['cl_temp'][val_start:test_start]).data[0]
val_trace.update(line=dict(color='green'), marker=dict(color='green'), name=f'Validation Data: {val_start}-{test_start}. Count:{test_start-val_start}', showlegend=True)

# Plot the test data
test_trace = px.scatter(x=datetime_list[test_start: dataset_length], y=final_df['cl_temp'][test_start: dataset_length]).data[0]
test_trace.update(line=dict(color='red'), marker=dict(color='red'), name=f'Test Data: {test_start}-{dataset_length}. Count:{dataset_length-test_start}', showlegend=True)

# Plot all data
all_trace = px.scatter(x=datetime_list, y=final_df['cl_temp'].to_list()).data[0]
all_trace.update(line=dict(color='yellow'), marker=dict(color='yellow'), name=f'All Data', showlegend=True, mode='lines')

fig.add_trace(window_trace)
fig.add_trace(train_trace)
fig.add_trace(val_trace)
fig.add_trace(test_trace)
fig.add_trace(all_trace)

fig.update_layout(showlegend=True)
fig.show()

In [None]:
# Normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(final_df.iloc[train_start:val_start])
df_scaled = pd.DataFrame(scaler.transform(final_df))
df_scaled.columns = final_df.columns
print(f'We calculate the mean and std of the training data and use this to normalise the entire dataframe')
print(final_df.head(1))
print(df_scaled.head(1))

In [None]:
total_length = len(df_scaled['cl_temp'])
data = df_scaled['cl_temp'].to_numpy()
X = []
y = []

for i in range(total_length - context_hours):
    X.append(data[i:i + context_hours])
    y.append(data[i + context_hours:i + context_hours + 1]) 

X = np.array(X)
y = np.array(y)
X.shape, y.shape

In [None]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

In [None]:
# # import mlflow
# import numpy as np
# from tensorflow import keras
# from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, InputLayer

# feature_count = len(features)
# model = keras.Sequential()

# model.add(InputLayer((context_hours, feature_count)))
# model.add(Bidirectional(LSTM(64, activation='tanh')))
# model.add(Dense(256, 'relu'))
# model.add(Dense(64, 'relu'))
# model.add(Dropout(0.25))
# model.add(Dense(feature_count, 'linear'))

# model.summary()

In [None]:
# from keras.metrics import RootMeanSquaredError
# from keras.optimizers import Adam

# learning_rate = 0.0005
# epochs = 8
# loss = 'mse'
# batch_size = 128
# model.compile(loss=loss, optimizer=Adam(learning_rate = learning_rate), metrics=[RootMeanSquaredError()])
# history = model.fit(X_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=False)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])

In [None]:
# # scale and get mae
# y_pred = model.predict(X_test)
# y_pred = scaler.inverse_transform(y_pred)
y_test_final = scaler.inverse_transform(y_test)
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# print(f'MAE: {round(mean_absolute_error(y_test_final, y_pred), 2)}')
# print(f'RMSE: {round(sqrt(mean_squared_error(y_test_final, y_pred)), 2)}')

In [None]:
# plt.scatter(x=[i for i in range(len(y_pred))], y=y_test_final.reshape(len(y_test_final),) )
# plt.scatter(x=[i for i in range(len(y_pred))], y=y_pred.reshape(len(y_pred),) )

In [None]:
# model.save('cf_cl.h5')
# model.save('test_model.keras')
# # loaded_model = tf.keras.models.load_model('heath.h5')
# loaded_model

In [None]:
import tensorflow as tf
loaded_model = tf.keras.models.load_model('test_model.keras')

In [None]:
loaded_model.save('test_model.h5')


In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model("test_model_lite")
tflite_model = converter.convert()
with open("model.tflite", "wb") as f:
    f.write(tflite_model)


In [None]:
input_temperature = X_test[0]
true_temperature = y_test_final[0]
predicted_temperature_scaled = loaded_model.predict(input_temperature.reshape(1, input_temperature.shape[0]))
predicted_temperature = scaler.inverse_transform(predicted_temperature_scaled)

print('True temperature: ', true_temperature)
print('predicted_temperature: ', predicted_temperature)
print('Input data: ', scaler.inverse_transform(input_temperature.reshape(input_temperature.shape[0], 1)))