1. Loading and processing data 

In [3]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
import numpy as np

In [4]:
# Request definition for the Open-Meteo historical-forecast endpoint.
# The daily variables are read back by position further down, so the order of
# the "daily" list must stay in sync with the code that indexes the response.
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
params = {
    # melbourne coordinates
    "latitude": -37.814,
    "longitude": 144.9633,
    "start_date": "2020-01-01",
    "end_date": "2024-12-25",
    "daily": [
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
        "sunrise",
        "sunset",
        "uv_index_max",
        "precipitation_sum",
        "wind_speed_10m_max",
        "wind_gusts_10m_max",
    ],
    "timezone": "Australia/Sydney",
}

# Open-Meteo API client on top of a cached session (cache name '.cache',
# expire_after=3600) that retries failed requests (retries=5, backoff_factor=0.2).
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Fetch the data; one response object is returned per requested location/model.
responses = openmeteo.weather_api(url, params=params)

In [None]:
# A single location was requested, so only the first response is inspected.
# Loop over `responses` instead when querying several locations or weather models.
response = responses[0]

# Metadata echoed back by the API for the resolved grid cell.
location_summary = (
    f"Coordinates {response.Latitude()}°N {response.Longitude()}°E",
    f"Elevation {response.Elevation()} m asl",
    f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}",
    f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s",
)
for summary_line in location_summary:
    print(summary_line)

In [6]:
# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_weather_code = daily.Variables(0).ValuesAsNumpy()
daily_temperature_2m_max = daily.Variables(1).ValuesAsNumpy()
daily_temperature_2m_min = daily.Variables(2).ValuesAsNumpy()
# Per the Open-Meteo Python client documentation, sunrise and sunset are
# delivered as 64-bit integer timestamps and must be read with
# ValuesInt64AsNumpy(); ValuesAsNumpy() reads the float values, which are not
# populated for these two variables, so the columns would carry no real data.
# NOTE(review): requires an openmeteo-sdk release that exposes
# ValuesInt64AsNumpy() - confirm the installed version.
daily_sunrise = daily.Variables(3).ValuesInt64AsNumpy()
daily_sunset = daily.Variables(4).ValuesInt64AsNumpy()
daily_uv_index_max = daily.Variables(5).ValuesAsNumpy()
daily_precipitation_sum = daily.Variables(6).ValuesAsNumpy()
daily_wind_speed_10m_max = daily.Variables(7).ValuesAsNumpy()
daily_wind_gusts_10m_max = daily.Variables(8).ValuesAsNumpy()

# One timestamp per daily record: start (inclusive) to end (exclusive) in
# steps of the interval reported by the API.
# NOTE(review): the timestamps are labelled UTC although the request used
# timezone "Australia/Sydney"; presumably each entry is the UTC instant at
# which the local day starts - confirm before using `date` as a calendar day.
daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

daily_data["weather_code"] = daily_weather_code
daily_data["temperature_2m_max"] = daily_temperature_2m_max
daily_data["temperature_2m_min"] = daily_temperature_2m_min
daily_data["sunrise"] = daily_sunrise
daily_data["sunset"] = daily_sunset
daily_data["uv_index_max"] = daily_uv_index_max
daily_data["precipitation_sum"] = daily_precipitation_sum
daily_data["wind_speed_10m_max"] = daily_wind_speed_10m_max
daily_data["wind_gusts_10m_max"] = daily_wind_gusts_10m_max
# Derived target: midpoint of the daily maximum and minimum temperature.
daily_data["ave_temp"] = (daily_temperature_2m_max + daily_temperature_2m_min)/2

daily_dataframe = pd.DataFrame(data = daily_data)

2. Cleaning data

In [7]:
# Columns carried into modelling: the candidate predictors (sunrise, sunset,
# UV index, precipitation, wind speed, wind gusts) plus the two targets used
# later on (ave_temp and weather_code).
feature_columns = [
    "sunrise",
    "sunset",
    "uv_index_max",
    "precipitation_sum",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "ave_temp",
    "weather_code",
]
features = daily_dataframe[feature_columns]

# True for every row in which at least one of the selected columns is NaN.
has_missing_value = features.isna().any(axis=1)

# Rows with gaps cannot be used by the models; keep them aside for inspection.
missing_rows = features[has_missing_value]

# Only fully populated rows are used for training and evaluation.
valid_rows = features[~has_missing_value]

3a. Predicting temperature

In [8]:
# Regression task: predictors (X) and target (Y) shared by the temperature
# models. The target is ave_temp, the midpoint of the daily max and min.
temperature_predictors = [
    "uv_index_max",
    "precipitation_sum",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
]
X = valid_rows[temperature_predictors]
Y = valid_rows["ave_temp"]

In [None]:
# model 0: baseline that always predicts the mean of the training targets
# NOTE(review): this baseline is scored with MSE on a single hold-out split,
# while the regression models that follow report cross-validated R2, so the
# numbers are not directly comparable.

# Reproducible 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# The "model" is a single constant learned from the training targets only.
mean_prediction = Y_train.mean()
baseline_predictions = np.full(Y_test.shape, mean_prediction, dtype=np.float64)

mse = mean_squared_error(Y_test, baseline_predictions)
print(f"Baseline Model Mean Squared Error: {mse:.4f}")

In [None]:
# model 1: simple linear regression
model1 = LinearRegression()

# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold R2 scores, and out-of-fold predictions for every row (each row is
# predicted by the model that did not see it during training)
scores = cross_val_score(model1, X, Y, cv=cv, scoring='r2')
predictions = cross_val_predict(model1, X, Y, cv=cv)

# MSE of the out-of-fold predictions: the same metric the mean baseline
# reports, so the model can be compared against it
cv_mse = mean_squared_error(Y, predictions)

# Print cross-validation results
print("Cross-validation scores: ", scores)
print("Mean r2: ", np.mean(scores))
print("Standard deviation: ", np.std(scores))
print(f"Cross-validated Mean Squared Error: {cv_mse:.4f}")

# Actual vs. out-of-fold predicted values; show a short preview rather than
# printing the whole frame
results1 = pd.DataFrame({'Actual Values': Y, 'Predicted Values': predictions})
results1.head()


In [None]:
# model 2: random forest regressor
# A fixed random_state makes the forest reproducible across runs; without it
# the scores and the predictions below come from differently randomised
# forests and change on every execution.
model2 = RandomForestRegressor(random_state=42)

# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold R2 scores, and out-of-fold predictions for every row
scores = cross_val_score(model2, X, Y, cv=cv, scoring='r2')
predictions = cross_val_predict(model2, X, Y, cv=cv)

# MSE of the out-of-fold predictions: the same metric the mean baseline
# reports, so the model can be compared against it
cv_mse = mean_squared_error(Y, predictions)

# Print cross-validation results
print("Cross-validation scores: ", scores)
print("Mean r2: ", np.mean(scores))
print("Standard deviation: ", np.std(scores))
print(f"Cross-validated Mean Squared Error: {cv_mse:.4f}")

# Actual vs. out-of-fold predicted values; show a short preview rather than
# printing the whole frame
results2 = pd.DataFrame({'Actual Values': Y, 'Predicted Values': predictions})
results2.head()

In [None]:
# model 3: KNN
# NOTE(review): KNN is distance-based and the predictors are on different
# scales (UV index, precipitation, wind speed/gusts); consider standardising
# them inside a Pipeline. Left unchanged here so the reported model is the same.
model3 = KNeighborsRegressor()

# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold R2 scores, and out-of-fold predictions for every row
scores = cross_val_score(model3, X, Y, cv=cv, scoring='r2')
predictions = cross_val_predict(model3, X, Y, cv=cv)

# MSE of the out-of-fold predictions: the same metric the mean baseline
# reports, so the model can be compared against it
cv_mse = mean_squared_error(Y, predictions)

# Print cross-validation results
print("Cross-validation scores: ", scores)
print("Mean r2: ", np.mean(scores))
print("Standard deviation: ", np.std(scores))
print(f"Cross-validated Mean Squared Error: {cv_mse:.4f}")

# Actual vs. out-of-fold predicted values; show a short preview rather than
# printing the whole frame
results3 = pd.DataFrame({'Actual Values': Y, 'Predicted Values': predictions})
results3.head()

3b. Predicting weather codes

In [13]:
# Classification task: same predictors as before, target is the weather code.
# NOTE(review): this rebinds X and Y, so re-running the temperature cells after
# this one would silently use weather_code as their target.
weather_code_predictors = [
    "uv_index_max",
    "precipitation_sum",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
]
X = valid_rows[weather_code_predictors]
Y = valid_rows["weather_code"]

In [None]:
# model 0: Baseline Dummy Classifier

# Reproducible 80/20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Always predict the most frequent training label. The strategy is spelled out
# because the DummyClassifier default has differed between scikit-learn versions.
model0 = DummyClassifier(strategy="most_frequent")
model0.fit(X_train, Y_train)
model0_predictions = model0.predict(X_test)
accuracy = accuracy_score(Y_test, model0_predictions)
print(f"Baseline Model Accuracy: {accuracy:.4f}")

# Every prediction is the same label, so report the distinct value(s) instead
# of printing the whole constant array
print("Baseline predicted label(s): ", np.unique(model0_predictions))

# Score the baseline with the same 5-fold protocol the random forest classifier
# uses, so the two accuracies are directly comparable
cv = KFold(n_splits=5, shuffle=True, random_state=42)
baseline_cv_scores = cross_val_score(
    DummyClassifier(strategy="most_frequent"), X, Y, cv=cv, scoring='accuracy'
)
print("Baseline cross-validation scores: ", baseline_cv_scores)
print("Baseline mean accuracy: ", np.mean(baseline_cv_scores))


In [None]:
# model 1: Random Forest Classifier
# NOTE(review): `model1` and `results1` reuse the names of the linear
# regression objects from the temperature section and overwrite them.

# A fixed random_state makes the forest reproducible across runs; without it
# the scores and the predictions below come from differently randomised
# forests and change on every execution.
model1 = RandomForestClassifier(random_state=42)

# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy, and out-of-fold predicted labels for every row
scores = cross_val_score(model1, X, Y, cv=cv, scoring='accuracy')
predictions = cross_val_predict(model1, X, Y, cv=cv)

# Print cross-validation results
print("Cross-validation scores: ", scores)
print("Mean accuracy: ", np.mean(scores))
print("Standard deviation: ", np.std(scores))

# Actual vs. out-of-fold predicted labels; show a short preview rather than
# printing the whole frame
results1 = pd.DataFrame({'Actual Values': Y, 'Predicted Values': predictions})
results1.head()