<a href="https://colab.research.google.com/github/dr-adsalas/TRIOUB/blob/main/MLTRIO_OX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# URL for the dataset
url = "https://raw.githubusercontent.com/dr-adsalas/TRIOUB/main/KOXRTRIO2024.csv"

# Load data directly from GitHub
data = pd.read_csv(url)

# Inspect the first few rows
print(data.head())
print(data.info())

       Date   Time            date_time altimeter air_temp relative_humidity  \
0  12/31/23  16:10  2023-12-31 16:10:00     30.05    60.98             69.89   
1  12/31/23  16:51  2023-12-31 16:51:00     30.05    60.08             69.32   
2  12/31/23  16:58  2023-12-31 16:58:00     30.05    60.08             69.32   
3  12/31/23  17:51  2023-12-31 17:51:00     30.06    57.92             80.58   
4  12/31/23  18:51  2023-12-31 18:51:00     30.07    57.02             83.23   

  wind_direction wind_speed sea_level_pressure Unnamed: 9  
0             90       6.91                NaN     1017.6  
1              0          0             1017.6     1017.6  
2            220       3.45                NaN     1017.6  
3            140       3.45             1017.9    1017.95  
4              0          0             1018.1    1018.29  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11109 entries, 0 to 11108
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
--

In [None]:
# Convert date_time column to datetime
data['date_time'] = pd.to_datetime(data['date_time'])

# Convert numerical columns to floats
numeric_cols = ['altimeter', 'air_temp', 'relative_humidity', 'wind_speed', 'sea_level_pressure']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values in key columns
data = data.dropna(subset=['air_temp', 'relative_humidity', 'wind_speed'])

In [None]:
# Create time-based features
data['Hour'] = data['date_time'].dt.hour
data['DayOfWeek'] = data['date_time'].dt.dayofweek

# Cyclic encoding for periodicity
data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
data['Hour_cos'] = np.cos(2 * np.pi * data['Hour'] / 24)

# Lag features for air temperature
data['Lag_1h'] = data['air_temp'].shift(6)  # Assuming ~10 min intervals, 6 = 1 hour
data['Lag_3h'] = data['air_temp'].shift(18)  # 3 hours
data['Rolling_6h'] = data['air_temp'].rolling(36).mean()  # 6-hour rolling average

# Drop rows with missing lagged/rolling features
data = data.dropna()

In [None]:
# Define features and target
X = data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']]
y = data['air_temp']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train the model
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}°C")

Mean Absolute Error: 1.23°C


In [None]:
# Get the last rows for prediction (e.g., last 36 rows = 6 hours)
latest_data = data.iloc[-36:]

# Prepare features for prediction
X_latest = latest_data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']]

# Predict
future_temps = model.predict(X_latest)
print(f"Predicted temperatures: {future_temps}")

Predicted temperatures: [53.213  53.2526 50.8946 49.9424 49.604  48.9776 49.9424 48.9236 47.1308
 44.9726 46.9148 45.3938 47.2766 53.96   59.603  65.093  60.9296 69.2996
 70.0124 69.9566 64.9508 59.4716 56.8706 55.2092 55.9382 54.1994 53.4938
 50.9378 53.969  58.9874 61.5542 58.0388 61.1744 57.7742 57.7778 57.4304]


In [None]:
# Create a DataFrame to associate predictions with their timestamps
future_dates = pd.date_range(start=data['date_time'].iloc[-1], periods=len(future_temps), freq='10min')
future_df = pd.DataFrame({'Timestamp': future_dates, 'Predicted_Temperature': future_temps})

# Resample to daily frequency and compute max and min temperatures
daily_stats = future_df.resample('D', on='Timestamp').agg(
    Max_Temperature=('Predicted_Temperature', 'max'),
    Min_Temperature=('Predicted_Temperature', 'min')
)

# Display daily maximum and minimum temperatures
print(daily_stats)

            Max_Temperature  Min_Temperature
Timestamp                                   
2024-12-07          70.0124          44.9726


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Correct URL for raw data
data_url = "https://raw.githubusercontent.com/dr-adsalas/TRIOUB/main/KOXRTRIO2024.csv"
data = pd.read_csv(data_url)

In [None]:
# Convert 'air_temp', 'relative_humidity', 'wind_speed', 'sea_level_pressure' to numeric, coercing errors
data['air_temp'] = pd.to_numeric(data['air_temp'], errors='coerce')
data['relative_humidity'] = pd.to_numeric(data['relative_humidity'], errors='coerce')
data['wind_speed'] = pd.to_numeric(data['wind_speed'], errors='coerce')
data['sea_level_pressure'] = pd.to_numeric(data['sea_level_pressure'], errors='coerce')
data['date_time'] = pd.to_datetime(data['date_time'], errors='coerce')

In [None]:
# Create time-based features
data['Hour'] = data['date_time'].dt.hour
data['DayOfWeek'] = data['date_time'].dt.dayofweek

# Cyclic encoding for periodicity
data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
data['Hour_cos'] = np.cos(2 * np.pi * data['Hour'] / 24)

In [None]:
# Lag features for air temperature
data['Lag_1h'] = data['air_temp'].shift(6)  # Assuming ~10 min intervals, 6 = 1 hour
data['Lag_3h'] = data['air_temp'].shift(18)  # 3 hours
data['Rolling_6h'] = data['air_temp'].rolling(36).mean()  # 6-hour rolling average

# Drop rows with missing lagged/rolling features
data = data.dropna()

In [None]:
# Assuming 'model' is your trained model to predict temperatures
# Use the same features for prediction as used during training
future_temps = model.predict(data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']])

# Create DataFrame with prediction timestamps and temperatures
future_dates = pd.date_range(start=data['date_time'].iloc[-1], periods=len(future_temps), freq='10min')
future_df = pd.DataFrame({'Timestamp': future_dates, 'Predicted_Temperature': future_temps})

In [None]:
# Resample to daily frequency and compute max and min temperatures for the next 3 days
daily_stats = future_df.resample('D', on='Timestamp').agg(
    Max_Temperature=('Predicted_Temperature', 'max'),
    Min_Temperature=('Predicted_Temperature', 'min')
)

# Get predictions for 12/8/2024, 12/9/2024, and 12/10/2024
target_days = ['2024-12-08', '2024-12-09', '2024-12-10']
predictions_for_days = daily_stats.loc[target_days]

# Print the predictions
print(predictions_for_days)

            Max_Temperature  Min_Temperature
Timestamp                                   
2024-12-08          71.7980          39.2828
2024-12-09          63.7196          43.8422
2024-12-10          65.5682          48.6464


In [None]:
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}°C")

Mean Absolute Error: 1.23°C
