<a href="https://colab.research.google.com/github/dr-adsalas/TRIOUB/blob/main/MLTRIO_Thousand_Oaks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# URL for the dataset
url = "https://raw.githubusercontent.com/dr-adsalas/TRIOUB/main/TOTRIO2024.csv"

# Load data directly from GitHub
data = pd.read_csv(url)

# Inspect the first few rows
print(data.head())
print(data.info())

       Date   Time            date_time altimeter air_temp relative_humidity  \
0  12/31/23  16:00  2023-12-31 16:00:00     30.03       57                78   
1  12/31/23  16:30  2023-12-31 16:30:00     30.03       56                79   
2  12/31/23  16:45  2023-12-31 16:45:00     30.02       55                80   
3  12/31/23  17:00  2023-12-31 17:00:00     30.02       54                82   
4  12/31/23  17:30  2023-12-31 17:30:00     30.03       53                85   

  wind_speed wind_direction  
0          0            NaN  
1          0            NaN  
2          0            NaN  
3          0            NaN  
4          0            NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26114 entries, 0 to 26113
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Date               26113 non-null  object
 1   Time               26113 non-null  object
 2   date_time          26113 non-null  obj

In [None]:
# Convert date_time column to datetime
data['date_time'] = pd.to_datetime(data['date_time'])

# Convert numerical columns to floats
numeric_cols = ['altimeter', 'air_temp', 'relative_humidity', 'wind_speed',]
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values in key columns
data = data.dropna(subset=['air_temp', 'relative_humidity', 'wind_speed'])

In [None]:
# Create time-based features
data['Hour'] = data['date_time'].dt.hour
data['DayOfWeek'] = data['date_time'].dt.dayofweek

# Cyclic encoding for periodicity
data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
data['Hour_cos'] = np.cos(2 * np.pi * data['Hour'] / 24)

# Lag features for air temperature
data['Lag_1h'] = data['air_temp'].shift(6)  # Assuming ~10 min intervals, 6 = 1 hour
data['Lag_3h'] = data['air_temp'].shift(18)  # 3 hours
data['Rolling_6h'] = data['air_temp'].rolling(36).mean()  # 6-hour rolling average

# Drop rows with missing lagged/rolling features
data = data.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Hour'] = data['date_time'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DayOfWeek'] = data['date_time'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
A value is trying to be set on a copy of a slice from a 

In [None]:
# Define features and target
X = data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']]
y = data['air_temp']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train the model
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}°C")

Mean Absolute Error: 1.22°C


In [None]:
# Get the last rows for prediction (e.g., last 36 rows = 6 hours)
latest_data = data.iloc[-36:]

# Prepare features for prediction
X_latest = latest_data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']]

# Predict
future_temps = model.predict(X_latest)
print(f"Predicted temperatures: {future_temps}")

Predicted temperatures: [65.16 65.67 65.21 65.2  65.82 65.96 66.09 66.13 66.78 67.57 73.24 73.82
 74.63 75.84 76.08 76.86 77.66 77.89 79.01 79.25 79.31 80.   79.05 78.58
 67.68 67.87 67.91 67.91 67.86 67.67 67.08 66.88 66.45 66.14 65.48 66.43]


In [None]:
# Create a DataFrame to associate predictions with their timestamps
future_dates = pd.date_range(start=data['date_time'].iloc[-1], periods=len(future_temps), freq='10min')
future_df = pd.DataFrame({'Timestamp': future_dates, 'Predicted_Temperature': future_temps})

# Resample to daily frequency and compute max and min temperatures
daily_stats = future_df.resample('D', on='Timestamp').agg(
    Max_Temperature=('Predicted_Temperature', 'max'),
    Min_Temperature=('Predicted_Temperature', 'min')
)

# Display daily maximum and minimum temperatures
print(daily_stats)

            Max_Temperature  Min_Temperature
Timestamp                                   
2024-12-07             80.0            65.16


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Correct URL for raw data
data_url = "https://raw.githubusercontent.com/dr-adsalas/TRIOUB/main/TOTRIO2024.csv"
data = pd.read_csv(data_url)

In [None]:
# Convert 'air_temp', 'relative_humidity', 'wind_speed', 'sea_level_pressure' to numeric, coercing errors
data['air_temp'] = pd.to_numeric(data['air_temp'], errors='coerce')
data['relative_humidity'] = pd.to_numeric(data['relative_humidity'], errors='coerce')
data['wind_speed'] = pd.to_numeric(data['wind_speed'], errors='coerce')
data['date_time'] = pd.to_datetime(data['date_time'], errors='coerce')

In [None]:
# Create time-based features
data['Hour'] = data['date_time'].dt.hour
data['DayOfWeek'] = data['date_time'].dt.dayofweek

# Cyclic encoding for periodicity
data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
data['Hour_cos'] = np.cos(2 * np.pi * data['Hour'] / 24)

In [None]:
# Lag features for air temperature
data['Lag_1h'] = data['air_temp'].shift(6)  # Assuming ~10 min intervals, 6 = 1 hour
data['Lag_3h'] = data['air_temp'].shift(18)  # 3 hours
data['Rolling_6h'] = data['air_temp'].rolling(36).mean()  # 6-hour rolling average

# Drop rows with missing lagged/rolling features
data = data.dropna()

In [None]:
# Assuming 'model' is your trained model to predict temperatures
# Use the same features for prediction as used during training
future_temps = model.predict(data[['Lag_1h', 'Lag_3h', 'Rolling_6h', 'Hour_sin', 'Hour_cos', 'relative_humidity', 'wind_speed']])

# Create DataFrame with prediction timestamps and temperatures
future_dates = pd.date_range(start=data['date_time'].iloc[-1], periods=len(future_temps), freq='10min')
future_df = pd.DataFrame({'Timestamp': future_dates, 'Predicted_Temperature': future_temps})

In [None]:
# Resample to daily frequency and compute max and min temperatures for the next 3 days
daily_stats = future_df.resample('D', on='Timestamp').agg(
    Max_Temperature=('Predicted_Temperature', 'max'),
    Min_Temperature=('Predicted_Temperature', 'min')
)

# Get predictions for 12/8/2024, 12/9/2024, and 12/10/2024
target_days = ['2024-12-08', '2024-12-09', '2024-12-10']
predictions_for_days = daily_stats.loc[target_days]

# Print the predictions
print(predictions_for_days)

            Max_Temperature  Min_Temperature
Timestamp                                   
2024-12-08            73.21            51.59
2024-12-09            79.27            46.29
2024-12-10            75.75            50.42


In [None]:
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}°C")

Mean Absolute Error: 1.22°C
