In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load energy data
energy_data = pd.read_csv('daily_energy_metrics.csv')
energy_data['date'] = pd.to_datetime(energy_data['date'], dayfirst=True, errors='coerce')

# Load temperature data
temp_data = pd.read_csv('temp.csv')
temp_data['date'] = pd.to_datetime(temp_data[['Year', 'Month', 'Day']], errors='coerce')
temp_data = temp_data.rename(columns={'Maximum temperature (Degree C)': 'max_temp'})

# Merge datasets
merged_data = pd.merge(energy_data, temp_data[['date', 'max_temp']], on='date', how='left')
merged_data = merged_data.dropna(subset=['max_temp', 'max_RRP', 'max_GAP', 'max_DEMAND', 'any_condition'])

# Features and target
X = merged_data[['max_RRP', 'max_GAP', 'max_DEMAND', 'max_temp']]
y = merged_data['any_condition'].astype(int)

# Show class distribution
print("Overall class distribution:\n", y.value_counts())

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
clf = RandomForestClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))


Overall class distribution:
 any_condition
0    134
1     10
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        27
           1       1.00      0.50      0.67         2

    accuracy                           0.97        29
   macro avg       0.98      0.75      0.82        29
weighted avg       0.97      0.97      0.96        29

[[27  0]
 [ 1  1]]


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# === Load and preprocess 2023 training data ===
energy_2023 = pd.read_csv('daily_energy_metrics.csv')
energy_2023['date'] = pd.to_datetime(energy_2023['date'], dayfirst=True, errors='coerce')

temp_2023 = pd.read_csv('temp.csv')
temp_2023['date'] = pd.to_datetime(temp_2023[['Year', 'Month', 'Day']], errors='coerce')
temp_2023 = temp_2023.rename(columns={'Maximum temperature (Degree C)': 'max_temp'})

train_df = pd.merge(energy_2023, temp_2023[['date', 'max_temp']], on='date', how='left')
train_df = train_df.dropna(subset=['max_temp', 'max_RRP', 'max_GAP', 'max_DEMAND', 'any_condition'])

X_train = train_df[['max_RRP', 'max_GAP', 'max_DEMAND', 'max_temp']]
y_train = train_df['any_condition'].astype(int)

print("Training label distribution:\n", y_train.value_counts(), "\n")

# === Load and preprocess 2024 test data ===
energy_2024 = pd.read_csv('daily_energy_metrics_2024.csv')
energy_2024['date'] = pd.to_datetime(energy_2024['date'], dayfirst=True, errors='coerce')

temp_2024 = pd.read_csv('temp_2024.csv')
temp_2024['date'] = pd.to_datetime(temp_2024[['Year', 'Month', 'Day']], errors='coerce')
temp_2024 = temp_2024.rename(columns={'Maximum temperature (Degree C)': 'max_temp'})

test_df = pd.merge(energy_2024, temp_2024[['date', 'max_temp']], on='date', how='left')
test_df = test_df.dropna(subset=['max_temp', 'max_RRP', 'max_GAP', 'max_DEMAND'])

X_test = test_df[['max_RRP', 'max_GAP', 'max_DEMAND', 'max_temp']]

has_labels = 'any_condition' in test_df.columns
if has_labels:
    y_test = test_df['any_condition'].astype(int)
    print("Test label distribution:\n", y_test.value_counts(), "\n")

# === Train and predict ===
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# === Output results ===
if has_labels:
    print("=== 2024 Classification Report ===")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
else:
    print("Predictions for 2024 (no labels provided):")
    results_df = test_df[['date']].copy()
    results_df['predicted_any_condition'] = y_pred
    print(results_df.head())


Training label distribution:
 any_condition
0    134
1     10
Name: count, dtype: int64 

Test label distribution:
 any_condition
0    129
1     15
Name: count, dtype: int64 

=== 2024 Classification Report ===
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       129
           1       1.00      0.87      0.93        15

    accuracy                           0.99       144
   macro avg       0.99      0.93      0.96       144
weighted avg       0.99      0.99      0.99       144

Confusion Matrix:
[[129   0]
 [  2  13]]


In [5]:
# === Identify misclassified dates ===
misclassified = y_test != y_pred
misclassified_dates = test_df.loc[misclassified, 'date']
comparison_df = test_df.loc[misclassified, ['date']].copy()
comparison_df['actual'] = y_test[misclassified].values
comparison_df['predicted'] = y_pred[misclassified]

print("\n=== Misclassified Dates ===")
print(comparison_df)



=== Misclassified Dates ===
          date  actual  predicted
36  2024-06-02       1          0
343 2024-09-12       1          0


In [7]:
import pandas as pd

df = pd.read_hdf('deven_wind.h5', key='data/40922/windspeed-aws_instant_30minute/table')
print(df.columns)
print(df.head())


Index(['index', 'values_block_0', 'values_block_1'], dtype='object')
                 index  values_block_0 values_block_1
0  1672531200000000000        4.611111              N
1  1672533000000000000        4.111111              N
2  1672534800000000000        4.111111              N
3  1672536600000000000        3.611111              N
4  1672538400000000000        4.611111              N


In [8]:
import pandas as pd

# Load raw data
df = pd.read_hdf('deven_wind.h5', key='data/40922/windspeed-aws_instant_30minute/table')

# Rename columns for clarity
df.columns = ['timestamp_ns', 'windspeed', 'flag']

# Convert timestamp from nanoseconds to datetime
df['timestamp'] = pd.to_datetime(df['timestamp_ns'])

# Extract date part
df['date'] = df['timestamp'].dt.date

# Group by date and find daily max windspeed
daily_max = df.groupby('date')['windspeed'].max().reset_index()

# Rename columns
daily_max.columns = ['date', 'max_windspeed']

# Save to CSV
daily_max.to_csv('windspeed_SA1.csv', index=False)

print("Saved daily max windspeed to windspeed_SA1.csv")


Saved daily max windspeed to windspeed_SA1.csv
