In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load your data (now using read_csv)
data = pd.read_csv('daily_energy_metrics.csv')

# Select features and target
X = data[['max_RRP', 'max_GAP', 'max_DEMAND']]  # features
y = data['any_condition'].astype(int)  # target (convert from bool to int 0/1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on test data
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00         3

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74

[[71  0]
 [ 0  3]]


In [41]:
# Load new data (example)
new_data = pd.DataFrame({
    'max_RRP': [90.49, 400],
    'max_GAP': [200, 700],
    'max_DEMAND': [7500, 8000]
    # 'solar': [22,0]
})

# Predict using the trained model
predictions = clf.predict(new_data)

# Show predictions
print(predictions)


[0 0]


In [42]:
import pandas as pd

# Load the 2024 data
data_2024 = pd.read_csv('daily_energy_metrics_2024.csv')

# Select the input features (same as training)
X_2024 = data_2024[['max_RRP', 'max_GAP', 'max_DEMAND']]

# Predict using the already trained model
predictions_2024 = clf.predict(X_2024)

# Add predictions as a new column
data_2024['predicted_any_condition'] = predictions_2024

# Save to new CSV if you want
data_2024.to_csv('daily_energy_metrics_2024_with_predictions.csv', index=False)



In [43]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# True labels from the 2024 file
y_true_2024 = data_2024['any_condition'].astype(int)  # make sure it's int (0/1)

# Compare with the predictions
accuracy = accuracy_score(y_true_2024, predictions_2024)
print(f"Accuracy on 2024 data: {accuracy:.2f}")

# Optional: Full classification report
print(classification_report(y_true_2024, predictions_2024))

# Optional: Confusion matrix
print(confusion_matrix(y_true_2024, predictions_2024))


Accuracy on 2024 data: 0.96
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       322
           1       1.00      0.64      0.78        45

    accuracy                           0.96       367
   macro avg       0.98      0.82      0.88       367
weighted avg       0.96      0.96      0.95       367

[[322   0]
 [ 16  29]]


In [44]:
# First, add the predictions to the 2024 dataframe if not already done
data_2024['predicted_any_condition'] = predictions_2024

# Find rows where prediction != actual
wrong_predictions = data_2024[data_2024['predicted_any_condition'] != data_2024['any_condition']]

# Show the dates and the mismatch
print(wrong_predictions[['date', 'any_condition', 'predicted_any_condition']])


           date  any_condition  predicted_any_condition
144  2024-05-24           True                        0
155  2024-06-04           True                        0
166  2024-06-15           True                        0
168  2024-06-17           True                        0
169  2024-06-18           True                        0
171  2024-06-20           True                        0
172  2024-06-21           True                        0
173  2024-06-22           True                        0
174  2024-06-23           True                        0
175  2024-06-24           True                        0
178  2024-06-27           True                        0
184  2024-07-03           True                        0
189  2024-07-08           True                        0
192  2024-07-11           True                        0
193  2024-07-12           True                        0
231  2024-08-19           True                        0


In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load your data (now using read_csv)
data = pd.read_csv('daily_energy_metrics.csv')

# Select features and target
X = data[['max_RRP', 'max_GAP', 'max_DEMAND']]  # features
y = data['any_condition'].astype(int)  # target (convert from bool to int 0/1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.svm import SVC

# Create the SVM classifier
svm_clf = SVC(kernel='rbf', random_state=42)

# Train the model
svm_clf.fit(X_train, y_train)

# Predict on the test set
svm_predictions = svm_clf.predict(X_test)

# Evaluate the SVM model
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print("SVM Accuracy on test data:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))
print(confusion_matrix(y_test, svm_predictions))



SVM Accuracy on test data: 0.972972972972973
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.33      0.50         3

    accuracy                           0.97        74
   macro avg       0.99      0.67      0.74        74
weighted avg       0.97      0.97      0.97        74

[[71  0]
 [ 2  1]]


In [46]:
# Predict using the trained SVM model
svm_predictions_2024 = svm_clf.predict(X_2024)

data_2024['predicted_any_condition'] = svm_predictions_2024

# Save to new CSV if you want
data_2024.to_csv('SVM_daily_energy_metrics_2024_with_predictions.csv', index=False)

# Evaluate performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
print("SVM Accuracy on 2024 data:", accuracy_score(y_true_2024, svm_predictions_2024))

# Full classification report
print(classification_report(y_true_2024, svm_predictions_2024))

# Confusion matrix
print(confusion_matrix(y_true_2024, svm_predictions_2024))


SVM Accuracy on 2024 data: 0.9264305177111717
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       322
           1       1.00      0.40      0.57        45

    accuracy                           0.93       367
   macro avg       0.96      0.70      0.77       367
weighted avg       0.93      0.93      0.91       367

[[322   0]
 [ 27  18]]


In [47]:
# (Assuming you already predicted with SVM and have svm_predictions_2024)

# Add the SVM predictions to your 2024 dataframe
data_2024['svm_predicted_any_condition'] = svm_predictions_2024

# Find the rows where SVM prediction != actual label
wrong_svm_predictions = data_2024[data_2024['svm_predicted_any_condition'] != data_2024['any_condition']]

# Show the dates, actual, and predicted values
print(wrong_svm_predictions[['date', 'any_condition', 'svm_predicted_any_condition']])


           date  any_condition  svm_predicted_any_condition
52   2024-02-22           True                            0
144  2024-05-24           True                            0
153  2024-06-02           True                            0
154  2024-06-03           True                            0
155  2024-06-04           True                            0
157  2024-06-06           True                            0
164  2024-06-13           True                            0
166  2024-06-15           True                            0
168  2024-06-17           True                            0
169  2024-06-18           True                            0
170  2024-06-19           True                            0
171  2024-06-20           True                            0
172  2024-06-21           True                            0
173  2024-06-22           True                            0
174  2024-06-23           True                            0
175  2024-06-24           True          

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load your data (now using read_csv)
data = pd.read_csv('SA_daily_energy_metrics.csv')

# Select features and target
X = data[['max_RRP', 'max_GAP', 'max_DEMAND']]  # features
y = data['any_condition'].astype(int)  # target (convert from bool to int 0/1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on test data
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      0.33      0.50         3

    accuracy                           0.93        29
   macro avg       0.96      0.67      0.73        29
weighted avg       0.94      0.93      0.92        29

[[26  0]
 [ 2  1]]


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# === Load & Clean Energy Data ===
energy_data = pd.read_csv('SA_daily_energy_metrics.csv')

# Parse date with dayfirst, drop bad rows
energy_data['date'] = pd.to_datetime(energy_data['date'], dayfirst=True, errors='coerce')
energy_data = energy_data.dropna(subset=['date'])
energy_data = energy_data.drop_duplicates(subset='date')

# === Load & Clean Wind Data ===
wind_data = pd.read_csv('windspeed_SA1.csv')

# Parse date with dayfirst, drop bad rows
wind_data['date'] = pd.to_datetime(wind_data['date'], dayfirst=True, errors='coerce')
wind_data = wind_data.dropna(subset=['date'])
wind_data = wind_data.drop_duplicates(subset='date')

# === Optional: Check date alignment ===
missing_in_wind = set(energy_data['date']) - set(wind_data['date'])
missing_in_energy = set(wind_data['date']) - set(energy_data['date'])

print("Dates in energy not in wind:", missing_in_wind)
print("Dates in wind not in energy:", missing_in_energy)

# === Merge datasets on date ===
merged_data = pd.merge(
    energy_data,
    wind_data[['date', 'max_windspeed']],
    on='date',
    how='inner'
)

print("Merged rows:", len(merged_data))

# === Drop rows with missing required values ===
merged_data = merged_data.dropna(subset=[
    'max_windspeed', 'max_RRP', 'max_GAP', 'max_DEMAND', 'any_condition'
])

print("Final rows after dropna:", len(merged_data))
print("Final class distribution:\n", merged_data['any_condition'].value_counts())

# === Features and Target ===
X = merged_data[['max_RRP', 'max_GAP', 'max_DEMAND', 'max_windspeed']]
y = merged_data['any_condition'].astype(int)

# === Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# === Train Random Forest Model ===
clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X_train, y_train)

# === Predictions & Evaluation ===
y_pred = clf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))


Dates in energy not in wind: set()
Dates in wind not in energy: set()
Merged rows: 60
Final rows after dropna: 60
Final class distribution:
 any_condition
False    56
True      4
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         1

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12


Confusion Matrix:
[[11  0]
 [ 0  1]]
