In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
def extract_datetime_features(df):
    """Return a copy of ``df`` with calendar features and the EV target added.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``local_15min`` column (the ``.dt``
        accessor is used on it) and a numeric ``ev_car`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``hour``, ``minute``, ``day_of_week``,
        ``day_of_month``, ``month``, ``weekend`` (bool) and ``ev_present``
        (integer 0/1).
    """
    timestamps = df['local_15min'].dt
    return df.assign(
        hour=timestamps.hour,
        minute=timestamps.minute,
        day_of_week=timestamps.dayofweek,
        day_of_month=timestamps.day,
        month=timestamps.month,
        # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
        weekend=timestamps.dayofweek >= 5,
        # 1 when the ev_car reading is at least 1, else 0. Missing readings
        # compare as False and therefore map to 0.
        ev_present=(df['ev_car'] >= 1).astype('int64'),
    )

In [3]:
def preprocess_data(df):
    """Build chronological train/test arrays from one house's readings.

    The caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``local_15min``, ``ev_car``, ``solar``, ``grid``,
        ``total_usage`` and ``house_type``. Any other columns are kept as
        features and must be numeric.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The split is not shuffled, so
        the last 25% of the rows form the test set.
        NOTE(review): this assumes the rows arrive in chronological order;
        nothing here sorts them.
    """
    # assign() creates a new frame, so the caller's `local_15min` column is
    # not overwritten.
    # NOTE(review): utc=True makes the timestamps UTC-based, so the derived
    # hour/day features follow UTC rather than local wall-clock time — confirm
    # this is intended.
    df = df.assign(local_15min=pd.to_datetime(df['local_15min'], utc=True))

    # Derive calendar features and the `ev_present` target.
    df = extract_datetime_features(df)

    # Drop the raw timestamp, the column the target was derived from, and the
    # other power readings that are not used as features.
    df = df.drop(columns=['local_15min', 'ev_car', 'solar', 'grid', 'total_usage'])

    # Integer-encode categorical columns (extend the list as needed).
    for column in ['house_type']:
        df[column] = LabelEncoder().fit_transform(df[column])

    # Separate the target from the features.
    y = df['ev_present'].values
    # Request a float matrix explicitly: the boolean `weekend` column would
    # otherwise make pandas fall back to an object-dtype array.
    X = df.drop(columns=['ev_present']).to_numpy(dtype=float)

    # Chronological hold-out split. No feature scaling is applied.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=False
    )

    return X_train, X_test, y_train, y_test

In [9]:
def train_random_forest_with_grid_search(X_train, y_train, X_test, y_test,
                                         param_grid=None, cv=5, scoring='accuracy'):
    """Grid-search a random forest, print its metrics and return the best model.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary (0/1) labels.
    X_test, y_test : array-like
        Held-out features and labels used for the printed test metrics.
    param_grid : dict, optional
        Hyperparameter grid passed to ``GridSearchCV``. Defaults to the
        108-candidate grid defined below.
    cv : int or cross-validation splitter, optional
        Passed to ``GridSearchCV``. Defaults to 5 folds.
    scoring : str or callable, optional
        Model-selection metric passed to ``GridSearchCV``. Defaults to
        ``'accuracy'``.

    Returns
    -------
    RandomForestClassifier
        ``grid_search.best_estimator_``, the estimator with the best
        parameters found.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    # A fixed random_state keeps the forest reproducible between runs.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Predict on both splits so train and test accuracy can be compared.
    y_pred_train = best_model.predict(X_train)
    y_pred_test = best_model.predict(X_test)

    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    # zero_division=0 reports 0 when a metric is undefined.
    precision = precision_score(y_test, y_pred_test, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred_test, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred_test, average='binary', zero_division=0)

    # best_score_ is a single number: the mean cross-validated score of the
    # winning candidate. The printed label is kept unchanged.
    best_cv_score = grid_search.best_score_
    print(f"Cross-validation scores: {best_cv_score}")

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Same zero_division policy as the individual metrics above.
    print("\nDetailed Classification Report:\n",
          classification_report(y_test, y_pred_test, zero_division=0))
    print("Best parameters found: ", grid_search.best_params_)

    return best_model


In [10]:
def main(df):
    """Run the full pipeline for one house: preprocess, then fit and report.

    ``df`` is the raw per-house DataFrame. Nothing is returned; the metrics
    are printed by the training step.
    """
    features_train, features_test, labels_train, labels_test = preprocess_data(df)

    # preprocess_data returns (X_train, X_test, y_train, y_test), while the
    # trainer expects (X_train, y_train, X_test, y_test).
    train_random_forest_with_grid_search(
        features_train, labels_train, features_test, labels_test
    )

In [11]:
# Dataset name and region. Both are interpolated into the per-house CSV paths
# that are read when the evaluation loops run.
source = 'pecan_street'
area = 'austin'

In [12]:
# Fit and report one model per house, using the currently selected `source`
# and `area`.
houses = [661, 1642, 4373, 4767, 6139, 8156]
for house in houses:
    csv_path = f'../../Dataset/{source}/{area}/house_energy_compressed/{house}_compressed.csv'
    df = pd.read_csv(csv_path, parse_dates=['local_15min'])
    print(f"\nHouse {house}:")
    main(df)
    print("\n\n")


House 661:
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Cross-validation scores: 0.981502569561427
Training Accuracy: 0.9933
Test Accuracy: 0.9717
Precision: 0.9875
Recall: 0.5651
F1 Score: 0.7188

Detailed Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      8197
           1       0.99      0.57      0.72       561

    accuracy                           0.97      8758
   macro avg       0.98      0.78      0.85      8758
weighted avg       0.97      0.97      0.97      8758

Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}




House 1642:
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Cross-validation scores: 0.966712104824438
Training Accuracy: 0.9872
Test Accuracy: 0.9427
Precision: 0.9817
Recall: 0.4340
F1 Score: 0.6019

Detailed Classification Report:
               precision    recall  f1-score   support

   

In [13]:
# Switch to the New York houses. `area` is reassigned here, so cells run after
# this one see 'new_york'.
houses = [27, 3000, 5679, 9053]
area = 'new_york'
for house in houses:
    csv_path = f'../../Dataset/{source}/{area}/house_energy_compressed/{house}_compressed.csv'
    df = pd.read_csv(csv_path, parse_dates=['local_15min'])
    print(f"\nHouse {house}:")
    main(df)
    print("\n\n")


House 27:
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Cross-validation scores: 0.9913944315049468
Training Accuracy: 0.9925
Test Accuracy: 0.9925
Precision: 0.9653
Recall: 0.9476
F1 Score: 0.9564

Detailed Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4034
           1       0.97      0.95      0.96       382

    accuracy                           0.99      4416
   macro avg       0.98      0.97      0.98      4416
weighted avg       0.99      0.99      0.99      4416

Best parameters found:  {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}




House 3000:
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Cross-validation scores: 0.9818072750842255
Training Accuracy: 0.9952
Test Accuracy: 0.9880
Precision: 0.9446
Recall: 0.9236
F1 Score: 0.9340

Detailed Classification Report:
               precision    recall  f1-score   support

    