In [251]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score


In [160]:
def extract_datetime_features(df):
    """Add calendar features and the EV-presence label to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``local_15min`` column (the ``.dt``
        accessor is used on it) and an ``ev_car`` column that can be compared
        with ``>= 1``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus ``hour``, ``minute``,
        ``day_of_week`` (Monday=0 ... Sunday=6), ``day_of_month``, ``month``,
        ``weekend`` (bool) and ``ev_present`` (1 when ``ev_car >= 1``, else 0).
        The frame passed in is left untouched, so re-running a cell never
        sees columns leaked by an earlier call.

    Raises
    ------
    KeyError
        If ``local_15min`` or ``ev_car`` is missing.
    AttributeError
        If ``local_15min`` is not datetime-like.
    """
    timestamps = df['local_15min'].dt
    day_of_week = timestamps.dayofweek

    # ``assign`` builds a new frame instead of writing into the caller's one.
    return df.assign(
        hour=timestamps.hour,
        minute=timestamps.minute,
        day_of_week=day_of_week,
        day_of_month=timestamps.day,
        month=timestamps.month,
        weekend=day_of_week >= 5,  # Saturday (5) or Sunday (6)
        # Vectorized threshold; a NaN reading compares False and becomes 0,
        # exactly as the per-row conditional would.
        ev_present=(df['ev_car'] >= 1).astype('int64'),
    )

In [222]:
def preprocess_data(df, test_size=0.25):
    """Turn the raw frame into train/test feature matrices and label vectors.

    Steps: parse ``local_15min``, derive calendar features and the
    ``ev_present`` label via ``extract_datetime_features``, drop the raw
    timestamp and the ``ev_car``/``solar``/``grid``/``total_usage`` columns,
    label-encode ``house_type``, then split without shuffling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``local_15min``, ``ev_car``, ``solar``, ``grid``,
        ``total_usage`` and ``house_type``. Every remaining column is used as
        a feature. The frame is not modified.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Build a new frame rather than overwriting the caller's column, so the
    # DataFrame handed in (typically a notebook global) stays as loaded.
    # NOTE(review): utc=True normalises timestamps to UTC, so the calendar
    # features below are UTC-based even though the column is named
    # ``local_15min`` -- confirm that this is intended.
    frame = df.assign(local_15min=pd.to_datetime(df['local_15min'], utc=True))

    # Extract useful features from datetime
    frame = extract_datetime_features(frame)

    # Drop the raw timestamp together with the measurement columns that are
    # not used as features.
    frame = frame.drop(columns=['local_15min', 'ev_car', 'solar', 'grid', 'total_usage'])

    # Encode categorical columns as integer codes.
    for column in ['house_type']:  # Add other categorical columns as needed
        frame[column] = LabelEncoder().fit_transform(frame[column])

    # Separate the label from the features (everything except 'ev_present').
    y = frame.pop('ev_present').values
    X = frame.values

    # shuffle=False keeps the row order: the first rows form the training set
    # and the last ``test_size`` share forms the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    # No feature scaling is applied.
    return X_train, X_test, y_train, y_test

In [242]:
def train_decision_tree(X_train, y_train, X_test, y_test, max_depth=5, cv=5):
    """Fit a decision tree, print its evaluation and return the results.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels.
    X_test, y_test : array-like
        Held-out features and binary labels.
    max_depth : int, default 5
        Maximum depth of the tree.
    cv : int, default 5
        ``cv`` argument forwarded to ``cross_val_score`` on the training set.

    Returns
    -------
    dict
        Keys ``model`` (the fitted classifier), ``train_accuracy``,
        ``test_accuracy``, ``precision``, ``recall``, ``f1`` and
        ``cv_scores``. Earlier callers that ignore the return value are
        unaffected.
    """
    # Fixed random_state keeps the fitted tree reproducible between runs.
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)

    # Predictions on both splits so train and test accuracy can be compared.
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    # zero_division=0 reports 0 (without a warning) when a class is never
    # predicted or never present in the evaluated split.
    precision = precision_score(y_test, y_pred_test, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred_test, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred_test, average='binary', zero_division=0)

    # cross_val_score fits clones of the estimator, so ``clf`` itself keeps
    # the fit on the full training set.
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print(f"Cross-validation scores: {scores}")

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Same zero_division policy as the scalar metrics above, so the report
    # does not emit undefined-metric warnings the other scores suppress.
    report = classification_report(y_test, y_pred_test, zero_division=0)
    print("\nDetailed Classification Report:\n", report)

    return {
        'model': clf,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cv_scores': scores,
    }


In [243]:
def main(df):
    """Run the whole pipeline on ``df``: preprocess and split, then fit and report.

    Nothing is returned; the evaluation is printed by ``train_decision_tree``.
    """
    train_features, test_features, train_labels, test_labels = preprocess_data(df)

    # Note the argument order: both training arrays first, then both test arrays.
    train_decision_tree(train_features, train_labels, test_features, test_labels)

In [248]:
# Components of the dataset path: data source, area and house id.
source, area, house = 'pecan_street', 'austin', 6139

# The path is relative to the notebook's working directory.
csv_path = f'../../Dataset/{source}/{area}/house_energy_compressed/{house}_compressed.csv'

# Ask pandas to parse the timestamp column while reading.
df = pd.read_csv(csv_path, parse_dates=['local_15min'])

In [None]:
# Run preprocessing, training and evaluation on the frame loaded above.
main(df)