In [1]:
# Dataset
from sklearn.datasets import load_diabetes

# Basics
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Model
from sklearn.ensemble import RandomForestRegressor

# Helpful:
from sklearn.model_selection import train_test_split

# Path setup
import sys
import os

# Make the project checkout importable so the `src.methods` modules below resolve.
# NOTE(review): this is a machine-specific absolute path; adjust it when the
# notebook is run from another machine or checkout location.
PROJECT_ROOT = "/home/dchen/Random_Forest_Weights/"
if PROJECT_ROOT not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

# my functions:
# The notebook later calls calc_weights_rf, calc_mean_rf, calc_dist_rf and
# calc_quantile_rf without defining them, so they presumably come from these
# wildcard imports; which module exports which name is not visible from here.
from src.methods.calc_mean import *
from src.methods.calc_weights import *
from src.methods.calc_dist import *

### 1. Load Data

In [2]:
# Read the hourly energy CSV with its "datetime" column as the row index,
# then convert the index labels to timestamps.
csv_path = "datasets/energy_data_hourly.csv"
df = pd.read_csv(csv_path, index_col="datetime")
timestamps = pd.to_datetime(df.index)
df.index = timestamps

In [3]:
# Derive two calendar features from the timestamp index:
#   'weekday' - the name of the day of the week
#   'time'    - the time-of-day component of each timestamp
stamp_index = df.index
df["weekday"], df["time"] = stamp_index.day_name(), stamp_index.time

In [4]:
# One-hot encode the weekday name. The empty prefix and separator make each
# dummy column carry the bare weekday label as its name.
df["weekday"] = df["weekday"].astype("category")
df = pd.get_dummies(df, columns=["weekday"], prefix="", prefix_sep="")

# Express the time of day as minutes since midnight (seconds are ignored).
# The 'time' column was derived from the timestamp index, so the value is
# computed straight from the index with vectorized integer arithmetic instead
# of a per-element Python lambda; int64 matches what the lambda produced.
df["time"] = (df.index.hour * 60 + df.index.minute).to_numpy(dtype="int64")

### 2. Train Test Split

In [5]:
# Separate the target from the features by name rather than by position, so the
# target can never end up inside X if the CSV's column order differs from what
# a positional slice would assume.
target_column = "total_energy_usage"

# shuffle=False keeps the row order: the first 80% of the rows become the
# training set and the last 20% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target_column]).to_numpy(),
    df[target_column].to_numpy(),
    test_size=0.2,
    shuffle=False,
)

### 3. Random Forest

In [6]:
# Sampling settings, passed both to the forest constructor and to the weight
# calculation further down: bootstrap sampling on, max_samples fraction 0.7.
bootstrap, max_sample = True, 0.7

In [7]:
# Build the regressor with the sampling settings defined above.
# random_state is pinned so that refitting after a kernel restart reproduces
# the same trees; otherwise the weights cached on disk further down would no
# longer correspond to the forest that rf.predict() uses.
rf = RandomForestRegressor(
    bootstrap=bootstrap,
    max_samples=max_sample,
    random_state=42,
    verbose=0,
    n_jobs=-1,  # use all available cores
)

In [8]:
# Fit the forest on the training split; as the last expression of the cell,
# the fitted estimator's repr is displayed as the cell output.
rf.fit(X_train, y_train)

RandomForestRegressor(max_samples=0.7, n_jobs=-1)

### 3.1 Calculate Random Forest Weights

In [None]:
# Call the project helper with the fitted forest, both feature splits and the
# sampling settings used to build the forest.
# NOTE(review): presumably returns, per test point, a weight for every training
# observation - confirm against src.methods.calc_weights.
# This step is slow: the recorded progress bar shows 100 iterations at ~12.6 s each.
rf_weights = calc_weights_rf(rf, X_train, X_test, bootstrap, max_sample)

 18%|█████████████▋                                                              | 18/100 [02:12<17:16, 12.64s/it]

In [None]:
# Persist the weights so the slow computation above does not have to be repeated.
# NOTE(review): the "True_0_7" suffix presumably encodes bootstrap=True and
# max_sample=0.7 - keep it in sync if those settings change.
rf_weights_path = "/home/dchen/Random_Forest_Weights/data/rf_weights/energy_data/rf_weights_True_0_7.npy"
# np.save does not create missing directories; make sure the target exists so
# the result of the long computation is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(rf_weights_path), exist_ok=True)
np.save(rf_weights_path, rf_weights)

### 4. Calculate the Mean Prediction: Weight-Based vs. Standard `rf.predict`

In [None]:
# Read the previously stored weights array back from disk.
rf_weights = np.load(
    file="/home/dchen/Random_Forest_Weights/data/rf_weights/energy_data/rf_weights_True_0_7.npy"
)

In [None]:
# Reference prediction straight from scikit-learn.
rf_mean_normal = rf.predict(X_test)

# Prediction obtained from the weights and the training targets via the
# project helper; it is compared against the reference in the next cell.
rf_mean_weights = calc_mean_rf(rf_weights, y_train)

In [None]:
# Are the two the same?
# Compare with an absolute tolerance of 1e-5 instead of rounding both sides to
# five decimals: rounding can split two nearly identical values that straddle a
# rounding boundary. The result is a single True/False for the whole array,
# which answers the question directly (the old expression gave a match count).
np.allclose(rf_mean_weights, rf_mean_normal, rtol=0.0, atol=1e-5)

### 5. Calc Quantile Random Forest

In [None]:
# Pass the weights and the training targets to the project helper calc_dist_rf.
# NOTE(review): judging by the names, the result is presumably one cumulative
# distribution per test point - confirm against src.methods.calc_dist.
rf_cdfs = calc_dist_rf(rf_weights, y_train)

In [None]:
# Request the 0.5 quantile (the median, hence the variable name) from the
# distributions computed above; the training targets are passed along as well.
rf_median = calc_quantile_rf(rf_cdfs, 0.5, y_train)

In [None]:
# Show only the first ten entries instead of dumping the whole result.
rf_median[:10]