In [1]:
# Path setup
import sys
import os

sys.path.append("/home/dchen/Random_Forest_Weights/")

# Basics:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Helpful:
from sklearn.model_selection import train_test_split

# Pipeline and ColumnsTransformer:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
# models:
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor

# my functions:
from src_rf.methods.calc_mean import *
from src_rf.methods.calc_weights import *
from src_rf.methods.calc_dist import *

### 1. Load Data

In [2]:
# Read the energy CSV; the 'datetime' column becomes the index and is parsed as dates.
energy_csv_path = "/home/dchen/Random_Forest_Weights/src_rf/data/energy_data_hourly.csv"
df = pd.read_csv(energy_csv_path, index_col="datetime", parse_dates=True)

### 2. Train Test Split

In [3]:
# Separate the regression target from the remaining (feature) columns.
target_column = 'total_energy_usage'
y = df[target_column]
X = df.drop(columns=target_column)

In [4]:
# Order-preserving 70/30 split: with shuffle=False the first 70% of the rows
# become the training set and the remaining 30% the test set. random_state is
# omitted on purpose -- it has no effect when shuffling is disabled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

### 2b. Load Weights

In [5]:
import os
from scipy.sparse import load_npz
from scipy.sparse import vstack

def load_sparse_matrices_from_dir(dir_path):
    """Load every ``.npz`` sparse matrix stored directly inside a directory.

    Only regular files whose name ends in ``.npz`` are loaded, so stray
    files (notes, checkpoints, ``.DS_Store``) and sub-directories no longer
    make ``load_npz`` fail. Files are ordered with a natural sort, i.e.
    digit runs in the file name compare numerically, so ``tree_2.npz``
    comes before ``tree_10.npz`` and list position follows the numeric
    suffix instead of the character-by-character order.

    Parameters
    ----------
    dir_path : str
        Directory containing the ``.npz`` files.

    Returns
    -------
    list
        The loaded sparse matrices, in natural file-name order. Empty if
        the directory holds no ``.npz`` files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if ``dir_path`` cannot be listed.
    """
    import re  # local import: keeps this block self-contained

    def natural_key(file_name):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always digit runs and can be compared as ints.
        tokens = re.split(r"(\d+)", file_name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    npz_names = sorted(
        (
            name
            for name in os.listdir(dir_path)
            if name.endswith(".npz") and os.path.isfile(os.path.join(dir_path, name))
        ),
        key=natural_key,
    )
    return [load_npz(os.path.join(dir_path, name)) for name in npz_names]

def load_all_rf_weights(base_dir, num_batches, num_trees):
    """Load the per-tree weight matrices of every batch and stack them per tree.

    Batch ``i`` is read from ``<base_dir>/batch_<i>`` via
    ``load_sparse_matrices_from_dir``; the k-th matrix of each batch is
    treated as belonging to tree ``k``.

    Parameters
    ----------
    base_dir : str
        Directory that contains the ``batch_<i>`` sub-directories.
    num_batches : int
        Number of batch directories to read (``batch_0`` .. ``batch_<n-1>``).
    num_trees : int
        Number of matrices expected in every batch directory.

    Returns
    -------
    list
        ``num_trees`` sparse matrices; entry ``k`` is the row-wise stack
        (``scipy.sparse.vstack``) of tree ``k``'s matrices in batch order.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1, or if a batch directory does
        not yield exactly ``num_trees`` matrices. Without this check extra
        files caused an ``IndexError`` and missing files produced stacks
        with differing row counts per tree.
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    per_tree_chunks = [[] for _ in range(num_trees)]

    for batch_idx in range(num_batches):
        batch_dir = os.path.join(base_dir, f"batch_{batch_idx}")
        batch_weights = load_sparse_matrices_from_dir(batch_dir)

        if len(batch_weights) != num_trees:
            raise ValueError(
                f"Expected {num_trees} weight matrices in {batch_dir}, "
                f"found {len(batch_weights)}"
            )

        for tree_idx, weights in enumerate(batch_weights):
            per_tree_chunks[tree_idx].append(weights)

    # Build a fresh list instead of overwriting the list-of-lists in place.
    return [vstack(chunks) for chunks in per_tree_chunks]

# Rows per saved weight batch.
# NOTE(review): presumably this must equal the batch size used when the
# weights were written to disk -- confirm against the export script.
batch_size = 500

# Reuse the test split created in the "Train Test Split" cell. Re-reading the
# CSV and re-splitting here would overwrite `df`, `X` and `X_test` with
# differently typed objects (no parsed dates, bare NumPy arrays) only to
# obtain the same row count.
num_samples = X_test.shape[0]

# Usage
base_dir = "/Data/Delong_BA_Data/rf_weights/"
num_batches = (num_samples + batch_size - 1) // batch_size  # ceiling division to find number of batches
num_trees = 300  # number of weight matrices expected per batch (one per tree of the forest)

rf_weights_loaded = load_all_rf_weights(base_dir, num_batches, num_trees)

### 3. Random Forest

In [10]:
# 3.1 Parameters for Weight_Calculation:
bootstrap = True
max_samples = 0.5
# 3.2 Parameters for RF
n_estimators = 300
min_samples_split = 5
min_samples_leaf = 5
# max_depth is documented as int or None; a float such as 40.0 is rejected by
# scikit-learn releases that validate estimator parameters.
max_depth = 40

# 3.3 Model Training
rf = RandomForestRegressor(
    bootstrap=bootstrap,
    max_samples=max_samples,
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    verbose=0,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=40.0, max_samples=0.5, min_samples_leaf=5,
                      min_samples_split=5, n_estimators=300, n_jobs=-1,
                      random_state=42)

### 4. Sanity Check

In [11]:
# Container type returned by load_all_rf_weights.
type(rf_weights_loaded)

list

In [12]:
# Number of entries in the loaded list; load_all_rf_weights builds one entry per tree.
len(rf_weights_loaded)

300

In [13]:
# Inspect the type of the first list entry (the stacked weights at index 0).
first_tree_weights = rf_weights_loaded[0]
print(type(first_tree_weights))

<class 'scipy.sparse.csr.csr_matrix'>


In [14]:
# calc_mean_rf comes from the src_rf wildcard imports (implementation not shown here).
# NOTE(review): presumably it combines the per-tree weights with y_train into one
# mean prediction per test row -- confirm against src_rf.methods.calc_mean.
rf_mean_weights = calc_mean_rf(rf_weights_loaded, y_train)

In [15]:
# rf_mean_weights is already computed in the preceding cell, so the identical
# calc_mean_rf call is not repeated here; only the forest's own predictions
# are still needed for the comparison.
rf_mean_normal = rf.predict(X_test)

In [16]:
# Are the two the same?
# Count the positions where both predictions agree after rounding to 5 decimals,
# restricted to as many rows as the first loaded weight matrix has.
n_weight_rows = rf_weights_loaded[0].shape[0]
weights_rounded = np.round(rf_mean_weights, 5)
forest_rounded = np.round(rf_mean_normal[0:n_weight_rows], 5)
sum(weights_rounded == forest_rounded)

20456