In [2]:
# Path setup
import sys
import os

# Make the project package (`src_rf`) importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable location so the notebook runs elsewhere.
PROJECT_ROOT = "/home/dchen/Random_Forest_Weights/"
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Basics:
import pandas as pd
import numpy as np

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Helpful:
from sklearn.model_selection import train_test_split

# Pipeline and ColumnTransformer:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# models:
import statsmodels.api as sm

# my functions:
from src_rf.methods.calc_mean import *
from src_rf.methods.calc_weights import *
from src_rf.methods.calc_dist import *
from src_rf.datasets.load_weights_energy import * 

### 0. Setup

In [3]:
def quantile_loss(y_true, y_pred, tau):
    """Pinball (quantile) loss of a prediction at quantile level ``tau``.

    Under-prediction (``y_true > y_pred``) is penalised with weight ``tau``
    and over-prediction with weight ``1 - tau``.

    Parameters
    ----------
    y_true : scalar or array-like supporting arithmetic (ndarray, Series)
        Observed value(s).
    y_pred : scalar or array-like supporting arithmetic
        Predicted quantile value(s).
    tau : float
        Quantile level.

    Returns
    -------
    Scalar for scalar inputs, otherwise the elementwise loss with the same
    shape as the broadcast inputs.
    """
    # np.maximum is elementwise, so this also works for arrays/Series, where
    # the builtin max() would raise on the ambiguous truth value.
    return np.maximum(tau * (y_true - y_pred), (1 - tau) * (y_pred - y_true))

In [4]:
# Quantile levels to predict; the outermost pair bounds a central 95% band.
quantiles = [
    0.025,
    0.25,
    0.5,  # median
    0.75,
    0.975,
]

### 1. Load Data and train test split

In [5]:
# Read the hourly energy data, using the parsed 'datetime' column as the index.
# NOTE(review): machine-specific absolute path.
df = pd.read_csv(
    "/home/dchen/Random_Forest_Weights/src_rf/data/energy_data_hourly.csv",
    parse_dates=True,
    index_col="datetime",
)

In [6]:
# Target is the total energy usage; every other column is a feature.
y = df['total_energy_usage']
X = df.drop(columns=['total_energy_usage'])

In [7]:
# Order-preserving split (shuffle=False): the leading 70% of rows become the
# training set and the trailing 30% the test set. random_state only governs
# shuffling, so it has no effect here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

In [41]:
# Dimensions of the held-out feature matrix (rows, columns).
X_test.shape

(20456, 11)

### 2. Load Random Forest Weights

In [8]:
# Load the precomputed random-forest weights through the project helper.
# Presumably a sequence of (n_test, n_train) weight matrices — TODO confirm
# against load_weights_energy.
rf_weights = load_weights_energy()

In [38]:
# Build an indexable collection of callables from the weights and training
# targets; each one is later evaluated as cdfs[i](value).
# Presumably these are weighted empirical CDFs of y_train, one per test row —
# TODO confirm against calc_dist_rf.
cdfs = calc_dist_rf(rf_weights, y_train)

In [61]:
# Evaluate the callable for row 2578 at the smallest training target.
cdfs[2578](min(y_train))

0.0386

In [66]:
# Smallest value among the training targets.
min(y_train)

0.0

In [70]:
# Number of training targets.
y_train.shape

(47728,)

In [69]:
# How many training targets are nonzero (the remainder are exact zeros).
np.count_nonzero(y_train)

47722

In [72]:
# Recompute by hand the total weight sitting on training targets at or below
# the smallest training target, for comparison with the callable's value.
# `test` is the row-2578 weight vector; it is defined in a cell that appears
# further down in this notebook, so that cell must have been run first.
# A scratch name is used so the target Series `y` (needed by the train/test
# split cell) is not overwritten.
y_min = min(y_train)
sum(w for point, w in zip(y_train, test) if point <= y_min)

0.03858730158730159

In [77]:
# Print, for every row, how far the callable's value at the largest training
# target falls below 0.025 (only rows where it does are reported).
# NOTE(review): `tqdm` is not imported in the visible import cell; presumably
# it arrives via one of the star imports — confirm.
# max(y_train) does not change inside the loop, so compute it once instead of
# rescanning the whole training target for every row.
y_max = max(y_train)
for i in tqdm(range(len(cdfs))):
    # Evaluate once and reuse for both the check and the printout.
    cdf_at_max = cdfs[i](y_max)
    if cdf_at_max - 0.025 < 0:
        print(cdf_at_max - 0.025)

 13%|██████████▌                                                                     | 2706/20456 [00:25<02:45, 107.42it/s]


KeyboardInterrupt: 

In [14]:
# Elementwise mean of all weight matrices in rf_weights.
avg_weights = sum(rf_weights) / len(rf_weights)

In [58]:
# Extract row 2578 of the averaged weights as a dense 1-D array.
# (getrow/toarray suggest avg_weights is a scipy sparse matrix — TODO confirm.)
test = avg_weights.getrow(2578).toarray().squeeze()

In [60]:
# Show only the nonzero entries of the row-2578 weight vector.
test[test != 0]

array([0.00792063, 0.00307419, 0.00182407, 0.00033333, 0.0007037 ,
       0.00066667, 0.00419986, 0.00606044, 0.00141667, 0.00627573,
       0.00055556, 0.0187061 , 0.01027421, 0.01667539, 0.01026479,
       0.00529853, 0.00589045, 0.00161905, 0.00114286, 0.00064815,
       0.004848  , 0.00212537, 0.0436711 , 0.03556148, 0.02789702,
       0.01746396, 0.00495374, 0.00178571, 0.0012963 , 0.00677861,
       0.00141667, 0.0051746 , 0.00872102, 0.00122619, 0.00134259,
       0.00744821, 0.00223545, 0.00074074, 0.00033333, 0.00317989,
       0.00252564, 0.00367989, 0.00485618, 0.00188131, 0.00145262,
       0.01458349, 0.00454497, 0.00186508, 0.00060606, 0.00083333,
       0.00424868, 0.00491306, 0.00111111, 0.00030303, 0.00037037,
       0.00586267, 0.00092593, 0.00111111, 0.00030303, 0.00626984,
       0.00157916, 0.00222222, 0.00033333, 0.01008689, 0.00624735,
       0.0022197 , 0.00183217, 0.00233217, 0.00507724, 0.00677417,
       0.00037037, 0.00435012, 0.00149074, 0.00256481, 0.00041

In [37]:
# Number of nonzero weights in the row-2578 weight vector.
np.count_nonzero(test)

218

In [12]:
# Shape of the first weight matrix; compare with the X_test and y_train shapes.
rf_weights[0].shape

(20456, 47728)

In [13]:
# Dimensions of the test feature matrix, for comparison with the weight matrix.
X_test.shape

(20456, 11)

### 3. Calculate Quantiles:

In [8]:
# Build the per-row distribution objects consumed by calc_quantile_rf below.
# NOTE(review): this is the same call that produces `cdfs` in the section
# above; consider computing it once and reusing the result — confirm.
rf_dist = calc_dist_rf(rf_weights, y_train)

In [10]:
# One column of predictions per requested quantile level, one row per test
# observation. The width follows `quantiles` instead of a hard-coded 5.
quantile_preds = np.zeros((len(y_test), len(quantiles)))
for count, q in enumerate(quantiles):
    # Pass the current level `q`; the previous hard-coded 0.5 filled every
    # column with the same median prediction.
    # Assumes the second argument of calc_quantile_rf is the quantile level —
    # TODO confirm against its definition.
    quantile_preds[:, count] = np.array(calc_quantile_rf(rf_dist, q, y_train))