In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import math
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, RepeatedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression
import cudf
from cuml.ensemble import RandomForestRegressor as cuRF
from tqdm import tqdm
import xgboost as xgb

#### Output file

In [2]:
# Create the log directory first so open() does not fail on a fresh checkout.
os.makedirs("gpu_logs", exist_ok=True)
# Shared log handle: the metric/timing prints further down write to this file,
# and it is closed explicitly after the averaged metrics are written.
sourceFile = open("gpu_logs/shared_meteo.txt", 'w')

### Load the data

In [3]:
# Read the raw measurements, renumber the index from zero and turn the
# timestamp column into real datetimes so time arithmetic works later on.
df = pd.read_csv("data/shared_meteo.csv").reset_index(drop=True)
df['Time'] = pd.to_datetime(df['Time'])
df

Unnamed: 0,Time,Energy,Location,Temp_Med,Temp_Max,Temp_Min,Rumo_Vento_Med,Rumo_Vento_Max,Intensidade_Vento_Med,Intensidade_Vento_Max,Precip,Rad_Total
0,2020-10-06 00:15:00,0.076,PT41CV,11.10,11.20,11.00,285.0,287.0,3.10,4.40,0.00,-990.0
1,2020-10-06 00:30:00,0.072,PT41CV,11.00,11.15,10.85,295.5,298.5,2.85,3.85,0.00,-990.0
2,2020-10-06 00:45:00,0.072,PT41CV,11.30,11.50,11.00,260.0,276.0,2.00,3.00,0.00,-990.0
3,2020-10-06 01:00:00,0.068,PT41CV,11.45,11.60,11.20,196.0,211.0,1.70,2.55,0.00,-990.0
4,2020-10-06 01:15:00,0.032,PT41CV,10.80,11.10,10.60,162.0,191.0,1.30,1.70,0.00,-990.0
...,...,...,...,...,...,...,...,...,...,...,...,...
576694,2022-11-25 00:00:00,0.764,PT87ZW,12.65,12.70,12.55,159.0,139.0,2.35,3.65,0.05,0.0
576695,2022-11-25 00:15:00,0.660,PT87ZW,12.60,12.70,12.60,160.0,158.0,1.90,3.60,0.00,0.0
576696,2022-11-25 00:30:00,0.692,PT87ZW,12.60,12.70,12.50,156.5,153.5,2.20,3.25,0.00,0.0
576697,2022-11-25 00:45:00,0.660,PT87ZW,12.60,12.70,12.40,147.0,172.0,2.00,3.30,0.00,0.0


In [None]:
## Number of houses (distinct Location codes)
df['Location'].nunique()

In [None]:
# Discard every row that contains a missing value, then renumber the index.
df = df.dropna().reset_index(drop=True)
df

In [None]:
## Number of records per house (rows per Location code)
df['Location'].value_counts()

In [None]:
#df.to_csv("../datasets/shared_complete.csv", index=False)

### Auxiliary functions

In [None]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    """Scatter predicted against actual values with a y=x reference line.

    Args:
        preds: Predicted values (plotted on the y axis).
        actuals: Ground-truth values (plotted on the x axis).
        title: Title shown above the figure.

    Both axes are anchored at zero and keep their autoscaled upper bound.
    The reference diagonal is drawn up to the larger of the two upper bounds,
    so it always spans the visible area instead of stopping at a fixed 100.
    """
    plt.scatter(actuals, preds, c='b', label='predicted')
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(title)

    # Read the autoscaled upper limits before pinning the lower limits to 0.
    x_upper = plt.xlim()[1]
    y_upper = plt.ylim()[1]
    plt.xlim(0, x_upper)
    plt.ylim(0, y_upper)

    # Perfect-prediction line, long enough to cross the whole visible window.
    diagonal_end = max(x_upper, y_upper)
    plt.plot([0, diagonal_end], [0, diagonal_end], '--r', label='y=x')
    plt.legend()
    plt.show()

def truncate_metric(metric):
    """Cut `metric` off after four decimal places (truncation toward zero, no rounding)."""
    scale = 10000
    return math.trunc(scale * metric) / scale
    
def performance_metrics(preds: np.array, actuals: np.array, filename):
    """Compute MSE, WAPE and R2, write them to a log stream and return them.

    Args:
        preds: Predicted values.
        actuals: Ground-truth values.
        filename: Despite the name, this is passed to ``print(file=...)``,
            so it must be an open writable text stream.

    Returns:
        Tuple ``(mse, wape, r2)``. Each metric is truncated to four decimals;
        WAPE is multiplied by 100 after truncation (i.e. a percentage).
    """
    mse = truncate_metric(mean_squared_error(actuals, preds))

    # WAPE: total absolute error relative to total absolute actual value.
    absolute_error = np.sum(np.abs(preds - actuals))
    absolute_actual = np.sum(np.abs(actuals))
    wape = truncate_metric(absolute_error / absolute_actual) * 100

    r2 = truncate_metric(r2_score(actuals, preds))

    # One log line per metric, in the order MSE, WAPE, R2.
    for template, value in (('MSE: %.4f', mse), ('WAPE: %.2f', wape), ('R2: %.4f', r2)):
        print(template % value, file=filename)
    return mse, wape, r2

#@jit(target_backend='cuda')
def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    """Fit `estimator` on the training data and predict for `X_test`.

    The estimator passed in is fitted directly (no copy is made).

    Returns:
        Tuple ``(estimator, predictions)`` where `estimator` is the same,
        now fitted, object and `predictions` is ``estimator.predict(X_test)``.
    """
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)

def validate(estimator, X_train, y_train):
    """Cross-validate `estimator` on the training data.

    Scores with R2 and negative mean squared error and returns whatever
    ``cross_validate`` returns for those two scorers.
    """
    scorers = ['r2', 'neg_mean_squared_error']
    return cross_validate(estimator, X_train, y_train, scoring=scorers)

In [None]:
def total_averaged_metrics(metrics_list, filename):
    """Write the mean MSE, WAPE and R2 over all runs to a log stream.

    Args:
        metrics_list: Sequence of ``(mse, wape, r2)`` tuples, one per run.
        filename: Despite the name, an open writable text stream (it is
            handed to ``print(file=...)``).

    Each mean is rounded to three decimals before being written.
    """
    run_count = len(metrics_list)

    # Split the per-run tuples into one list per metric.
    mse_values, wape_values, r2_values = [], [], []
    for mse, wape, r2 in metrics_list:
        mse_values.append(mse)
        wape_values.append(wape)
        r2_values.append(r2)

    report = (
        ("Total Averaged MSE: {}", mse_values),
        ("Total Averaged WAPE: {}", wape_values),
        ("Total Averaged R2: {}", r2_values),
    )
    for template, values in report:
        print(template.format(np.round(sum(values) / run_count, 3)), file=filename)

def past_timesteps(df, number_of_timesteps):
    """Add lagged energy readings as features and drop incomplete rows.

    For every lag ``i`` in ``1..number_of_timesteps`` a column
    ``energy_lag_{i}`` is created holding the ``Energy`` value from ``i`` rows
    earlier. The value is kept only when that earlier row

    * is exactly ``i * 15`` minutes before the current row, and
    * belongs to the same ``Location`` (checked only if the frame has a
      ``Location`` column), so readings never leak across houses.

    Rows with any missing value (including rows lacking a full lag history)
    are dropped and the index is renumbered from zero.

    Args:
        df: Frame with a datetime ``Time`` column and an ``Energy`` column.
            Presumably sorted by house and then by time, since lags are taken
            by row position -- verify against the data source.
        number_of_timesteps: Number of 15-minute lags to create.

    Returns:
        A new DataFrame; the input frame is left untouched, so the calling
        cell can be re-run safely.
    """
    times = df['Time']
    energy = df['Energy']
    houses = df['Location'] if 'Location' in df.columns else None

    # Collect all lag columns first and attach them in one concat; inserting
    # hundreds of columns one by one fragments the frame and is slow.
    lag_columns = {}
    for i in tqdm(range(1, (number_of_timesteps + 1))):
        contiguous = times.shift(i) == times - pd.Timedelta(i * 15, 'm')
        if houses is not None:
            contiguous &= houses.shift(i) == houses
        lag_columns[f"energy_lag_{i}"] = energy.shift(i).where(contiguous)

    # If the frame already carries lag columns, replace them rather than
    # producing duplicate column names.
    base = df.drop(columns=[name for name in lag_columns if name in df.columns])
    lagged = pd.concat([base, pd.DataFrame(lag_columns, index=df.index)], axis=1)
    return lagged.dropna().reset_index(drop=True)

# def last_energy_points_full(df, number_timesteps, num_samples_per_house):
#     X = pd.DataFrame()
#     other_feats = df.iloc[:,:2]
#     for i in range(1, (number_timesteps + 1) ):
#         X[f'Energy_{i*15}'] = df['Energy'].shift(i)
#     y = df.copy().iloc[:,2]
#     y.iloc[:number_timesteps] = np.nan
#     ## Remove samples in between each house
#     for h in range(1, number_of_houses):
#         for i in range(0, number_timesteps):
#             X.iloc[(num_samples_per_house+i)*h] = np.nan
#             y.iloc[(num_samples_per_house+i)*h] = np.nan
#     X = pd.concat([other_feats, X], axis=1)
#     X.dropna(inplace=True)
#     X.reset_index(drop=True, inplace=True)
#     y.dropna(inplace=True)
#     y.reset_index(drop=True, inplace=True)
#     y.columns = ["Energy"]
#     dataframe = pd.concat([X,y.rename('Energy')], axis=1)
#     assert number_of_houses == (df.shape[0] - dataframe.shape[0]) / number_timesteps, "Something went wrong with preprocessing"
#     return dataframe
    

In [None]:
def normalize_training(X_train):
    """Min-max scale the training features to the range 0..1.

    Returns:
        Tuple ``(scaled_X_train, scaler)``; the fitted scaler is returned so
        the same transformation can be applied to other data.
    """
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler

### Cross Validate

In [None]:
#scores = validate(model, X_train, y_train)
#cv_mse = np.round(scores['test_neg_mean_squared_error'].mean() * (-1),4)
#cv_r2 = np.round(scores['test_r2'].mean(),5)
#print("CV MSE: {} ".format(cv_mse))
#print("CV R2: {} ".format(cv_r2))
#metrics_list.append((cv_mse,cv_rmse,cv_mae,mape,cv_r2))

## Leave 3 houses for test (demonstration)

In [None]:
# One week of history at 15-minute resolution: 7 days * 24 hours * 4 readings = 672 lags.
number_of_timesteps = 7 * 24 * 4

In [None]:
# Pick 3 distinct houses to hold out. A dedicated, seeded generator makes the
# draw reproducible after a kernel restart without touching NumPy's global
# random state.
rng = np.random.default_rng(0)
locations = rng.choice(df['Location'].unique(), size=3, replace=False)
locations

In [None]:
# Build the lagged-consumption features: one energy_lag_* column per past 15-minute step.
df_new = past_timesteps(df, number_of_timesteps=number_of_timesteps)
df_new

In [None]:
# Calendar features derived from the timestamp.
timestamp_parts = df_new['Time'].dt
df_new['DayOfWeek'] = timestamp_parts.dayofweek
# dayofweek 5 and 6 are Saturday and Sunday -> 1, every other day -> 0.
df_new['Weekend'] = df_new['DayOfWeek'].isin([5,6]).astype(int)
df_new['Hour'] = timestamp_parts.hour
df_new

In [None]:
# Persist the engineered frame (lag columns plus calendar features) so the
# modelling section can reload it; index=False keeps the row index out of the file.
df_new.to_csv("data/shared_oneweek_meteo.csv", index=False)

In [None]:
def test_leave_house_out(df, estimator, locations, filename):
    test = df[df['Location'].isin(locations)]
    train = df[~df['Location'].isin(locations)]
    print("Train set: ", train.shape)
    print("Test set: ", test.shape)
    X_train = train.drop(['Time', 'Energy', 'Location'], axis=1)
    X_test = test.drop(['Time', 'Energy', 'Location'], axis=1)
    y_train = train['Energy']
    y_test = test['Energy']
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    model = estimator
    init = time.time()
    model.fit(X_train_norm, y_train)
    y_pred = model.predict(X_test_norm)
    end = time.time()
    print('Elapsed time: {:.4f} s'.format(end - init), file=filename)
    mse, wape, r2 = performance_metrics(y_pred, y_test.values.reshape(-1), filename)
    return mse, wape, r2, model

## Predict 10 folds with 3 random houses for test

In [None]:
# Reload the engineered dataset written earlier in the notebook. No date
# parsing is requested here, so 'Time' comes back as plain text; the
# evaluation helper drops that column before modelling.
df = pd.read_csv("data/shared_oneweek_meteo.csv")
df

In [None]:
# Total number of missing cells in the whole frame (sanity check; presumably
# expected to be 0 because incomplete rows were dropped before saving).
df.isna().sum().sum()

In [None]:
# Number of houses held out as the test set in each evaluation fold.
num_houses_test = 3

In [None]:
# Draw the held-out houses for each of the 10 folds. Every fold reseeds the
# global NumPy RNG (seed = 4 * fold index) so the draws are reproducible.
house_codes = df['Location'].unique()
locations = []
for fold in range(10):
    np.random.seed(fold*4)
    locations.append(np.random.choice(house_codes, size=num_houses_test, replace=False))
locations

### Linear Regression

In [None]:
# Leave-houses-out evaluation of a plain linear regression over the 10 folds.
metrics_list_lr = []
for banner_line in ("\n----------------------------", "\nLinear Regression\n", "----------------------------\n"):
    print(banner_line, file=sourceFile)
for fold in range(10):
    print("\nIteration", fold, file=sourceFile)
    # model_lr is rebound on every fold, so only the last fold's model survives the loop.
    fold_result = test_leave_house_out(df, LinearRegression(), locations[fold], sourceFile)
    mse, wape, r2, model_lr = fold_result
    metrics_list_lr.append((mse, wape, r2))

### XGBoost

In [None]:
# Leave-houses-out evaluation of a GPU XGBoost regressor over the 10 folds.
# NOTE(review): newer XGBoost releases deprecate tree_method='gpu_hist' in
# favour of tree_method='hist' with device='cuda' -- confirm the installed
# version before switching.
metrics_list_xgb = []
for banner_line in ("\n----------------------------", "\nXGBoost\n", "----------------------------\n"):
    print(banner_line, file=sourceFile)
for fold in range(10):
    print("\nIteration", fold, file=sourceFile)
    # A fresh regressor per fold; model_xgb ends up holding the last fold's model.
    xgb_regressor = xgb.XGBRegressor(tree_method='gpu_hist', seed=0)
    mse, wape, r2, model_xgb = test_leave_house_out(df, xgb_regressor, locations[fold], sourceFile)
    metrics_list_xgb.append((mse, wape, r2))

### Random Forest

In [None]:
# Leave-houses-out evaluation of a cuML random forest over the 10 folds.
# NOTE(review): cuRF() is created without a seed, unlike the XGBoost run, so
# these numbers may vary between executions -- confirm whether that is intended.
metrics_list_rf = []
for banner_line in ("\n----------------------------", "\nRandom Forest\n", "----------------------------\n"):
    print(banner_line, file=sourceFile)
for fold in range(10):
    print("\nIteration", fold, file=sourceFile)
    # A fresh forest per fold; model_rf ends up holding the last fold's model.
    fold_result = test_leave_house_out(df, cuRF(), locations[fold], sourceFile)
    mse, wape, r2, model_rf = fold_result
    metrics_list_rf.append((mse, wape, r2))

## Averaged Metrics

In [None]:
# Blank lines in the log to separate the per-fold output from the averaged summary.
print("\n\n\n", file=sourceFile)

In [None]:
# Append the linear-regression summary (mean MSE / WAPE / R2 over all folds) to the log.
print("\nLinear Regression", file=sourceFile)
total_averaged_metrics(metrics_list=metrics_list_lr, filename=sourceFile)

In [None]:
# Append the XGBoost summary (mean MSE / WAPE / R2 over all folds) to the log.
print("\nXGBoost", file=sourceFile)
total_averaged_metrics(metrics_list=metrics_list_xgb, filename=sourceFile)

In [None]:
# Append the random-forest summary (mean MSE / WAPE / R2 over all folds) to the log.
print("\nRandom Forest", file=sourceFile)
total_averaged_metrics(metrics_list=metrics_list_rf, filename=sourceFile)

In [None]:
# Close the log file; any later print(..., file=sourceFile) would fail, so
# this has to stay after the last cell that writes metrics.
sourceFile.close()

### Feature Importance

In [None]:
# Per-feature scores by column position. Both models come from the last
# evaluation fold only, because the loops rebind them on every iteration.
print('Linear Regression\n')
for position, coefficient in enumerate(model_lr.coef_):
    print('Feature: %0d, Score: %.5f' % (position, coefficient))
print('XGBoost\n')
for position, importance in enumerate(model_xgb.feature_importances_):
    print('Feature: %0d, Score: %.5f' % (position, importance))

In [None]:
# Bar chart of the XGBoost importances, one bar per feature position.
xgb_importances = model_xgb.feature_importances_
fig, ax = plt.subplots()
ax.bar(range(len(xgb_importances)), xgb_importances)
ax.set_title('Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.show()

In [None]:
# Bar chart of the linear-regression coefficients, one bar per feature position.
lr_coefficients = model_lr.coef_
fig, ax = plt.subplots()
ax.bar(range(len(lr_coefficients)), lr_coefficients)
ax.set_title('Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.show()

In [None]:
# Label the XGBoost importances with the feature names, i.e. every column
# except the target, the house id and the timestamp (same order as in df).
columns = df.columns.drop(["Energy", "Location", "Time"])
feature_importance = pd.Series(data=model_xgb.feature_importances_, index=columns)
feature_importance

In [None]:
# Names of the 40 features with the highest XGBoost importance.
top_importances = feature_importance.nlargest(n=40)
features = top_importances.index
features

In [None]:
# Reduced dataset: identifiers first, then the selected features, target last.
selected_columns = ['Time', 'Location', *features, 'Energy']
df_40 = df[selected_columns]
df_40

In [None]:
# Save the reduced dataset (selected features only) without the row index.
df_40.to_csv("data/shared_best40features.csv", index=False)