# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn import tree
from sklearn import metrics 
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import QuantileTransformer
import xgboost as xgb

In [None]:
# Load the complete solar-array dataset from the Kaggle input directory using pandas.
features=pd.read_csv('../input/df_solararray_complete.csv')
# Preview the first three rows (rendered as the cell output).
features.head(3)

# The 'Unnamed: 0' and 'Location' columns are not useful: the first is just a row index and the second is just a name, so let's remove them

In [None]:
# Drop the 'Unnamed: 0' and 'Location' columns (axis=1 selects columns).
features=features.drop(['Unnamed: 0', 'Location'], axis=1)
# Remaining column names; the target column has not been removed yet at this point.
names=list(features.columns)
# Preview the first three rows (rendered as the cell output).
features.head(3)

In [None]:
# Target vector: the electricity output column the models should predict.
labels = np.array(features['Electricity_KW_HR'])

# Everything except the target column is a predictor.
features = features.drop(columns='Electricity_KW_HR')

# Keep the predictor names; the NumPy conversion below discards them.
feature_list = features.columns.tolist()

# From here on the predictors are a plain NumPy array.
features = np.array(features)

In [None]:
# Display the predictor column names saved before the NumPy conversion.
feature_list

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Sanity check: print the array shapes of the train/test predictors and targets
# produced by the split.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
def metric_CV(cv_scores=None):
    """Print the averaged test metrics of a ``cross_validate`` result.

    Parameters
    ----------
    cv_scores : dict, optional
        Mapping with the keys ``'test_r2'``, ``'test_neg_mean_squared_error'``,
        ``'test_neg_mean_absolute_error'`` and ``'test_explained_variance'``,
        each holding an array of per-fold scores.  When omitted, the
        notebook-level ``scores`` variable is used, so existing
        ``metric_CV()`` calls keep working unchanged.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    if cv_scores is None:
        # Backward-compatible default: read the notebook global.
        cv_scores = scores

    # The MSE/MAE scorers are negated ("neg_..."), so flip the sign back.
    mean_mse = -cv_scores['test_neg_mean_squared_error'].mean()
    mean_mae = -cv_scores['test_neg_mean_absolute_error'].mean()
    explained = cv_scores['test_explained_variance']

    test_r2 = round(cv_scores['test_r2'].mean(), 3)
    # RMSE is the square root of the fold-averaged MSE (not the mean of per-fold RMSEs).
    test_rmse = round(math.sqrt(mean_mse), 3)
    test_mse = round(mean_mse, 3)
    test_exp_var_mean = round(explained.mean(), 3)
    # The "+/-" value is two standard deviations of the per-fold scores.
    test_exp_var_std = round(explained.std() * 2, 3)
    test_MAE = round(mean_mae, 3)

    print('With CV Metrics:',
          '\nTest R-squared:\t\t\t', test_r2,
          '\nTest RMSE:\t\t\t', test_rmse,
          '\nTest MSE:\t\t\t', test_mse,
          '\nTest Explained Variance:\t {0} (+/- {1})'.format(test_exp_var_mean, test_exp_var_std),
          '\nTest MAE:\t\t\t', test_MAE)

**MODELS WITHOUT TRANSFORMATION**

In [None]:
################ Linear Regression ######################
# Fit ordinary least squares on the training split.
lm = linear_model.LinearRegression()
lm.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
pred_trn = lm.predict(train_features)
predictions = lm.predict(test_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(y_true=test_labels, y_pred=predictions), 3)
mse = round(mean_squared_error(y_true=test_labels, y_pred=predictions), 5)
rmse = round(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=predictions)), 5)
exp_var = round(explained_variance_score(y_true=test_labels, y_pred=predictions), 3)

# In-sample (train) metrics.
train_r2 = round(lm.score(train_features, train_labels), 3)
mse_trn = round(mean_squared_error(y_true=train_labels, y_pred=pred_trn), 5)
rmse_trn = round(np.sqrt(mean_squared_error(y_true=train_labels, y_pred=pred_trn)), 5)
exp_var_trn = round(explained_variance_score(y_true=train_labels, y_pred=pred_trn), 3)

print(
    '\nLinear Regression Model Metrics:',
    '\n\nTest R-squared:\t\t', test_r2,
    '\nTrain R-squared:\t', train_r2,
    '\nRMSE:\t\t\t', rmse,
    '\nTrain RMSE:\t\t', rmse_trn,
    '\nMSE:\t\t\t', mse,
    '\nTrain MSE:\t\t', mse_trn,
    '\nExplained Variance:\t', exp_var,
    '\nTrain Explained Variance:', exp_var_trn,
)

In [None]:
################ Linear Regression with Cross Validation ######################
# 10-fold cross-validation of the linear model on the full dataset.
# metric_CV() picks the result up from the notebook-level `scores`.
scores = cross_validate(
    lm,
    features,
    labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Linear Regression:')
metric_CV()

# Let's see Feature ranking with recursive feature elimination.

In [None]:
# Feature ranking with recursive feature elimination (RFE).
from sklearn.feature_selection import RFE

# Keep the 5 strongest features, dropping one feature per elimination round.
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept it as a keyword argument only.
selector = RFE(lm, n_features_to_select=5, step=1)
selector = selector.fit(train_features, train_labels)
selector.n_features_   # number of features kept
selector.support_      # boolean mask of the kept features
selector.ranking_      # rank per feature (1 = kept); shown as the cell output

Based on recursive feature elimination with the linear model, the five most important features are the ones shown in bold:
['Year',**'Month'**,'Day',**'Hour'**,**'Cloud_Cover_Fraction'**,'Dew_Point',**'Humidity_Fraction'**,'Precipitation','Pressure','Temperature','Visibility','Wind_Speed',**'Solar_Elevation'**]

In [None]:
# Column positions of the five features chosen as most important
# (Month, Hour, Cloud_Cover_Fraction, Humidity_Fraction, Solar_Elevation).
selected_columns = [1, 3, 4, 6, 12]

# Reduced training matrix containing only the selected columns.
selected_features = pd.DataFrame(train_features).iloc[:, selected_columns].values

# Reduced test matrix. `test_features` itself is deliberately left untouched
# so that later cells keep seeing the full test matrix in its original type.
selected_test_features = pd.DataFrame(test_features).iloc[:, selected_columns].values

In [None]:
# Refit the linear model on the five selected features only.
lm.fit(selected_features, train_labels)

# Predict with the matching reduced matrices: the model now expects exactly
# the selected columns, so the full `test_features` matrix cannot be used here.
predictions = lm.predict(selected_test_features)
pred_trn = lm.predict(selected_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(lm.score(selected_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

print('\nLinear Regression Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMSE:\t\t\t', mse,
      '\nTrain MSE:\t\t', mse_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
################### Lasso Regression Model ########################################
# L1-regularised linear model (alpha=10, at most 2000 iterations).
lasso = linear_model.Lasso(alpha=10, max_iter=2000)
lasso.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
pred_trn = lasso.predict(train_features)
predictions = lasso.predict(test_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(y_true=test_labels, y_pred=predictions), 3)
mse = round(mean_squared_error(y_true=test_labels, y_pred=predictions), 5)
rmse = round(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=predictions)), 5)
exp_var = round(explained_variance_score(y_true=test_labels, y_pred=predictions), 3)

# In-sample (train) metrics.
train_r2 = round(lasso.score(train_features, train_labels), 3)
mse_trn = round(mean_squared_error(y_true=train_labels, y_pred=pred_trn), 5)
rmse_trn = round(np.sqrt(mean_squared_error(y_true=train_labels, y_pred=pred_trn)), 5)
exp_var_trn = round(explained_variance_score(y_true=train_labels, y_pred=pred_trn), 3)

print(
    'Lasso Regression Model Metrics:',
    '\n\nTest R-squared:\t\t', test_r2,
    '\nTrain R-squared:\t', train_r2,
    '\nRMSE:\t\t\t', rmse,
    '\nTrain RMSE:\t\t', rmse_trn,
    '\nMSE:\t\t\t', mse,
    '\nTrain MSE:\t\t', mse_trn,
    '\nExplained Variance:\t', exp_var,
    '\nTrain Explained Variance:', exp_var_trn,
)

In [None]:
################### Lasso Regression Model with Cross Validation ########################################
# LassoCV selects its own alpha with an internal 10-fold search.
lasso_CV = LassoCV(cv=10, random_state=0)
lasso_CV.fit(features, labels)

# Outer 10-fold cross-validation; metric_CV() reads the notebook-level `scores`.
scores = cross_validate(
    lasso_CV,
    features,
    labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Lasso Regression:')
metric_CV()

In [None]:
################### Ridge Regression ##################################
# L2-regularised linear model with alpha=1.0.
rdg = Ridge(alpha=1.0)
rdg.fit(train_features, train_labels)

# Make predictions with the ridge model for BOTH splits, so the train metrics
# describe `rdg` and not the previously fitted linear model `lm`.
predictions = rdg.predict(test_features)
pred_trn = rdg.predict(train_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(rdg.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 3)
mse = round(mean_squared_error(test_labels, predictions), 3)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 3)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

print('\nRidge Regression Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMSE:\t\t\t', mse,
      '\nTrain MSE:\t\t', mse_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
################## Ridge Regression with Cross Validation ####################
# RidgeCV chooses alpha from the given candidates.
rdg_CV = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
rdg_CV.fit(features, labels)

# Outer 10-fold cross-validation; only the per-fold test scores are summarised
# by metric_CV(), which reads the notebook-level `scores`.
scores = cross_validate(
    rdg_CV,
    features,
    labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Ridge Regression:')
metric_CV()

In [None]:
################## ElasticNet Model #######################
# Elastic-net regression with default penalties and a fixed seed.
EN = ElasticNet(random_state=0)
EN.fit(train_features, train_labels)

# Make predictions for both splits (computed once each).
predictions = EN.predict(test_features)
pred_trn = EN.predict(train_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(EN.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 3)
mse = round(mean_squared_error(test_labels, predictions), 3)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 3)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

# The heading names the model actually evaluated in this cell.
print('\nElasticNet Regression Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMSE:\t\t\t', mse,
      '\nTrain MSE:\t\t', mse_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
################## ElasticNet Regression Model with Cross Validation #######################
# ElasticNetCV tunes its regularisation with an internal 10-fold search.
EN_CV = ElasticNetCV(cv=10, random_state=0)
EN_CV.fit(features, labels)

# Outer 10-fold cross-validation; metric_CV() summarises the per-fold test
# scores stored in the notebook-level `scores`.
scores = cross_validate(
    EN_CV,
    features,
    labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Elastic Net Regression:')
metric_CV()


In [None]:
############### Decision Tree model ######################
# Regression tree with default settings.
dtr = tree.DecisionTreeRegressor()
dtr.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
pred_trn = dtr.predict(train_features)
predictions = dtr.predict(test_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(y_true=test_labels, y_pred=predictions), 3)
mse = round(mean_squared_error(y_true=test_labels, y_pred=predictions), 3)
rmse = round(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=predictions)), 3)
exp_var = round(explained_variance_score(y_true=test_labels, y_pred=predictions), 3)

# In-sample (train) metrics.
train_r2 = round(dtr.score(train_features, train_labels), 3)
mse_trn = round(mean_squared_error(y_true=train_labels, y_pred=pred_trn), 3)
rmse_trn = round(np.sqrt(mean_squared_error(y_true=train_labels, y_pred=pred_trn)), 3)
exp_var_trn = round(explained_variance_score(y_true=train_labels, y_pred=pred_trn), 3)

print(
    '\nDecision Tree Regression Model Metrics:',
    '\n\nTest R-squared:\t\t', test_r2,
    '\nTrain R-squared:\t', train_r2,
    '\nRMSE:\t\t\t', rmse,
    '\nTrain RMSE:\t\t', rmse_trn,
    '\nMSE:\t\t\t', mse,
    '\nTrain MSE:\t\t', mse_trn,
    '\nExplained Variance:\t', exp_var,
    '\nTrain Explained Variance:', exp_var_trn,
)

In [None]:
################## Decision Tree Model with Cross Validation #######################
# Fresh regression tree, fitted on the full dataset.
dtr = tree.DecisionTreeRegressor().fit(features, labels)

# 10-fold cross-validation; metric_CV() summarises the per-fold test scores
# stored in the notebook-level `scores`.
scores = cross_validate(
    dtr,
    features,
    labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Decision Tree Model:')
metric_CV()

In [None]:
############# Random Forest Model #################
# Instantiate the model with 100 decision trees and a fixed seed.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 42)
# Train the model on the training split.
rfr.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
predictions = rfr.predict(test_features)
pred_trn = rfr.predict(train_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(rfr.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

# The heading names the model actually evaluated in this cell.
print('Random Forest Regression Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMSE:\t\t\t', mse,
      '\nTrain MSE:\t\t', mse_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
# Test labels against the model's hold-out predictions (no cross_val_predict
# involved), drawn as a scatter plot with a fitted regression line in red.
p2 = sns.regplot(
    x=test_labels,
    y=predictions,
    line_kws=dict(color="r", alpha=0.7, lw=5),
)
p2.set_xlabel('Test Labels')
p2.set_ylabel('Predicted Labels')
p2.set_title('Random Forest Model: Test Vs Predicted Labels')
plt.show()

In [None]:
########################## RANDOM FOREST WITH CROSS_VALIDATION #########################################
# The estimator is stored as an attribute named `cv` on the existing `rfr`
# object; the following cell refers to it as `rfr.cv`.
rfr.cv = RandomForestRegressor(n_estimators=100, random_state = 42)

# 10-fold cross-validation on the full dataset.  return_train_score=True adds
# the 'train_<metric>' entries (e.g. 'train_r2') that the follow-up summary
# cell reads; without it only the 'test_<metric>' entries are returned.
scores = cross_validate(rfr.cv, features, labels, cv = 10,
                        scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'],
                        return_train_score=True)

print('Random Forest Model:')
metric_CV()



In [None]:
# Summary of the random forest cross-validation scores held in `scores`
# (means over the 10 folds).
test_r2 = round(scores['test_r2'].mean(), 3)
# Train scores are only present when cross_validate was called with
# return_train_score=True; report NaN instead of raising KeyError otherwise.
train_r2 = round(scores['train_r2'].mean(), 3) if 'train_r2' in scores else float('nan')
rmse = round(np.sqrt(-scores['test_neg_mean_squared_error'].mean()),5)
mae = round(-scores['test_neg_mean_absolute_error'].mean(), 5)
exp_var = round(scores['test_explained_variance'].mean(), 3)

print('\nRandom Forest Model Metrics (cross-validation):',
                  '\n\nTrain R-squared:\t', train_r2,
                  '\nTest R-squared:\t\t', test_r2,
                  '\nRMSE:\t\t\t', rmse,
                  '\nMAE:\t\t\t', mae,
                  '\nExplained Variance:\t', exp_var)




# Hold-out evaluation of the same estimator: fit it on the training split and
# score the test split.  The predictions are taken from the estimator fitted
# here, so every reported number describes the same model.
rfr.cv.fit(train_features, train_labels)
predictions = rfr.cv.predict(test_features)
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(rfr.cv.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)),5)
mae = round(mean_absolute_error(test_labels, predictions), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)

print('\nRandom Forest Regressor Model Metrics:',
                  '\n\nTrain R-squared:\t', train_r2,
                  '\nTest R-squared:\t', test_r2,
                  '\nRMSE:\t', rmse,
                  '\nMAE:\t', mae,
                  '\nExplained Variance:\t', exp_var)

In [None]:
# Scatter plot of the test labels against the current hold-out `predictions`
# (these do not come from cross_val_predict).

# Regression plot with a red fitted line, followed by a plain scatter of the
# same points drawn on top.
sns.regplot(x=test_labels, y=predictions, line_kws={"color":"r","alpha":0.7,"lw":5})
plt.scatter(test_labels, predictions)

In [None]:
# Display the training feature matrix as the cell output.
train_features

In [None]:
################# Random Forest with Randomized grid search ####################
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 100, 200, ..., 1000
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for forest regressors;
# the 'auto' string itself is no longer accepted by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Metric names kept at notebook level; not part of the grid built below.
scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance']

# Create the random grid sampled by the randomized search
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Base estimator whose hyperparameters are tuned.
rfr = RandomForestRegressor(random_state=42)

# Randomized search: 100 sampled parameter combinations, each evaluated with
# 3-fold cross-validation on mean absolute error, using all available cores.
rfr_random = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split (trailing ';' suppresses the cell output).
rfr_random.fit(train_features, train_labels);

In [None]:
# Best estimator found by the randomized search.
best_random = rfr_random.best_estimator_

# Predict BOTH splits with the tuned model, so the train metrics below
# describe `best_random` and not predictions left over from an earlier model.
predictions = best_random.predict(test_features)
pred_trn = best_random.predict(train_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(best_random.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mae = round(mean_absolute_error(test_labels, predictions), 5)
mae_trn = round(mean_absolute_error(train_labels, pred_trn), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

print('\nRandom Forest Regressor Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMAE:\t\t\t', mae,
      '\nTrain MAE:\t\t', mae_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
# Predictions of the tuned forest for both splits.
# NOTE(review): this cell previously assigned an undefined name to
# `predictions` and called predict() on `rfr`, which is re-created unfitted
# before the search (the search fits clones of it).  Using `best_random` is
# the presumed intent - confirm.
predictions = best_random.predict(test_features)
pred_trn = best_random.predict(train_features)

In [None]:
#############################  Support Vector Machine ##################################
from sklearn.svm import SVR

# Support vector regressor (default kernel, gamma='scale').
clf = SVR(gamma='scale', C=1.0, epsilon=0.1)
# Train the model on the training split.
clf.fit(train_features, train_labels)

# Use the SVR's predict method on both splits.
predictions = clf.predict(test_features)
pred_trn = clf.predict(train_features)

# Hold-out (test) and in-sample (train) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(clf.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)

# The heading names the model actually evaluated in this cell.
print('Support Vector Regression Model Metrics:',
      '\n\nTest R-squared:\t\t', test_r2,
      '\nTrain R-squared:\t', train_r2,
      '\nRMSE:\t\t\t', rmse,
      '\nTrain RMSE:\t\t', rmse_trn,
      '\nMSE:\t\t\t', mse,
      '\nTrain MSE:\t\t', mse_trn,
      '\nExplained Variance:\t', exp_var,
      '\nTrain Explained Variance:', exp_var_trn)

In [None]:
########################## SUPPORT VECTOR MACHINE WITH CROSS_VALIDATION #########################################
# 10-fold cross-validation of the SVR.  Note that this cell cross-validates on
# the training split only, not on the full dataset.
scores = cross_validate(
    clf,
    train_features,
    train_labels,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Support Vector Machine Model:')
metric_CV()

#############################################################################################################################
#############################################################################################################################
##  REPEAT THIS MODEL WITH NORMALIZED MODEL FOR IMPROVEMENT
#############################################################################################################################
#############################################################################################################################

In [None]:
# Second pass: repeat the models on Yeo-Johnson (PowerTransformer) normalized data.
#pt = PowerTransformer()


# Reload the raw dataset with pandas; this rebinds `features` to the full
# DataFrame as read from the CSV.
#features=pd.read_csv('C:/Users/Dhaval/Documents/CSC 672/Final_Project_Dhaval_Delvadia/2015contest_CSV/new_data/df_solararray_complete.csv')
features=pd.read_csv('../input/df_solararray_complete.csv')
# Preview the first three rows (rendered as the cell output).
features.head(3)

In [None]:
# note: we only need to normalize continuous variables
# Keep the columns at positions 2..15 of the reloaded frame.
# NOTE(review): presumably this skips 'Unnamed: 0' and 'Location' and ends at
# the target column - confirm against the CSV layout.
continuous_features = features.iloc[:,2:16]
continuous_features.head()
# Column names of the selected slice.
names=list(continuous_features.columns)


In [None]:
#pt.fit(continuous_features)
#pt.transform(continuous_features)

#names=features.columns
#df_norm = pd.DataFrame(pt.transform(continuous_features), columns=names)
#df_norm.head(10)

# Working frame built from the selected columns.
df = pd.DataFrame(continuous_features, columns=names)
df.head(10)


# labels are the values we want to predict
labels=np.array(df['Electricity_KW_HR'])

# Remove the labels from the features
# axis 1 refers to the columns
features=df.drop('Electricity_KW_HR',axis=1)

# saving feature names for later use
features_list=list(features.columns)

# convert to numpy array
features=np.array(features)

# Normalize with PowerTransformer: one transformer for the predictors (pt3)
# and one for the target (pt4).  The target is reshaped to a single column
# first, so y_t4 is 2-D with shape (n_samples, 1).
# NOTE(review): both transformers are fitted on the full dataset before the
# train/test split, so test rows influence the transform parameters; consider
# fitting on the training split only.
pt3 = PowerTransformer()
x_t3=pt3.fit_transform(features)
pt4 = PowerTransformer()
y_t4=pt4.fit_transform(labels.reshape(-1,1))

# Split the transformed data into training and testing sets (80/20, fixed seed)
train_features, test_features, train_labels, test_labels = train_test_split(x_t3, y_t4, test_size = 0.20, random_state = 42)

# let's review the shape of each split
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


In [None]:
# labels are the values we want to predict
#labels_norm=np.array(df_norm['Electricity_KW_HR'])

# Remove the labels from the features axis 1 refers to the columns
#features_norm=df_norm.drop('Electricity_KW_HR',axis=1)

# saving feature names for later use
#features_norm_list=list(features_norm.columns)

#convert to numpy array
#features_norm=np.array(features_norm)

# Split the data into training and testing sets
#train_features, test_features, train_labels, test_labels = train_test_split(features_norm, labels_norm, test_size = 0.20, random_state = 42)

# let's review the shape of each features
# print('Training Features Shape:', train_features.shape)
# print('Training Labels Shape:', train_labels.shape)
# print('Testing Features Shape:', test_features.shape)
# print('Testing Labels Shape:', test_labels.shape)

In [None]:
################ Linear Regression ######################
# Fit ordinary least squares on the (power-transformed) training split.
lm = linear_model.LinearRegression()
lm.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
pred_trn = lm.predict(train_features)
predictions = lm.predict(test_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(y_true=test_labels, y_pred=predictions), 3)
mse = round(mean_squared_error(y_true=test_labels, y_pred=predictions), 5)
rmse = round(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=predictions)), 5)
exp_var = round(explained_variance_score(y_true=test_labels, y_pred=predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# In-sample (train) metrics.
train_r2 = round(lm.score(train_features, train_labels), 3)
mse_trn = round(mean_squared_error(y_true=train_labels, y_pred=pred_trn), 5)
rmse_trn = round(np.sqrt(mean_squared_error(y_true=train_labels, y_pred=pred_trn)), 5)
exp_var_trn = round(explained_variance_score(y_true=train_labels, y_pred=pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

print(
    '\nLinear Regression Model Metrics:',
    '\n\nTest R-squared:\t\t', test_r2,
    '\nTrain R-squared:\t', train_r2,
    '\nRMSE:\t\t\t', rmse,
    '\nTrain RMSE:\t\t', rmse_trn,
    '\nMSE:\t\t\t', mse,
    '\nTrain MSE:\t\t', mse_trn,
    '\nExplained Variance:\t', exp_var,
    '\nTrain Explained Variance:', exp_var_trn,
    '\nMAE:\t\t\t', mae,
    '\nTrain MAE:\t\t', mae_trn,
)



In [None]:
################ Linear Regression with Cross Validation ######################
# 10-fold cross-validation of the linear model on the transformed dataset;
# metric_CV() reads the result from the notebook-level `scores`.
scores = cross_validate(
    lm,
    x_t3,
    y_t4,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Linear Regression:')
metric_CV()

In [None]:
# Display the keys of the most recent cross_validate result.
scores.keys()

In [None]:
################### Lasso Regression Model ########################################
# L1-regularised linear model (alpha=1, at most 100 iterations).
lasso = linear_model.Lasso(alpha=1, max_iter=100)
lasso.fit(train_features, train_labels)

# Predict both splits so train and test errors can be compared.
pred_trn = lasso.predict(train_features)
predictions = lasso.predict(test_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(y_true=test_labels, y_pred=predictions), 3)
mse = round(mean_squared_error(y_true=test_labels, y_pred=predictions), 5)
rmse = round(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=predictions)), 5)
exp_var = round(explained_variance_score(y_true=test_labels, y_pred=predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# In-sample (train) metrics.
train_r2 = round(lasso.score(train_features, train_labels), 3)
mse_trn = round(mean_squared_error(y_true=train_labels, y_pred=pred_trn), 5)
rmse_trn = round(np.sqrt(mean_squared_error(y_true=train_labels, y_pred=pred_trn)), 5)
exp_var_trn = round(explained_variance_score(y_true=train_labels, y_pred=pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

print(
    'Lasso Regression Model Metrics:',
    '\n\nTest R-squared:\t\t', test_r2,
    '\nTrain R-squared:\t', train_r2,
    '\nRMSE:\t\t\t', rmse,
    '\nTrain RMSE:\t\t', rmse_trn,
    '\nMSE:\t\t\t', mse,
    '\nTrain MSE:\t\t', mse_trn,
    '\nExplained Variance:\t', exp_var,
    '\nTrain Explained Variance:', exp_var_trn,
    '\nMAE:\t\t\t', mae,
    '\nTrain MAE:\t\t', mae_trn,
)

In [None]:
################### Lasso Regression Model with Cross Validation ########################################
# LassoCV selects its own alpha with an internal 10-fold search; here it is
# fitted on the transformed training split.
lasso_CV = LassoCV(cv=10, random_state=0)
lasso_CV.fit(train_features, train_labels)

# Outer 10-fold cross-validation on the full transformed dataset;
# metric_CV() reads the result from the notebook-level `scores`.
scores = cross_validate(
    lasso_CV,
    x_t3,
    y_t4,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Lasso Regression:')
metric_CV()

In [None]:
################### Ridge Regression ##################################
# Fit an L2-regularised linear model on the training split.
rdg = Ridge(alpha=1.0)
rdg.fit(train_features, train_labels)

# Make predictions with the Ridge model on both partitions.
# (The training predictions previously came from `lm`, the plain linear model,
# so every "Train ..." metric below described a different estimator.)
predictions=rdg.predict(test_features)
pred_trn=rdg.predict(train_features)

# calculate metrics
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(rdg.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)),3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)),3)
mse = round(mean_squared_error(test_labels, predictions), 3)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 3)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae= round(mean_absolute_error(predictions,test_labels), 5)
mae_trn=round(mean_absolute_error(pred_trn,train_labels), 5)

print('Ridge Regression Model Metrics:',
                  '\n\nTest R-squared:\t\t', test_r2,
                  '\nTrain R-squared:\t', train_r2,
                  '\nRMSE:\t\t\t', rmse,
                  '\nTrain RMSE:\t\t', rmse_trn,
                  '\nMSE:\t\t\t', mse,
                  '\nTrain MSE:\t\t', mse_trn,
                  '\nExplained Variance:\t', exp_var,
                  '\nTrain Explained Variance:', exp_var_trn,
                  '\nMAE:\t\t\t', mae,
                  '\nTrain MAE:\t\t', mae_trn)

In [None]:
################## Ridge Regression with Cross Validation ####################
# RidgeCV chooses its regularisation strength from the candidate alphas below.
rdg_CV = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
rdg_CV.fit(x_t3, y_t4)

# 10-fold cross validation on the transformed data; one array of test scores per metric.
scores = cross_validate(
    rdg_CV,
    x_t3,
    y_t4,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Ridge Regression:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
################## ElasticNet Model #######################
# Fit an ElasticNet (combined L1/L2 penalty) with default hyper-parameters.
EN = ElasticNet(random_state=0)
EN.fit(train_features, train_labels)

# Make predictions on both partitions (the test prediction used to be computed twice).
predictions=EN.predict(test_features)
pred_trn=EN.predict(train_features)

# calculate metrics
test_r2 = round(r2_score(test_labels, predictions), 3)
train_r2 = round(EN.score(train_features, train_labels), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)),3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)),3)
mse = round(mean_squared_error(test_labels, predictions), 3)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 3)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae= round(mean_absolute_error(predictions,test_labels), 5)
mae_trn=round(mean_absolute_error(pred_trn,train_labels), 5)

# The report header previously read "Linear Regression", mislabelling these ElasticNet results.
print('\nElasticNet Regression Model Metrics:',
                  '\n\nTest R-squared:\t\t', test_r2,
                  '\nTrain R-squared:\t', train_r2,
                  '\nRMSE:\t\t\t', rmse,
                  '\nTrain RMSE:\t\t', rmse_trn,
                  '\nMSE:\t\t\t', mse,
                  '\nTrain MSE:\t\t', mse_trn,
                  '\nExplained Variance:\t', exp_var,
                  '\nTrain Explained Variance:', exp_var_trn,
                  '\nMAE:\t\t\t', mae,
                  '\nTrain MAE:\t\t', mae_trn)

In [None]:
# Display the target flattened to 1-D; the result is not assigned, so y_t4 itself is unchanged.
y_t4.ravel()

In [None]:
################## ElasticNet Regression Model with Cross Validation #######################
# ElasticNetCV selects its penalty strength with an internal 10-fold search.
# The target is flattened to 1-D before it is handed to the estimator.
EN_CV = ElasticNetCV(cv=10, random_state=0)
EN_CV.fit(x_t3, y_t4.ravel())

# Outer 10-fold cross validation; one array of test scores per metric.
scores = cross_validate(
    EN_CV,
    x_t3,
    y_t4.ravel(),
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Elastic Net Regression:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
############### Decision Tree model ######################
# Fit a regression tree with default settings (fit() returns the estimator).
# NOTE(review): no random_state is passed here - confirm whether reproducible runs are required.
dtr = tree.DecisionTreeRegressor().fit(train_features, train_labels)

# Predict on the hold-out split and on the data the tree was fitted on.
predictions = dtr.predict(test_features)
pred_trn = dtr.predict(train_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 3)
mse = round(mean_squared_error(test_labels, predictions), 3)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# Training metrics, reported next to the hold-out values.
train_r2 = round(dtr.score(train_features, train_labels), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 3)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 3)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

# Label/value pairs in the order they are printed.
_metric_rows = [
    ('\n\nTest R-squared:\t\t', test_r2),
    ('\nTrain R-squared:\t', train_r2),
    ('\nRMSE:\t\t\t', rmse),
    ('\nTrain RMSE:\t\t', rmse_trn),
    ('\nMSE:\t\t\t', mse),
    ('\nTrain MSE:\t\t', mse_trn),
    ('\nExplained Variance:\t', exp_var),
    ('\nTrain Explained Variance:', exp_var_trn),
    ('\nMAE:\t\t\t', mae),
    ('\nTrain MAE:\t\t', mae_trn),
]
print('\nDecision Tree Regression Model Metrics:', *[cell for row in _metric_rows for cell in row])

In [None]:
################## Decision Tree Model with Cross Validation #######################
# Fit a default regression tree on the full transformed data set.
dtr = tree.DecisionTreeRegressor().fit(x_t3, y_t4)

# 10-fold cross validation on the same data; one array of test scores per metric.
scores = cross_validate(
    dtr,
    x_t3,
    y_t4,
    cv=10,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance'],
)

print('Decision Tree Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
############# Random Forest Model #################
from pprint import pprint

# 100-tree forest with a fixed seed, trained on the training split (fit() returns the estimator).
rfr = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train_features, train_labels)

# Predict on the hold-out split and on the data the forest was fitted on.
predictions = rfr.predict(test_features)
pred_trn = rfr.predict(train_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# Training metrics, reported next to the hold-out values.
train_r2 = round(rfr.score(train_features, train_labels), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

# Label/value pairs in the order they are printed.
_metric_rows = [
    ('\n\nTest R-squared:\t\t', test_r2),
    ('\nTrain R-squared:\t', train_r2),
    ('\nRMSE:\t\t\t', rmse),
    ('\nTrain RMSE:\t\t', rmse_trn),
    ('\nMSE:\t\t\t', mse),
    ('\nTrain MSE:\t\t', mse_trn),
    ('\nExplained Variance:\t', exp_var),
    ('\nTrain Explained Variance:', exp_var_trn),
    ('\nMAE:\t\t\t', mae),
    ('\nTrain MAE:\t\t', mae_trn),
]
print('Random Forest Regression Model Metrics:', *[cell for row in _metric_rows for cell in row])

# Dump every hyper-parameter of the fitted forest.
print('Parameters currently in use:\n')
pprint(rfr.get_params())

In [None]:
# Display the current `predictions` array.
predictions

In [None]:
# Pair each importance (rounded to 4 decimals) with a name from `names` by position
# and print the pairs from most to least important.
# NOTE(review): zip stops at the shorter input; confirm `names` lines up with the model's feature columns.
print("Features sorted by their score:")
print(sorted(
    ((round(score, 4), name) for score, name in zip(rfr.feature_importances_, names)),
    reverse=True,
))

In [None]:
# scatter plot of the test labels vs predictions (without cross_val_predict)
#
# This cell reloads the raw CSV and rebuilds an UNTRANSFORMED train/test split, then plots the
# raw test labels against `predictions` mapped back through pt4.inverse_transform.
# NOTE(review): it rebinds features, names, df, labels, features_list and the train_*/test_*
# globals, so every later cell sees untransformed splits until they are rebuilt.
# NOTE(review): presumably the same test_size/random_state as the earlier split keep the rows
# aligned with `predictions` - confirm against the cell that created the original split.

features=pd.read_csv('../input/df_solararray_complete.csv')
# note: we only need to normalize continuous variables
# columns 2..15 by position are kept
continuous_features = features.iloc[:,2:16]
continuous_features.head()
names=list(continuous_features.columns)

df = pd.DataFrame(continuous_features, columns=names)
df.head(10)

# labels are the values we want to predict
labels=np.array(df['Electricity_KW_HR'])

# Remove the labels from the features
# axis 1 refers to the columns
features=df.drop('Electricity_KW_HR',axis=1)

# saving feature names for later use
features_list=list(features.columns)

#convert to numpy array
features=np.array(features)#

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.20, random_state = 42)
#test_labels=np.array(test_labels)


# x: one-column DataFrame holding `predictions` after pt4's inverse transform
# (assumes `predictions` are on pt4's transformed scale - TODO confirm).
x=pd.DataFrame(pt4.inverse_transform(predictions.reshape(-1,1)))
#x
#test_labels=pt4.inverse_transform(test_labels.reshape(-1,1))
import seaborn as sns
# plot: raw test labels on the x axis, back-transformed predictions on the y axis
sns.regplot(x=test_labels, y=x.iloc[:,0], line_kws={"color":"r","alpha":0.7,"lw":5})
#plt.scatter(test_labels, x)

In [None]:
# Display the first (only) column of x, the back-transformed predictions.
x.iloc[:,0]

In [None]:
# Regression plot of the test labels against the back-transformed predictions in x.
p4 = sns.regplot(
    x=test_labels,
    y=x.iloc[:, 0],
    line_kws={"color": "r", "alpha": 0.7, "lw": 5},
)
p4.set_xlabel('Test Labels')
p4.set_ylabel('Predicted Labels')
p4.set_title('Random Forest Transformed: Test Vs Predicted Labels')
plt.show()

In [None]:
# Display the current hold-out labels.
test_labels

In [None]:
# x is a pandas DataFrame, so NumPy-style x[:,0] is not valid indexing and raises;
# positional access has to go through .iloc.
x.iloc[:,0]

In [None]:
# Line plot of the actual test labels against the back-transformed predictions in x.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards, so setting it after plotting left this figure at the old size.
plt.rcParams["figure.figsize"] = [30,15]

plt.plot(test_labels, alpha=0.9, color='orange')
plt.plot(x.iloc[:,0], alpha=0.7, color='g')
plt.xlabel('index')
plt.ylabel('Electricity_KW_HR')
plt.title('Actual vs. Predicted Electricity_KW_HR')

# Legend handles are named after what they represent (the old names were swapped
# relative to their colours).
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
# Same actual-vs-predicted line plot, restricted to the first 100 test rows for readability.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards.
plt.rcParams["figure.figsize"] = [30,15]

plt.plot(test_labels[0:100], alpha=0.8, color='orange')
plt.plot(x.iloc[:100,0], alpha=0.7, color='g')
plt.xlabel('index')
plt.ylabel('Electricity_KW_HR')
plt.title('Actual vs. Predicted Electricity_KW_HR')

# Legend handles are named after what they represent (the old names were swapped
# relative to their colours).
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
########################## RANDOM FOREST WITH CROSS_VALIDATION #########################################
# NOTE(review): this stores the new estimator as an attribute named `cv` on the existing
# `rfr` object; presumably a separate variable (e.g. rfr_cv) was intended. The name is kept
# in case other cells read `rfr.cv`.
rfr.cv = RandomForestRegressor(n_estimators=100, random_state = 42)

# 10-fold cross validation on the transformed data. The target is flattened to 1-D,
# matching the ElasticNetCV cell, so the forest does not emit a column-vector
# conversion warning on every fold.
scores = cross_validate(rfr.cv, x_t3, y_t4.ravel(), cv = 10, scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'])

print('Random Forest Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
# Regression plot of `test_labels` against the current `predictions` array.
# NOTE(review): no cross_val_predict call is active in the visible cells, so despite the title
# these look like the hold-out predictions of the plain forest - confirm.
# NOTE(review): confirm that test_labels and predictions are on the same scale at this point.
import seaborn as sns

p1 = sns.regplot(
    x=test_labels,
    y=predictions,
    line_kws={"color": "r", "alpha": 0.7, "lw": 5},
)
p1.set_xlabel('Test Labels')
p1.set_ylabel('Predicted Labels')
p1.set_title('Random Forest CV Model: Test Vs Predicted Labels')
plt.show()

In [None]:
###########################  RESULT IS PASSED TO THE NEXT LINE (NO NEED TO DO THIS AGAIN)  #######################
################# Random Forest with Randomized grid search ######################################################
# Imported here so the cell also runs on its own.
from pprint import pprint

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; the string 'auto'
# is deprecated/removed in newer scikit-learn releases, the float works in old and new ones.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Not used by the search below, which optimises a single metric; kept because it is a
# notebook-level name.
scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)


########## (TAKES A LONG TIME SINCE ITS A GRID SEARCH) Use the random grid to search for best hyperparameters ############
# First create the base model to tune
rfr = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rfr_random = RandomizedSearchCV(estimator=rfr, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error',
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rfr_random.fit(train_features, train_labels)

# Best parameter combination found by the search (only the last expression of a cell is
# displayed, so this value is not shown)
rfr_random.best_params_

# Estimator refitted with the best parameters; displayed as the cell output
best_random = rfr_random.best_estimator_
best_random

In [None]:
# Cross-validate a forest configured with the tuned hyper-parameters.
# Only the non-default settings are passed. The removed arguments were all defaults, and
# three of them (criterion='mse', max_features='auto', min_impurity_split) are rejected by
# newer scikit-learn releases; the defaults select the same behaviour in old and new versions.
# NOTE(review): this stores the estimator as an attribute named `cv` on `rfr`; presumably a
# separate variable was intended. The name is kept in case other cells read `rfr.cv`.
rfr.cv = RandomForestRegressor(n_estimators=1400, max_depth=70,
                               min_samples_leaf=2, random_state=42)

# 10-fold cross validation on the transformed data; the target is flattened to 1-D to
# avoid a column-vector conversion warning on every fold.
scores = cross_validate(rfr.cv, x_t3, y_t4.ravel(), cv = 10, scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'])

print('Random Forest Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
#############################################################
########## let's remove 'year' feature and re-run random forest again #############
# load solararray complete data using pandas and drop the index, name and year columns
df=pd.read_csv('../input/df_solararray_complete.csv')
df=df.drop(['Unnamed: 0', 'Location', 'Year'], axis=1)

# NOTE(review): `names` still contains the target column; pairing it by position with
# feature_importances_ only lines up if the target is the last column - confirm.
names=list(df.columns)

# labels are the values we want to predict
labels=np.array(df['Electricity_KW_HR'])

# Remove the labels from the features
# axis 1 refers to the columns
features=df.drop('Electricity_KW_HR',axis=1)

# saving feature names for later use: taken from the feature frame, not from df,
# so the target column is not listed as a feature
features_list=list(features.columns)

#convert to numpy array
features=np.array(features)

# normalize using Yeo-Johnson (PowerTransformer's default method):
# pt3 transforms the features, pt4 the target (kept so predictions can be inverted later)
pt3 = PowerTransformer()
x_t3=pt3.fit_transform(features)
pt4 = PowerTransformer()
y_t4=pt4.fit_transform(labels.reshape(-1,1))

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(x_t3, y_t4, test_size = 0.20, random_state = 42)

# let's review the shape of each split
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Display the current hold-out labels.
test_labels

In [None]:
############# Random Forest Model #################
from pprint import pprint

# 100-tree forest with a fixed seed, trained on the training split (fit() returns the estimator).
rfr = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train_features, train_labels)

# Predict on the hold-out split and on the data the forest was fitted on.
predictions = rfr.predict(test_features)
pred_trn = rfr.predict(train_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# Training metrics, reported next to the hold-out values.
train_r2 = round(rfr.score(train_features, train_labels), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

# Label/value pairs in the order they are printed.
_metric_rows = [
    ('\n\nTest R-squared:\t\t', test_r2),
    ('\nTrain R-squared:\t', train_r2),
    ('\nRMSE:\t\t\t', rmse),
    ('\nTrain RMSE:\t\t', rmse_trn),
    ('\nMSE:\t\t\t', mse),
    ('\nTrain MSE:\t\t', mse_trn),
    ('\nExplained Variance:\t', exp_var),
    ('\nTrain Explained Variance:', exp_var_trn),
    ('\nMAE:\t\t\t', mae),
    ('\nTrain MAE:\t\t', mae_trn),
]
print('Random Forest Regression Model Metrics:', *[cell for row in _metric_rows for cell in row])

# Dump every hyper-parameter of the fitted forest.
print('Parameters currently in use:\n')
pprint(rfr.get_params())

In [None]:
# Pair each importance (rounded to 4 decimals) with a name from `names` by position
# and print the pairs from most to least important.
# NOTE(review): zip stops at the shorter input; confirm `names` lines up with the model's feature columns.
print("Features sorted by their score:")
print(sorted(
    ((round(score, 4), name) for score, name in zip(rfr.feature_importances_, names)),
    reverse=True,
))

In [None]:
# Line plot of actual vs. predicted Electricity_KW_HR on the original scale.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards.
plt.rcParams["figure.figsize"] = [30,15]

# Both series are mapped back through pt4. The predicted curve used to come from `x`,
# which was computed in an earlier cell from a previous model's predictions and never refreshed.
# NOTE(review): assumes `predictions` holds the latest forest output on pt4's transformed scale - confirm.
actual_kwh = pt4.inverse_transform(test_labels.reshape(-1,1))[:,0]
predicted_kwh = pt4.inverse_transform(predictions.reshape(-1,1))[:,0]

plt.plot(actual_kwh, alpha=0.2, color='blue')
plt.plot(predicted_kwh, alpha=0.2, color='red')
plt.xlabel('index')
plt.ylabel('Electricity_KW_HR')
plt.title('Actual vs. Predicted Electricity_KW_HR')

actual_patch = mpatches.Patch(color='blue', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='red', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
# Same actual-vs-predicted line plot, restricted to the first 100 test rows for readability.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards.
plt.rcParams["figure.figsize"] = [30,15]

# Both series are mapped back through pt4. The predicted curve used to come from `x`,
# which was computed in an earlier cell from a previous model's predictions and never refreshed.
# NOTE(review): assumes `predictions` holds the latest forest output on pt4's transformed scale - confirm.
actual_kwh = pt4.inverse_transform(test_labels.reshape(-1,1))[0:100,0]
predicted_kwh = pt4.inverse_transform(predictions.reshape(-1,1))[0:100,0]

plt.plot(actual_kwh, alpha=0.2, color='blue')
plt.plot(predicted_kwh, alpha=0.2, color='red')
plt.xlabel('index')
plt.ylabel('Electricity_KW_HR')
plt.title('Actual vs. Predicted Electricity_KW_HR')

actual_patch = mpatches.Patch(color='blue', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='red', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
########################## RANDOM FOREST WITH CROSS_VALIDATION #########################################
# NOTE(review): this stores the new estimator as an attribute named `cv` on the existing
# `rfr` object; presumably a separate variable (e.g. rfr_cv) was intended. The name is kept
# in case other cells read `rfr.cv`.
rfr.cv = RandomForestRegressor(n_estimators=100, random_state = 42)

# 10-fold cross validation on the transformed data. y_t4 is an (n, 1) column vector, so it
# is flattened to 1-D to avoid a conversion warning on every fold.
scores = cross_validate(rfr.cv, x_t3, y_t4.ravel(), cv = 10, scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'])

print('Random Forest Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
# Cross-validate a forest configured with the tuned hyper-parameters.
# Only the non-default settings are passed. The removed arguments were all defaults, and
# three of them (criterion='mse', max_features='auto', min_impurity_split) are rejected by
# newer scikit-learn releases; the defaults select the same behaviour in old and new versions.
# NOTE(review): this stores the estimator as an attribute named `cv` on `rfr`; presumably a
# separate variable was intended. The name is kept in case other cells read `rfr.cv`.
rfr.cv = RandomForestRegressor(n_estimators=1400, max_depth=70,
                               min_samples_leaf=2, random_state=42)

# 10-fold cross validation on the transformed data. y_t4 is an (n, 1) column vector, so it
# is flattened to 1-D to avoid a conversion warning on every fold.
scores = cross_validate(rfr.cv, x_t3, y_t4.ravel(), cv = 10, scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'])

print('Random Forest Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

In [None]:
#############################  Support Vector Machine ##################################
from sklearn.svm import SVR

# Not referenced anywhere else in this cell.
n_samples, n_features = 10, 5

# Support vector regressor, trained on the training split (fit() returns the estimator).
clf = SVR(gamma='scale', C=1.0, epsilon=0.2).fit(train_features, train_labels)

# Predict on the hold-out split and on the data the model was fitted on.
predictions = clf.predict(test_features)
pred_trn = clf.predict(train_features)

# Hold-out (test) metrics.
test_r2 = round(r2_score(test_labels, predictions), 3)
rmse = round(np.sqrt(mean_squared_error(test_labels, predictions)), 5)
mse = round(mean_squared_error(test_labels, predictions), 5)
exp_var = round(explained_variance_score(test_labels, predictions), 3)
mae = round(mean_absolute_error(predictions, test_labels), 5)

# Training metrics, reported next to the hold-out values.
train_r2 = round(clf.score(train_features, train_labels), 3)
rmse_trn = round(np.sqrt(mean_squared_error(train_labels, pred_trn)), 5)
mse_trn = round(mean_squared_error(train_labels, pred_trn), 5)
exp_var_trn = round(explained_variance_score(train_labels, pred_trn), 3)
mae_trn = round(mean_absolute_error(pred_trn, train_labels), 5)

# Label/value pairs in the order they are printed.
_metric_rows = [
    ('\n\nTest R-squared:\t\t', test_r2),
    ('\nTrain R-squared:\t', train_r2),
    ('\nRMSE:\t\t\t', rmse),
    ('\nTrain RMSE:\t\t', rmse_trn),
    ('\nMSE:\t\t\t', mse),
    ('\nTrain MSE:\t\t', mse_trn),
    ('\nExplained Variance:\t', exp_var),
    ('\nTrain Explained Variance:', exp_var_trn),
    ('\nMAE:\t\t\t', mae),
    ('\nTrain MAE:\t\t', mae_trn),
]
print('SUPPORT VECTOR MACHINE Model Metrics:', *[cell for row in _metric_rows for cell in row])

In [None]:
########################## SUPPORT VECTOR MACHINE WITH CROSS_VALIDATION #########################################
# 10-fold cross validation of the SVR configured above on the transformed data.
# SVR is a single-output estimator: the target is flattened to 1-D (as in the ElasticNetCV
# cell) so each fold does not emit a column-vector conversion warning.
scores = cross_validate(clf, x_t3, y_t4.ravel(), cv = 10, scoring=['r2','neg_mean_squared_error','neg_mean_absolute_error', 'explained_variance'])

print('Support Vector Machine Model:')
# metric_CV is defined elsewhere in the notebook; presumably it summarises the global `scores`.
metric_CV()

## Let's read the scenario file now for prediction on those six days

In [None]:
# Load the complete training data and compute, for each scenario date, the mean
# Electricity_KW_HR per (Month, Day, Hour). These are the "actual" curves the predictions
# are compared against later.
features=pd.read_csv('../input/df_solararray_complete.csv')
features=features.drop(['Unnamed: 0', 'Location', 'Year'], axis=1)

# Rows belonging to each of the six dates.
q315=features.query('Month == 3 & Day == 15')
q626=features.query('Month == 6 & Day == 26')
# NOTE(review): the "713" names and the later plot title say July 13th, but the filter
# selects Day == 3 - confirm which date is intended.
q713=features.query('Month == 7 & Day == 3')
q1013=features.query('Month == 10 & Day == 13')
q1119=features.query('Month == 11 & Day == 19')
q1225=features.query('Month == 12 & Day == 25')

# Group each date by hour.
_hourly_keys = ['Month','Day','Hour']
g315=q315.groupby(_hourly_keys)
g626=q626.groupby(_hourly_keys)
g713=q713.groupby(_hourly_keys)
g1013=q1013.groupby(_hourly_keys)
g1119=q1119.groupby(_hourly_keys)
g1225=q1225.groupby(_hourly_keys)

# Hourly mean of the target. The string 'mean' replaces the np.mean callable, which
# newer pandas versions deprecate for groupby aggregation; the result is the same.
t315=pd.DataFrame(g315['Electricity_KW_HR'].agg('mean'))
t626=pd.DataFrame(g626['Electricity_KW_HR'].agg('mean'))
t713=pd.DataFrame(g713['Electricity_KW_HR'].agg('mean'))
t1013=pd.DataFrame(g1013['Electricity_KW_HR'].agg('mean'))
t1119=pd.DataFrame(g1119['Electricity_KW_HR'].agg('mean'))
t1225=pd.DataFrame(g1225['Electricity_KW_HR'].agg('mean'))

t315

In [None]:
# load scenario (test file) data using pandas and then sort the values by hour to keep the date and the hours the same
features_snr=pd.read_csv('../input/scenario.csv')
features_snr=features_snr.drop(['Unnamed: 0', 'City', 'Year', 'Day_of_week', 'HolidayName', 'School_Day', 'Weekdays'], axis=1)

# Impute missing values with each column's mean.
# fillna(np.mean) passed the *function object* as the fill value, so NaNs were replaced by
# the function itself instead of by a number.
# NOTE(review): a column that is entirely NaN has no mean and stays NaN - check the
# isna() summary displayed at the end of this cell.
features_snr=features_snr.fillna(features_snr.mean(numeric_only=True))

# these are the six date values we will need for predicting
a=features_snr.query('Month == 3 & Day == 15').sort_values(by=['Hour'])
b=features_snr.query('Month == 6 & Day == 26').sort_values(by=['Hour'])
# NOTE(review): later names/titles call this day "713" / July 13th, but the filter selects Day == 3 - confirm.
c=features_snr.query('Month == 7 & Day == 3').sort_values(by=['Hour'])
# d and f are used by the prediction cell (rfr1.predict(d) / rfr1.predict(f)), so they are
# defined here like the other four days instead of being left commented out.
d=features_snr.query('Month == 10 & Day == 13').sort_values(by=['Hour'])
e=features_snr.query('Month == 11 & Day == 19').sort_values(by=['Hour'])
f=features_snr.query('Month == 12 & Day == 25').sort_values(by=['Hour'])

# Remaining missing values per column (all zeros when the imputation succeeded).
features_snr.isna().sum()

In [None]:
# Display the scenario frame.
features_snr

In [None]:
# fit the random forest regression model without 'Location' and 'year' using Yeo-Johnson,
# then predict Electricity_KW_HR for the scenario days

# load solararray complete data using pandas
features=pd.read_csv('../input/df_solararray_complete.csv')
features=features.drop(['Unnamed: 0', 'Location', 'Year'], axis=1)
names=list(features.columns)

# labels are the values we want to predict
labels=np.array(features['Electricity_KW_HR'])

# Remove the labels from the features
# axis 1 refers to the columns
features=features.drop('Electricity_KW_HR',axis=1)

# saving feature names for later use
feature_list=list(features.columns)

#convert to numpy array
features=np.array(features)

############# Yeo-Johnson transformation #################
# pt9 is fitted on the training features, pt10 on the training target.
pt9 = PowerTransformer()
pt10 = PowerTransformer()
X_t=pt9.fit_transform(features)
y_t=pt10.fit_transform(labels.reshape(-1,1))

# Scenario days go through the transformer fitted on the TRAINING features. Fitting a fresh
# transformer on each single day (the previous pt11.fit_transform calls) rescales every day
# by its own statistics, so the model received inputs on a different scale than it was
# trained on.
# NOTE(review): columns are matched by position, as before - confirm that the scenario
# columns are in the same order as `feature_list`.
a_t=pt9.transform(np.asarray(a))
b_t=pt9.transform(np.asarray(b))
c_t=pt9.transform(np.asarray(c))
e_t=pt9.transform(np.asarray(e))

# random forest regression model with the tuned hyper-parameters.
# Only non-default settings are passed: criterion='mse', max_features='auto' and
# min_impurity_split are rejected by newer scikit-learn releases, and the defaults select
# the same behaviour in old and new versions.
rfr1 = RandomForestRegressor(n_estimators=1400, max_depth=70,
                             min_samples_leaf=2, random_state=42)

# The forest expects a 1-D target; y_t is an (n, 1) column vector.
rfr1.fit(X_t, y_t.ravel())

# Predict on the transformed scale, then map back to Electricity_KW_HR through pt10.
pred315=rfr1.predict(a_t)
prediction315=pt10.inverse_transform(pred315.reshape(-1,1))

pred626=rfr1.predict(b_t)
prediction626=pt10.inverse_transform(pred626.reshape(-1,1))

pred713=rfr1.predict(c_t)
prediction713=pt10.inverse_transform(pred713.reshape(-1,1))

pred1119=rfr1.predict(e_t)
prediction1119=pt10.inverse_transform(pred1119.reshape(-1,1))

# October 13th and December 25th are predicted by a forest trained on the untransformed
# data. One refit is enough: with the fixed random_state the second, identical fit
# reproduced the same model.
# NOTE(review): `d` and `f` are expected to hold the Oct 13 / Dec 25 scenario rows; confirm
# they are defined before this cell runs.
rfr1.fit(features, labels)
prediction1013=rfr1.predict(d)
prediction1225=rfr1.predict(f)

In [None]:
# Display the predictions for the first scenario day.
prediction315

In [None]:
#plot the Pred_KW_HR vs. actual KW_HR for March 15th
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards.
plt.rcParams["figure.figsize"] = [20,15]

# Actual hourly means (orange) against the forest's predictions (green).
plt.plot(np.array(t315['Electricity_KW_HR']), alpha=0.7, color='orange')  ######## columns may not be correct for test location
plt.plot(prediction315, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('March 15th Day - Actual vs. Predicted Electricity (KWh)')

green_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
orange_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
plt.legend(handles=[green_patch, orange_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Root-mean-squared error between the actual hourly means and the predictions for March 15th.
_mse_march15 = mean_squared_error(t315, prediction315)
rmse = round(np.sqrt(_mse_march15), 5)
print('RMSE between actual vs predicted for March 15th date is: ', rmse)

In [None]:
# Plot actual hourly means (orange) against the forest's predictions (green) for June 26th.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# The figure size must be set before the first plot call: rcParams only apply to
# figures created afterwards.
plt.rcParams["figure.figsize"] = [20,15]

plt.plot(np.array(t626['Electricity_KW_HR']), alpha=0.7, color='orange')  ######## columns may not be correct for test location
plt.plot(prediction626, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('June 26th Day - Actual vs. Predicted Electricity (KWh)')

# Legend handles are named after what they represent (the old names were swapped
# relative to their colours).
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for July 13th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in orange.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t713['Electricity_KW_HR']), alpha=0.7, color='orange')
# Model predictions in green.
plt.plot(prediction713, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('July 13th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for October 13th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in orange.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1013['Electricity_KW_HR']), alpha=0.7, color='orange')
# Model predictions in green.
plt.plot(prediction1013, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
# Month name capitalised to match the other plot titles.
plt.title('October 13th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for November 19th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in orange.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1119['Electricity_KW_HR']), alpha=0.7, color='orange')
# Model predictions in green.
plt.plot(prediction1119, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('November 19th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for December 25th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in orange.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1225['Electricity_KW_HR']), alpha=0.7, color='orange')
# Model predictions in green.
plt.plot(prediction1225, alpha=0.7, color='g')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('Dec 25th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
actual_patch = mpatches.Patch(color='orange', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='g', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Fit a regressor on quantile-transformed data (without 'Location' and 'Year')
# and predict the six hold-out days.  The day inputs a..f come from earlier cells.

# load solararray complete data using pandas
features = pd.read_csv('../input/df_solararray_complete.csv')
features = features.drop(['Unnamed: 0', 'Location', 'Year'], axis=1)
names = list(features.columns)

# labels are the values we want to predict
labels = np.array(features['Electricity_KW_HR'])

# Remove the labels from the features (axis 1 refers to the columns)
features = features.drop('Electricity_KW_HR', axis=1)

# saving feature names for later use
feature_list = list(features.columns)

# convert to numpy array
features = np.array(features)

############# Quantile transformation #################
# qt7 maps the input features, qt8 maps the target; both are fitted on the
# full training data only.
# NOTE(review): QuantileTransformer is used with its default output
# distribution; a "quantile normal" transform would need
# output_distribution='normal' -- confirm which one is intended.
qt7 = QuantileTransformer(n_quantiles=10, random_state=0)
qt8 = QuantileTransformer(n_quantiles=10, random_state=0)

X_t = qt7.fit_transform(features)
y_t = qt8.fit_transform(labels.reshape(-1, 1))

# Transform each hold-out day with the transformer fitted on the training
# features.  Fitting a separate transformer per day would map every day onto
# its own quantiles, so the model would see inputs on a different scale than
# the one it was trained on.
a_t = qt7.transform(np.asarray(a))
b_t = qt7.transform(np.asarray(b))
c_t = qt7.transform(np.asarray(c))
d_t = qt7.transform(np.asarray(d))
e_t = qt7.transform(np.asarray(e))
f_t = qt7.transform(np.asarray(f))

# Xgboost with quantile transformation.  A random forest alternative was
# tried here previously:
#rfr2 = RandomForestRegressor(n_estimators=1400, random_state=42)
# NOTE(review): newer XGBoost releases report "reg:linear" as deprecated in
# favour of "reg:squarederror"; kept unchanged -- confirm the installed version
# before switching.
rfr2 = xgb.XGBRegressor(objective="reg:linear", random_state=42)
rfr2.fit(X_t, y_t)

# now let's predict for 6 dates; qt8.inverse_transform maps the predictions
# back from the transformed target space.
pred315 = rfr2.predict(a_t)
prediction315 = qt8.inverse_transform(pred315.reshape(-1, 1))

pred626 = rfr2.predict(b_t)
prediction626 = qt8.inverse_transform(pred626.reshape(-1, 1))

pred713 = rfr2.predict(c_t)
prediction713 = qt8.inverse_transform(pred713.reshape(-1, 1))

pred1013 = rfr2.predict(d_t)
prediction1013 = qt8.inverse_transform(pred1013.reshape(-1, 1))

pred1119 = rfr2.predict(e_t)
prediction1119 = qt8.inverse_transform(pred1119.reshape(-1, 1))

pred1225 = rfr2.predict(f_t)
prediction1225 = qt8.inverse_transform(pred1225.reshape(-1, 1))

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for March 15th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t315['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction315, alpha=1, color='blue')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('March 15th Day - Actual vs. Predicted Electricity (KWh)')

# Legend colours must match the lines above: red is the actual series and blue
# the prediction (the labels were previously attached to the wrong colours).
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='blue', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='upper right', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# RMSE between the actual and predicted hourly output for March 15th.
# Compare only the target column: t315 is indexed by 'Electricity_KW_HR' in the
# plotting cells, so passing the whole frame would score the wrong data (or
# fail on a shape mismatch).  The predictions are flattened to 1-D to match.
rmse = round(
    np.sqrt(
        mean_squared_error(
            np.asarray(t315['Electricity_KW_HR']),
            np.ravel(prediction315),
        )
    ),
    5,
)
print('RMSE between actual vs predicted for March 15th date is: ',rmse)

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for June 26th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t626['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction626, alpha=1, color='blue')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('June 26th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='blue', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='best', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for July 13th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t713['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction713, alpha=1, color='b')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('July 13th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='b', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='best', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for October 13th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1013['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction1013, alpha=1, color='b')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('October 13th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='b', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='best', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for November 19th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1119['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction1119, alpha=1, color='b')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('November 19th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='b', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='best', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Plot the predicted vs. actual hourly Electricity_KW_HR for December 25th.
import matplotlib.patches as mpatches

# The figure size must be set before plotting: rcParams are read when a figure
# is created, so assigning it after plt.plot() would not affect this figure.
plt.rcParams["figure.figsize"] = [20,15]

# Actual values in red.  NOTE: columns may not be correct for test location.
plt.plot(np.array(t1225['Electricity_KW_HR']), alpha=1, color='red')
# Model predictions in blue.
plt.plot(prediction1225, alpha=1, color='b')
plt.xlabel('Hours')
plt.ylabel('Electricity_KW_HR')
plt.title('Dec 25th Day - Actual vs. Predicted Electricity (KWh)')

# Legend is built from proxy patches whose colours match the two lines above.
# NOTE(review): the predictions plotted here come from rfr2, which is fitted as
# an XGBRegressor, so the "by RF" wording may be stale -- confirm before renaming.
actual_patch = mpatches.Patch(color='red', label='Actual Electricity_KW_HR')
predicted_patch = mpatches.Patch(color='b', label='Predicted Electricity_KW_HR by RF')
plt.legend(handles=[actual_patch, predicted_patch], loc='best', bbox_to_anchor=(1, 0.5))
plt.show()