In [None]:
# Title:     SmartHome Dataset
# Authors:   Aline J
# Date:      01/06/2020
# Goal:      Analyze and predict energy use efficiency 
#            in a smarthouse during a year under variable weather conditions.

# Questions: 
# 1) What are the variables that control generated energy? 
# 2) What are the variables that control used energy? 
# 3) What are the variables that control energy efficiency?

# Method. Supervised machine learning, specifically random forest regression analysis.
# Results: Used CleanData
#         Several attempts to run the analysis were made at different frequencies. 
#         Monthly-frequency data gave the highest accuracy (76%) vs. minute data (0.01%).  
#         A random state as low as 50 was enough to explain the variability in the dataset.  
#         The same accuracy was obtained when running with a random state of 1000.


In [1]:
### Begins Here ####
# 1. Import dependencies
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\programdata\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
# Other dependencies
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats
import string as string
import warnings
warnings.filterwarnings('ignore')

In [30]:
# 2. Load the raw CSV and keep only its non-object (numeric) columns
csv_path = "Data/HomeC.csv"
my_data = pd.read_csv(csv_path, parse_dates=True)

# Columns pandas read as `object` are left out of the working frame
home_dat = my_data.select_dtypes(exclude=['object'])

# Report the size of the raw frame (before the object columns were excluded)
nRow = my_data.shape[0]
nCol = my_data.shape[1]
print(f'There are {nRow} rows and {nCol} columns')


There are 503911 rows and 32 columns


In [31]:
# A cell only renders its last bare expression, so show both ends explicitly.
display(home_dat.head(5), home_dat.tail(5))


Unnamed: 0,use [kW],gen [kW],House overall [kW],Dishwasher [kW],Furnace 1 [kW],Furnace 2 [kW],Home office [kW],Fridge [kW],Wine cellar [kW],Garage door [kW],...,temperature,humidity,visibility,apparentTemperature,pressure,windSpeed,windBearing,precipIntensity,dewPoint,precipProbability
503906,1.599333,0.003233,1.599333,5e-05,0.104017,0.625033,0.04175,0.005233,0.008433,0.013433,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
503907,1.924267,0.003217,1.924267,3.3e-05,0.422383,0.637733,0.042033,0.004983,0.008467,0.012933,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
503908,1.9782,0.003217,1.9782,5e-05,0.495667,0.620367,0.0421,0.005333,0.008233,0.012817,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
503909,1.99095,0.003233,1.99095,5e-05,0.4947,0.634133,0.0421,0.004917,0.008133,0.012833,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
503910,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Index the frame by a 1-minute DatetimeIndex starting at 2016-01-01 05:00.
# The number of periods is taken from the frame itself so the cell keeps working
# if the row count changes (e.g. after the trailing empty row has been dropped).
time_index = pd.date_range('2016-01-01 05:00', periods=len(home_dat), freq='min')
home_dat = home_dat.set_index(time_index)

In [33]:
# Check the frame at both ends; a cell only renders its last bare expression,
# so both previews are passed to display().
display(home_dat.head(5), home_dat.tail(5))

Unnamed: 0,use [kW],gen [kW],House overall [kW],Dishwasher [kW],Furnace 1 [kW],Furnace 2 [kW],Home office [kW],Fridge [kW],Wine cellar [kW],Garage door [kW],...,temperature,humidity,visibility,apparentTemperature,pressure,windSpeed,windBearing,precipIntensity,dewPoint,precipProbability
2016-12-16 03:26:00,1.599333,0.003233,1.599333,5e-05,0.104017,0.625033,0.04175,0.005233,0.008433,0.013433,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:27:00,1.924267,0.003217,1.924267,3.3e-05,0.422383,0.637733,0.042033,0.004983,0.008467,0.012933,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:28:00,1.9782,0.003217,1.9782,5e-05,0.495667,0.620367,0.0421,0.005333,0.008233,0.012817,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:29:00,1.99095,0.003233,1.99095,5e-05,0.4947,0.634133,0.0421,0.004917,0.008133,0.012833,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:30:00,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Strip the literal ' [kW]' suffix from the column labels
# (regex=False: the brackets must not be read as a character class)
home_dat.columns = home_dat.columns.str.replace(' [kW]', '', regex=False)
home_dat.columns

Index(['use', 'gen', 'House overall', 'Dishwasher', 'Furnace 1', 'Furnace 2',
       'Home office', 'Fridge', 'Wine cellar', 'Garage door', 'Kitchen 12',
       'Kitchen 14', 'Kitchen 38', 'Barn', 'Well', 'Microwave', 'Living room',
       'Solar', 'temperature', 'humidity', 'visibility', 'apparentTemperature',
       'pressure', 'windSpeed', 'windBearing', 'precipIntensity', 'dewPoint',
       'precipProbability'],
      dtype='object')

In [35]:
# Remove every column that contains nothing but missing values
home_dat = home_dat.dropna(axis=1, how='all')

In [36]:
# The last row of the file holds no readings (every value is NaN).
# Drop all-NaN rows by content instead of slicing off the last position:
# a positional slice removes one more real row every time the cell is re-run.
home_dat = home_dat.dropna(axis='index', how='all')
home_dat.tail(3)


Unnamed: 0,use,gen,House overall,Dishwasher,Furnace 1,Furnace 2,Home office,Fridge,Wine cellar,Garage door,...,temperature,humidity,visibility,apparentTemperature,pressure,windSpeed,windBearing,precipIntensity,dewPoint,precipProbability
2016-12-16 03:27:00,1.924267,0.003217,1.924267,3.3e-05,0.422383,0.637733,0.042033,0.004983,0.008467,0.012933,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:28:00,1.9782,0.003217,1.9782,5e-05,0.495667,0.620367,0.0421,0.005333,0.008233,0.012817,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51
2016-12-16 03:29:00,1.99095,0.003233,1.99095,5e-05,0.4947,0.634133,0.0421,0.004917,0.008133,0.012833,...,35.12,0.86,8.74,29.45,1011.49,6.72,186.0,0.0101,31.27,0.51


In [37]:
# Collapse the per-circuit readings into one row-wise total per appliance group
furnace_cols = ['Furnace 1', 'Furnace 2']
kitchen_cols = ['Kitchen 12', 'Kitchen 14', 'Kitchen 38']

home_dat['Furnace'] = home_dat[furnace_cols].sum(axis='columns')
home_dat['Kitchen'] = home_dat[kitchen_cols].sum(axis='columns')

home_dat.columns

Index(['use', 'gen', 'House overall', 'Dishwasher', 'Furnace 1', 'Furnace 2',
       'Home office', 'Fridge', 'Wine cellar', 'Garage door', 'Kitchen 12',
       'Kitchen 14', 'Kitchen 38', 'Barn', 'Well', 'Microwave', 'Living room',
       'Solar', 'temperature', 'humidity', 'visibility', 'apparentTemperature',
       'pressure', 'windSpeed', 'windBearing', 'precipIntensity', 'dewPoint',
       'precipProbability', 'Furnace', 'Kitchen'],
      dtype='object')

In [38]:
# Summary statistics for every column; preview the first five statistic rows
DescriptiveStats = home_dat.describe()
DescriptiveStats.iloc[:5]

Unnamed: 0,use,gen,House overall,Dishwasher,Furnace 1,Furnace 2,Home office,Fridge,Wine cellar,Garage door,...,visibility,apparentTemperature,pressure,windSpeed,windBearing,precipIntensity,dewPoint,precipProbability,Furnace,Kitchen
count,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,...,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0,503910.0
mean,0.858962,0.076229,0.858962,0.031368,0.09921,0.136779,0.081287,0.063556,0.042137,0.014139,...,9.253444,48.263382,1016.301625,6.649936,202.356843,0.002598,38.694013,0.056453,0.235989,0.009788
std,1.058207,0.128428,1.058207,0.190951,0.169059,0.178631,0.104466,0.076199,0.057967,0.014292,...,1.611186,22.027916,7.895185,3.982716,106.520474,0.011257,19.087939,0.165836,0.273885,0.079883
min,0.0,0.0,0.0,0.0,1.7e-05,6.7e-05,8.3e-05,6.7e-05,1.7e-05,1.7e-05,...,0.27,-32.08,986.4,0.0,0.0,0.0,-27.24,0.0,0.0003,0.0
25%,0.367667,0.003367,0.367667,0.0,0.020233,0.0644,0.040383,0.005083,0.007133,0.012733,...,9.42,31.09,1011.29,3.66,148.0,0.0,24.6,0.0,0.0853,0.0006


In [39]:
# Per-minute frame with the energy, appliance and weather columns used for modelling.
# DataFrame.filter keeps the listed columns that exist and ignores any that do not.
clean_columns = [
    'gen', 'use', 'House overall',
    'Furnace', 'Home office', 'Living room',
    'Wine cellar', 'Garage door', 'Kitchen',
    'Barn', 'Well', 'Fridge', 'Microwave', 'Dishwasher',
    'temperature', 'humidity',
    'visibility', 'apparentTemperature',
    'pressure', 'windSpeed', 'windBearing', 'precipIntensity', 'dewPoint',
]
CleanData_per_min = home_dat.filter(items=clean_columns)

In [40]:
# Per-minute frame holding generated energy plus the weather columns
weather_columns = [
    'gen', 'temperature', 'humidity',
    'visibility', 'apparentTemperature',
    'pressure', 'windSpeed', 'windBearing', 'precipIntensity', 'dewPoint',
]
WeatherData_per_min = home_dat.filter(items=weather_columns)

In [41]:
# Per-minute frame holding generated energy, the house total and the room/appliance columns
room_columns = [
    'gen', 'House overall',
    'Furnace', 'Home office', 'Living room',
    'Wine cellar', 'Garage door', 'Kitchen',
    'Barn', 'Well', 'Fridge', 'Microwave', 'Dishwasher',
]
RoomsData_per_min = home_dat.filter(items=room_columns)

In [42]:
# 'EneEffi' is the element-wise difference use - gen
# NOTE(review): this is a difference, not a ratio — confirm it is the intended "efficiency" measure
home_dat['EneEffi'] = home_dat['use'].sub(home_dat['gen'])

In [43]:
# Per-minute frame with the energy-balance columns only
energy_columns = ['EneEffi', 'gen', 'use', 'House overall']
EnergyData_per_min = home_dat.filter(items=energy_columns)

In [44]:
# The frame built in the previous cell is named EnergyData_per_min;
# `EnergyData` is never defined and raises NameError on a fresh kernel.
EnergyData_per_min.tail(3)

Unnamed: 0,EneEffi,gen,use,House overall
2016-12-16 03:27:00,1.92105,0.003217,1.924267,1.924267
2016-12-16 03:28:00,1.974983,0.003217,1.9782,1.9782
2016-12-16 03:29:00,1.987717,0.003233,1.99095,1.99095


In [27]:
# Summaries
# Aggregate each per-minute frame to daily ('D') and month-end ('M') totals.
# Every summary is built from its own *_per_min frame: the names `CleanData` and
# `WeatherData` are not defined in this notebook, and the Rooms/Energy summaries
# must not be derived from the weather frame.
# NOTE(review): .sum() is also applied to the weather columns (temperature, humidity, ...);
# a sum grows with the number of minutes in the period — confirm that a mean is not intended.
CleanData_per_day = CleanData_per_min.resample('D').sum()
WeatherData_per_day = WeatherData_per_min.resample('D').sum()
RoomsData_per_day = RoomsData_per_min.resample('D').sum()
EnergyData_per_day = EnergyData_per_min.resample('D').sum()

CleanData_per_month = CleanData_per_min.resample('M').sum()
WeatherData_per_month = WeatherData_per_min.resample('M').sum()
RoomsData_per_month = RoomsData_per_min.resample('M').sum()
EnergyData_per_month = EnergyData_per_min.resample('M').sum()

In [29]:
# Write every per-minute, per-day and per-month frame to a CSV file in the working directory
frames_to_save = {
    'CleanData_per_min.csv': CleanData_per_min,
    'EnergyData_per_min.csv': EnergyData_per_min,
    'RoomsData_per_min.csv': RoomsData_per_min,
    'WeatherData_per_min.csv': WeatherData_per_min,
    'CleanData_per_day.csv': CleanData_per_day,
    'EnergyData_per_day.csv': EnergyData_per_day,
    'RoomsData_per_day.csv': RoomsData_per_day,
    'WeatherData_per_day.csv': WeatherData_per_day,
    'CleanData_per_month.csv': CleanData_per_month,
    'EnergyData_per_month.csv': EnergyData_per_month,
    'RoomsData_per_month.csv': RoomsData_per_month,
    'WeatherData_per_month.csv': WeatherData_per_month,
}
for file_name, frame in frames_to_save.items():
    frame.to_csv(file_name)

In [45]:
# Data Preparation
# Show the column names together with their dtypes for the monthly frame.
# (.dtypes already lists every column; a separate bare `.columns` line above it
# would be evaluated and thrown away, since only the last expression is rendered.)
CleanData_per_month.dtypes

gen                    float64
use                    float64
House overall          float64
Furnace                float64
Home office            float64
Living room            float64
Wine cellar            float64
Garage door            float64
Kitchen                float64
Barn                   float64
Well                   float64
Fridge                 float64
Microwave              float64
Dishwasher             float64
temperature            float64
humidity               float64
visibility             float64
apparentTemperature    float64
pressure               float64
windSpeed              float64
windBearing            float64
precipIntensity        float64
dewPoint               float64
dtype: object

In [None]:
# Run the random forest on the monthly aggregates (CleanData_per_month) to investigate
# the driving variables. Point `features` at another frame to try a different frequency.
features = CleanData_per_month
features.columns

In [None]:
# Separate the prediction target from the predictors.
# This cell rebinds `features` from a DataFrame to a NumPy array, so re-run the
# previous cell before running this one a second time.
target_column = 'gen'

# Values we want to predict
labels = np.array(features[target_column])

# Everything except the target column is a predictor
features = features.drop(columns=target_column)

# Keep the predictor names for later use, then switch to a plain array
feature_list = features.columns.tolist()
features = np.array(features)

In [None]:
# Hold out 25% of the rows for testing, with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split

split = train_test_split(features, labels, test_size=0.25, random_state=100)
train_features, test_features, train_labels, test_labels = split

In [None]:
# Report the shape of each piece of the train/test split
shape_report = [
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
]
for caption, array in shape_report:
    print(caption, array.shape)

In [None]:
# Show the predictor names; a bare `feature_list.index` only renders the repr of
# the list's bound `index` method, not the names themselves.
feature_list

In [None]:
# Model used for the regression
from sklearn.ensemble import RandomForestRegressor

# 100 trees, fixed seed so repeated runs build the same forest
forest_settings = {'n_estimators': 100, 'random_state': 100}
rf = RandomForestRegressor(**forest_settings)

# Fit on the training split (trailing ';' hides the estimator repr)
rf.fit(train_features, train_labels);

In [None]:
# Predict the held-out labels with the fitted forest
predictions = rf.predict(test_features)
# Absolute error of each prediction
errors = np.abs(predictions - test_labels)
# Mean absolute error (MAE). The label is the 'gen' column, so the error carries that
# column's units; it is not a temperature, so it must not be reported in 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), "(units of 'gen').")
# NOTE(review): an earlier comment here recorded "Mean Absolute Error: 8512.05" — re-run to confirm.

In [None]:
# Mean absolute percentage error (MAPE).
# A percentage error is undefined where the true label is 0 (division by zero gives
# inf/NaN and wrecks the mean), so only rows with a non-zero label are scored.
nonzero = test_labels != 0
mape = 100 * (errors[nonzero] / np.abs(test_labels[nonzero]))
# "Accuracy" is reported as 100 minus the mean percentage error
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
# NOTE(review): an earlier comment here recorded "Accuracy: 95.96 %" and described a
# house-price model; it does not match the 76% quoted in the notebook header — re-run to confirm.

In [None]:
# `all_energy_effi` is not defined anywhere in the visible notebook (NameError on a fresh kernel).
# NOTE(review): presumably it is the monthly use - gen balance (same formula as the
# 'EneEffi' column); it is rebuilt here from CleanData_per_month — confirm this is the intended series.
all_energy_effi = CleanData_per_month['use'] - CleanData_per_month['gen']
plot = all_energy_effi.plot(kind = "bar", figsize = (5,5))
plot.set_title("energy efficiency")

In [None]:
# What are the variables that regulate power generation?
# Tools used to draw a single decision tree
import pydot
from sklearn.tree import export_graphviz

# Take one fitted tree out of the forest for inspection
tree_position = 5
tree = rf.estimators_[tree_position]

In [None]:
# (This cell was an exact copy of the previous one — same imports, same
# `tree = rf.estimators_[5]` — so its statements were removed; the previous cell
# already provides `export_graphviz`, `pydot` and `tree`.)

In [None]:
# Write the selected tree to a Graphviz dot file named 'tree'
dot_path = 'tree'
export_graphviz(tree, out_file=dot_path, feature_names=feature_list, rounded=True, precision=1)

# Load the dot file back; exactly one graph is expected
[graph] = pydot.graph_from_dot_file(dot_path)

# Render the graph as a PNG image
graph.write_png('tree.png')

In [None]:
# Numerical importance of every predictor, as reported by the fitted forest
importances = list(rf.feature_importances_)
# Rank on the unrounded values so features that round to the same number keep their true order
ranked = sorted(zip(feature_list, importances), key = lambda pair: pair[1], reverse = True)
# List of (variable, importance rounded to 2 decimals), most important first
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]
# Print with a plain loop rather than a list comprehension run only for its side effects
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

In [None]:
# `feature_importances` is a plain list of tuples and has no .plot() method, so the
# importances are put into a pandas Series (indexed by feature name) and drawn as bars.
fig, ax = plt.subplots()
importance_series = pd.Series(rf.feature_importances_, index=feature_list).sort_values(ascending=False)
importance_series.plot(kind='bar', ax=ax)
ax.set_title('Random forest feature importances')
ax.set_ylabel('Importance');

In [None]:
# New random forest restricted to two predictors: 'temperature' and 'dewPoint'.
# The two names are hard-coded here, not read from rf.feature_importances_.
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=1000)
# Column positions of the two chosen predictors
important_indices = [feature_list.index('temperature'), feature_list.index('dewPoint')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# Train the random forest
rf_most_important.fit(train_important, train_labels)
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = np.abs(predictions - test_labels)
# Display the performance metrics. The label is the 'gen' column, so the error carries
# that column's units; it is not a temperature, so it must not be reported in 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), "(units of 'gen').")
# Percentage error is undefined where the true label is 0, so score non-zero labels only
nonzero = test_labels != 0
mape = np.mean(100 * (errors[nonzero] / np.abs(test_labels[nonzero])))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

# Note:
# The author's reading of the results: temperature and dew point are the two most
# important variables regulating generated energy.

In [None]:
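# Save the model
# NOTE: scikit-learn estimators have no .save() method, and no `model` variable is defined
# in this notebook; a fitted forest can be persisted with joblib instead, e.g.:
#import joblib
#joblib.dump(rf, "EnergyGenerated_trained.joblib")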
