In [1]:
#python 2/3 compatibility
from __future__ import absolute_import, division, print_function, unicode_literals
#packages
import pathlib
import numpy as np
import pandas as pd
#sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
#prefer the standalone joblib package and fall back to the bundled copy on old installs
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import matplotlib.pyplot as plt
import seaborn as sns
#render matplotlib figures inline in the notebook output
%matplotlib inline
#default figure size (12 x 6), passed through seaborn's rc settings
sns.set(rc={'figure.figsize':(12, 6)})
#apply matplotlib's 'fivethirtyeight' style sheet to all following plots
plt.style.use('fivethirtyeight')



In [2]:
#load the input data from a CSV file in the working directory
#TODO: add code to make the input location configurable
data_path = 'mergeCleCinSave.csv'
dataset = pd.read_csv(data_path)

In [None]:
#view dist + corr of all features, uncomment to run
#sns.pairplot(dataset[['SF', 'Floors','Year Built', 'Value', 'E annual', 'G annual', 'Estimated Savings']], diag_kind="kde")

In [None]:
#view the target feature (Estimated Savings) specifically to ID outliers and dist.
#uncomment to run
#fig, (boxplot, histogram) = plt.subplots(2, sharex=True, figsize=(16, 8), gridspec_kw={"height_ratios": (.15, .85)})
#sns.boxplot(dataset['Estimated Savings'], ax=boxplot)
#sns.distplot(dataset['Estimated Savings'], ax=histogram)
#boxplot.set(xlabel='')
#plt.title('Distribution of "Annual Estimated Savings"', fontsize=24)
#plt.xlabel('');

In [3]:
#hold out 20% of the rows for testing; the other 80% is sampled
#with a fixed seed (random_state=0) so the split is repeatable
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

In [4]:
#file the training statistics are written to, for reuse by the norm. function
scaler_filename = "trainPredSaveScaler.save"

#per-feature summary stats of the training data (one row per feature after
#the transpose), excluding the target column "Estimated Savings".
#NOTE: this cell needs the target column to still be present in train_dataset.
train_stats = (
    train_dataset.describe()
    .drop(columns="Estimated Savings")
    .transpose()
)

joblib.dump(train_stats, scaler_filename)

['trainPredSaveScaler.save']

In [5]:
#separate the target variable (Estimated Savings dollar value) from the features.
#pop() removes the column in place, so train_dataset/test_dataset hold
#feature columns only after this cell has run (run it once per kernel).
target_col = 'Estimated Savings'
train_labels = train_dataset.pop(target_col)
test_labels = test_dataset.pop(target_col)

In [6]:
#Function to normalize train and test datasets to dist. range from train data
def norm(x, stats=None):
    """Standardize features using per-feature mean and standard deviation.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns to scale; columns are aligned by name with the
        index of ``stats``.
    stats : pandas.DataFrame, optional
        Table indexed by feature name with 'mean' and 'std' columns.
        Defaults to the notebook-level ``train_stats`` so existing calls
        ``norm(df)`` keep working; pass it explicitly to reuse saved stats.

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` computed column by column.
    """
    if stats is None:
        stats = train_stats
    # A constant feature has std == 0; dividing by it would give NaN/inf and
    # poison training. Use a divisor of 1 so such a column simply becomes
    # (x - mean).
    safe_std = stats['std'].replace(0, 1)
    return (x - stats['mean']) / safe_std

In [7]:
#scale the train and test feature sets with the training-set statistics
normed_train_data, normed_test_data = (
    norm(features) for features in (train_dataset, test_dataset)
)

In [1]:
#view first row of normalized dataset
#uncomment to run
#normed_test_data.head(1)

---DEEP LEARNING MODEL FRAMEWORK---

In [10]:
#Deep Learning model: 4 fully connected (Dense) layers, dropout after the first two
#can adjust optimizer by commenting/uncommenting 'optimizer' variable
#loss func. is mean squared logarithmic error, which penalizes relative
#rather than absolute error size

def build_model(n_features=None, learning_rate=0.001, dropout_rate=0.5):
  """Build and compile the regression network.

  Args:
    n_features: number of input feature columns. Defaults to the column
      count of the notebook-level ``train_dataset``, which is only correct
      once the target column has been removed from it.
    learning_rate: optimizer learning rate (default 0.001, as before).
    dropout_rate: fraction of units randomly dropped after each of the
      first two Dense layers during training (default 0.5, as before).

  Returns:
    A compiled ``keras.Sequential`` model with a single linear output that
    reports 'mae' and 'mse' as metrics.
  """
  if n_features is None:
    n_features = len(train_dataset.keys())

  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[n_features]),
    layers.Dropout(dropout_rate), #random unit dropout to reduce overfitting training data
    layers.Dense(64, activation='relu'),
    layers.Dropout(dropout_rate), #random unit dropout to reduce overfitting training data
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(learning_rate)
  #optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

  model.compile(loss='mean_squared_logarithmic_error',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [11]:
#builds a fresh, compiled (untrained) model and assigns it to the variable 'model'
model = build_model()

---TESTING/FINE-TUNING USE CODE BELOW---

In [None]:
#train the model for a fixed number of epochs; 20% of the training rows
#are held out for validation and progress is shown via EpochDots

EPOCHS = 1000

history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[tfdocs.modeling.EpochDots()])

In [None]:
#per-epoch loss/metric table with an added 'epoch' column; display the last 5 epochs
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [3]:
#create the training-history plotter with a smoothing parameter (smoothing_std=2)
#NOTE: needs 'tfdocs' from the imports cell; run that cell first on a fresh kernel
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)

NameError: name 'tfdocs' is not defined

In [None]:
#Plots Mean Absolute Error per epoch for the fixed-epoch ('Basic') run
#y-axis is in units of the target variable, Estimated Savings
plotter.plot({'Basic': history}, metric = "mae")
plt.ylabel('MAE [Est. Savings]')

In [None]:
#Plots Mean Squared Error per epoch for the fixed-epoch ('Basic') run
#y-axis is in squared units of the target variable
plotter.plot({'Basic': history}, metric = "mse")
plt.ylabel('MSE [Est. Savings^2]')

In [None]:
#Test/train with an early stopping parameter to prevent overfitting

#start from a fresh, untrained model
model = build_model()

# patience: number of epochs without val_loss improvement to wait before stopping.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers; otherwise the model would keep the
# weights from `patience` epochs after its best point.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100,
                                           restore_best_weights=True)

early_history = model.fit(normed_train_data, train_labels,
                    epochs=EPOCHS, validation_split = 0.2, verbose=0,
                    callbacks=[early_stop, tfdocs.modeling.EpochDots()])

In [None]:
#plots Mean Absolute Error per epoch for the early stopping model
#y-axis is in units of the target variable, Estimated Savings
plotter.plot({'Early Stopping': early_history}, metric = "mae")
plt.ylabel('MAE [Est. Savings]')

In [None]:
#plots predictions versus truth values for the held-out test set
#(target is Estimated Savings; points on the diagonal are perfect predictions)
test_predictions = model.predict(normed_test_data).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [Est. Savings]')
plt.ylabel('Predictions [Est. Savings]')
#both axes share the same fixed range so the y = x reference line is the diagonal
lims = [0, 2000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [None]:
#histogram of test-set prediction errors (prediction minus actual value), 20 bins
#positive error = the model over-predicted
error = test_predictions - test_labels
plt.hist(error, bins = 20)
plt.xlabel("Prediction Error [Est. Savings]")
_ = plt.ylabel("Count")

In [None]:
#evaluates the model's current weights on the held-out test set and prints
#the mean absolute error, in units of the target (Estimated Savings)
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print("Testing set Mean Abs Error: {:5.2f} Est. Savings".format(mae))

---TO TRAIN AND SAVE A MODEL RUN BELOW---

In [12]:
# patience: number of epochs without val_loss improvement to wait before stopping.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers, so the model saved further down is the
# best one seen rather than the one from `patience` epochs later.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=200,
                                           restore_best_weights=True)

# NOTE: fit() continues from the current weights of `model`; rebuild it with
# build_model() first if a from-scratch training run is wanted.
model.fit(normed_train_data, train_labels, epochs=1000,
          validation_split = 0.2, verbose=0,
          callbacks=[early_stop])

<tensorflow.python.keras.callbacks.History at 0x13140cdefd0>

In [None]:
#plots predictions versus truth values for the held-out test set
#(points on the diagonal are perfect predictions)
test_predictions = model.predict(normed_test_data).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [Est. Savings]')
plt.ylabel('Predictions [Est. Savings]')
#both axes share the same fixed range so the y = x reference line is the diagonal
lims = [0, 2000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [None]:
#histogram of test-set prediction errors (prediction minus actual value), 20 bins
#positive error = the model over-predicted
error = test_predictions - test_labels
plt.hist(error, bins = 20)
plt.xlabel("Prediction Error [Est. Savings]")
_ = plt.ylabel("Count")

In [13]:
#check the model's loss, mean absolute error and mean squared error on the test set
#(evaluate returns the loss followed by the metrics the model was compiled with)
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print("Testing set Mean Abs Error: {:5.2f} ".format(mae))

33/33 - 0s - loss: 0.2049 - mae: 192.3001 - mse: 58961.9141
Testing set Mean Abs Error: 192.30 


In [14]:
#saves the trained TF model to an HDF5 file under the name noted below
#NOTE: per the Keras docs, model.save stores the whole model (architecture,
#weights and training config), not only the weights
model.save('predSaveDollar.h5')

In [15]:
#serialize the model architecture to a JSON string and write it to disk
#NOTE: per the Keras docs, to_json() describes the architecture only, not the weights
model_json = model.to_json()
with open("predSaveModel.json", "w") as json_file:
    json_file.write(model_json)