# Time Series Prediction


In [1]:
# @ Lukman copyright 
# MIT Licence

In [2]:
# for data frame analysis
import pandas as pd 

# for mathematical operations
import numpy as np 


# matplotlib library for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# For Normalizing data
from sklearn.preprocessing import StandardScaler

# For statistical test
import scipy.stats as stats

# Split data set into training and test set
from sklearn.model_selection import train_test_split as tts

# SVM (support vector machine) module
from sklearn import svm

# Kernel Functions used 
from sklearn.metrics.pairwise import rbf_kernel,laplacian_kernel

# module for chi square test
from scipy.stats import chisquare


# For dictionary 
from collections import defaultdict

# for use of tensorflow
import tensorflow as tf
tf.enable_eager_execution()

#from tensorflow.nn.rnn import *
from tensorflow.python.ops  import *

# for scaling arrays
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler


# for random sampling of validation set
import random

import yaml
import os
from utility import utils

%load_ext autoreload
%autoreload 2

In [3]:
import sys

In [4]:
# Enable eager execution.
# NOTE(review): the import cell above already makes this same call right after
# importing tensorflow; presumably it is repeated here so this cell can be run
# on its own — confirm, and keep only one of the two calls.
tf.enable_eager_execution()

In [5]:
tf.__version__

'1.15.0'

# Data Preprocessing 

In [6]:
################################################################################
# Load the YAML configuration that supplies every path and option used below.
# The file is opened relative to the working directory, one level up.
try:
    with open("../config.yml", 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)
except IOError:
    # Without the config, `cfg` is never bound and the following cells fail
    # with an unrelated-looking NameError, so report the cause and stop here.
    print('config file is required. Put config.yml in the parent directory of the working directory')
    raise
################################################################################

In [7]:
# Base directory: the parent of the current working directory, with a trailing
# path separator so that relative directories can simply be appended to it.
cwd = os.getcwd()
BASE_DIR = os.path.join(os.path.dirname(cwd), '')

# Shortcuts into the nested configuration sections read in this cell
cleaned_cfg = cfg['cleanedconfig']['cleanedDataV1']
tf_flow_cfg = cleaned_cfg['TensorflowDataFlowConfig']

# Directory of the cleaned data and the name of the cleaned data file
cleanedpath = BASE_DIR + cleaned_cfg['cleanedDir']
cleandataName = cleaned_cfg['cleanedName']

# File names of the training and validation data, and the validation split setting
trainName = tf_flow_cfg['tfcleanTrain']
validateName = tf_flow_cfg['tfcleanValidate']
validation_split = cleaned_cfg['validationSplit']

In [8]:
# Boolean read from the config. It selects, further down, between calling
# utils.tfdatabuilder (truthy) and reading the existing training CSV and
# passing it to utils.tfpreprocess (falsy).
tfreprocess = cfg['cleanedconfig']['cleanedDataV1']['TensorflowDataFlowConfig']['tfprocess']

In [9]:
tfreprocess

True

In [10]:
cleanedpath

'C:\\Users\\Fikret\\Documents\\GitHub\\Energy_Prediction_Bot\\Dataset/trainData/cleaned/'

In [11]:
cleandataName

'cleanedDataV1.csv'

In [12]:
trainName

'tftrains.csv'

In [13]:
validateName

'tfvals.csv'

In [14]:
validation_split

1000

In [15]:
# Obtain the per-column dtypes and the column names used by the CSV pipeline.
if not tfreprocess:
    # Read the training file from disk and derive dtypes/names from it.
    train = pd.read_csv(cleanedpath + trainName)
    data_types, col_names = utils.tfpreprocess(train)
else:
    # Run the dataset builder; only its last two return values are kept here.
    _, _, data_types, col_names = utils.tfdatabuilder(
        cleanedpath, cleandataName, trainName, validateName, validation_split
    )

['meter_reading', 'square_feet', 'air_temperature', 'dew_temperature', 'month', 'day', 'hour', 'primary_use_Education', 'primary_use_Entertainment/public assembly', 'primary_use_Food sales and service', 'primary_use_Healthcare', 'primary_use_Lodging/residential', 'primary_use_Manufacturing/industrial', 'primary_use_Office', 'primary_use_Other', 'primary_use_Parking', 'primary_use_Public services', 'primary_use_Religious worship', 'primary_use_Retail', 'primary_use_Technology/science', 'primary_use_Utility', 'site_id_0', 'site_id_2', 'site_id_6', 'site_id_7', 'site_id_9', 'site_id_10', 'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15']
[tf.float64, tf.int32, tf.float64, tf.float64, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32]
Done


#### Work with downloaded csv to preserve data type
Load the CSV file and pass the data types of the dataset into the
TensorFlow loading pipeline.

In [16]:
# Read the training CSV through the tf.data Dataset API. The per-column dtypes
# are supplied as the record defaults, and the first line is treated as a header.
csvData = tf.data.experimental.CsvDataset(
    filenames=cleanedpath + trainName,
    record_defaults=data_types,
    header=True,
)

In [17]:
# Categorical-feature settings from the TensorFlow data-flow section of the
# config: which categorical features to use, and the names of those features.
tfDataFlowCfg = cfg['cleanedconfig']['cleanedDataV1']['TensorflowDataFlowConfig']
catfeatures = tfDataFlowCfg['categoricalFeatureToUse']
catNames = tfDataFlowCfg['CategoricalFeature']

In [18]:
catfeatures

3

In [19]:
catNames

['month_day_hour', 'primary_use', 'site_id']

#### Parse data 

In [20]:
col_names

['meter_reading',
 'square_feet',
 'air_temperature',
 'dew_temperature',
 'month',
 'day',
 'hour',
 'primary_use_Education',
 'primary_use_Entertainment/public assembly',
 'primary_use_Food sales and service',
 'primary_use_Healthcare',
 'primary_use_Lodging/residential',
 'primary_use_Manufacturing/industrial',
 'primary_use_Office',
 'primary_use_Other',
 'primary_use_Parking',
 'primary_use_Public services',
 'primary_use_Religious worship',
 'primary_use_Retail',
 'primary_use_Technology/science',
 'primary_use_Utility',
 'site_id_0',
 'site_id_2',
 'site_id_6',
 'site_id_7',
 'site_id_9',
 'site_id_10',
 'site_id_11',
 'site_id_13',
 'site_id_14',
 'site_id_15']

In [21]:
catNames

['month_day_hour', 'primary_use', 'site_id']

In [22]:
# Names of the columns after parsing, once groups of CSV columns are merged:
#   - month, day and hour   -> one feature vector
#   - primary-use columns   -> one feature vector
#   - site-id columns       -> one feature vector (optional)
#
# The first four original column names (meter reading, square feet, air
# temperature, dew temperature) are kept as they are and are followed by the
# names of the merged features.
#
# Categorical feature selection: site id is catNames[2] and is only included
# when catfeatures equals 3.
n_cat_used = 3 if catfeatures == 3 else 2
col_names_ = [col_names[i] for i in range(4)] + [catNames[j] for j in range(n_cat_used)]

def _parse_csv_row(*vals):
    '''
    Turn one parsed CSV row into a (features, target) pair.

    vals holds the row's column values in file order. Column 0 is the target
    (meter reading). Columns 1-3 are kept as individual features, and the
    remaining columns are grouped into vector features:
      * columns 4-6   -> month / day / hour
      * columns 7-20  -> primary use
      * columns 21-30 -> site id (only when catfeatures == 3)

    Returns a dict mapping the names in col_names_[1:] to the feature values,
    and the target tensor named after col_names_[0].
    '''
    # Individual features: everything between the target and the grouped columns
    single_valued = vals[1:4]

    # Grouped features; these index ranges must stay consistent with the CSV layout
    date_parts = tf.convert_to_tensor(vals[4:7])
    usage_flags = tf.convert_to_tensor(vals[7:21])
    grouped = (date_parts, usage_flags)
    if catfeatures == 3:
        grouped = grouped + (tf.convert_to_tensor(vals[21:31]),)

    # Pair every feature name (the target name at index 0 is skipped) with its value
    feature_vals = single_valued + grouped
    features = {name: value for name, value in zip(col_names_[1:], feature_vals)}

    # The target is the first column of the row
    targets_tensor = tf.convert_to_tensor(vals[0], name=col_names_[0])

    return features, targets_tensor

In [23]:
col_names_

['meter_reading',
 'square_feet',
 'air_temperature',
 'dew_temperature',
 'month_day_hour',
 'primary_use',
 'site_id']

##### Batch the dataset

In [24]:
# Parsed training rows in batches of 64 (no shuffling or repeating);
# this is needed for testing
parsed_rows = csvData.map(_parse_csv_row)
dataset = parsed_rows.batch(64)

In [25]:
dataset

<DatasetV1Adapter shapes: ({square_feet: (?,), air_temperature: (?,), dew_temperature: (?,), month_day_hour: (?, 3), primary_use: (?, 14), site_id: (?, 10)}, (?,)), types: ({square_feet: tf.int32, air_temperature: tf.float64, dew_temperature: tf.float64, month_day_hour: tf.int32, primary_use: tf.int32, site_id: tf.int32}, tf.float64)>

##### Feature Categorization

In [27]:
# Numeric feature columns for the merged vector features. Each shape equals the
# number of CSV columns the row parser folds into that feature.
month_day = tf.feature_column.numeric_column(key=catNames[0], shape=(3,))
primary_use = tf.feature_column.numeric_column(key=catNames[1], shape=(14,))
site_id = tf.feature_column.numeric_column(key=catNames[2], shape=(10,))

In [33]:
# Show the definition of each merged feature column
for feature_column in (primary_use, site_id, month_day):
    print(feature_column)

NumericColumn(key='primary_use', shape=(14,), default_value=None, dtype=tf.float32, normalizer_fn=None)
NumericColumn(key='site_id', shape=(10,), default_value=None, dtype=tf.float32, normalizer_fn=None)
NumericColumn(key='month_day_hour', shape=(3,), default_value=None, dtype=tf.float32, normalizer_fn=None)


In [28]:
# One default numeric feature column for each of the three names that follow
# the target in col_names_ (indices 1 to 3).
numeric_columns = list(map(tf.feature_column.numeric_column, col_names_[1:4]))

In [32]:
numeric_columns

[NumericColumn(key='square_feet', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='air_temperature', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='dew_temperature', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [29]:
#columns = numeric_columns + [month_day,primary_use,site_id]

In [30]:
#feature_layer = tf.keras.layers.DenseFeatures(columns)

In [31]:
# function to serve the data set
def load_data(*filename, fn=_parse_csv_row, training=True, batchsize=64, shuffle_buffer=1000):
    '''
    Build the batched tf.data input pipeline for one or more CSV files.

    Args:
        *filename: path(s) of the CSV file(s) to read; the first line of each
            file is treated as a header.
        fn: function mapped over every parsed row (defaults to _parse_csv_row).
        training: when true, the rows are shuffled and the dataset repeats
            indefinitely; otherwise the rows are served once, in file order.
        batchsize: number of rows per batch (default 64, the previous
            hard-coded value).
        shuffle_buffer: size of the shuffle buffer used when training
            (default 1000, the previous hard-coded value).

    Returns:
        The batched dataset produced by mapping fn over the CSV rows.

    Raises:
        ValueError: if no file name is given.
    '''
    if not filename:
        # Fail early with a clear message instead of building a pipeline over no files.
        raise ValueError('load_data needs at least one CSV file name')

    # Column dtypes come from the module-level data_types list
    csv_rows = tf.data.experimental.CsvDataset(filename, data_types, header=True)
    dataset = csv_rows.map(fn)
    if training:
        dataset = dataset.shuffle(shuffle_buffer).repeat()
    return dataset.batch(batchsize)
    
    

# Build Model

In [34]:
from tensorflow.keras import backend as K

def root_mean_squared_error(labels, predictions):
    '''
    Root mean squared logarithmic error, for use with add_metrics.

    this is the loss function specified in kaggle competition:
        sqrt(mean((log(|prediction| + 1) - log(label + 1)) ** 2))

    The parameter names and the dict return value follow the metric-function
    contract of tf.contrib.estimator.add_metrics, which is how this function
    is used in this notebook.

    Args:
        labels: tensor of true meter readings.
        predictions: the estimator's predictions, either a tensor or a dict
            holding the values under the 'predictions' key.

    Returns:
        A dict with the single key 'rmsle' mapped to the (value, update_op)
        pair of a streaming root-mean-squared-error metric computed on the
        log-transformed values.
    '''
    # The estimator's prediction output is keyed by 'predictions' (the same
    # key that is read from the model.predict results further down).
    if isinstance(predictions, dict):
        predictions = predictions['predictions']

    # Bring both sides to one dtype and a flat shape so they can be compared
    y_true = tf.reshape(tf.cast(labels, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(predictions, tf.float32), [-1])

    log_true = K.log(y_true + 1)
    # abs() keeps the logarithm defined when the model outputs a negative value
    log_pred = K.log(K.abs(y_pred) + 1)

    return {'rmsle': tf.metrics.root_mean_squared_error(labels=log_true, predictions=log_pred)}



def root_mean_squared_error2(labels, predictions):
    '''
    Plain root mean squared error: sqrt(mean((prediction - label) ** 2)).

    The value is in the units of the label and is not limited to the range
    0 to 1, so "1 - rmse" is not an accuracy score.

    The parameter names and the dict return value follow the metric-function
    contract of tf.contrib.estimator.add_metrics.

    Args:
        labels: tensor of true meter readings.
        predictions: the estimator's predictions, either a tensor or a dict
            holding the values under the 'predictions' key.

    Returns:
        A dict with the single key 'rmse' mapped to the (value, update_op)
        pair of a streaming root-mean-squared-error metric.
    '''
    if isinstance(predictions, dict):
        predictions = predictions['predictions']

    # Bring both sides to one dtype and a flat shape so they can be compared
    y_true = tf.reshape(tf.cast(labels, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(predictions, tf.float32), [-1])

    return {'rmse': tf.metrics.root_mean_squared_error(labels=y_true, predictions=y_pred)}

In [None]:
# wide and deep model
# https://arxiv.org/pdf/1606.07792.pdf

# wide (linear) part: the three merged vector features
wide_columns = [month_day, primary_use, site_id]
wide_optimizer = tf.train.FtrlOptimizer(1e-6, l2_regularization_strength=1e-8)

# deep (DNN) part: primary use, site id and the three scalar numeric features
deep_columns = [
    primary_use,
    site_id,
    numeric_columns[0],
    numeric_columns[1],
    numeric_columns[2],
]
deep_optimizer = tf.train.ProximalAdagradOptimizer(1e-6, l2_regularization_strength=1e-10)

model = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=wide_columns,
    linear_optimizer=wide_optimizer,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[600, 300, 100],
    dnn_dropout=0.3,
    dnn_optimizer=deep_optimizer,
)


In [None]:
# Add the metric to use in the computation: `model` is rebound to the estimator
# returned by add_metrics for root_mean_squared_error.
# NOTE(review): no evaluate call is visible in this notebook, so the added
# metric is presumably never computed here — confirm.
model = tf.contrib.estimator.add_metrics(model, root_mean_squared_error)

In [None]:
# run training

def _train_input_fn():
    '''Input function handed to the estimator: load_data over the training CSV.'''
    return load_data(cleanedpath + trainName)

# three consecutive calls of 100 steps each
for i in range(3):
    model.train(input_fn=_train_input_fn, steps=100)

## Testing
Please skip this testing section.

In [None]:
def _parse_csv_row2(*vals):
    '''
    Turn one parsed CSV row into a features dict for prediction.

    Uses the same column layout as _parse_csv_row but returns only the
    features (no target):
      * columns 1-3   -> individual features
      * columns 4-6   -> month / day / hour
      * columns 7-20  -> primary use
      * columns 21-30 -> site id

    Returns a dict mapping the names in col_names_[1:] to the feature values.
    If col_names_ does not include the site-id name, zip() stops at the
    shorter sequence and the site-id vector is left out.
    '''

    # month, day and hour as a single feature
    month_day = tf.convert_to_tensor(vals[4:7])
    # primary use as single feature
    primary_use = tf.convert_to_tensor(vals[7:21])
    # site id as single feature: ten columns (21..30), matching the slice used
    # by _parse_csv_row and the shape (10,) declared for the site_id feature
    # column; the previous slice 21:30 dropped the last column
    site_id = tf.convert_to_tensor(vals[21:31])

    # merge the features together, note meter reading is the first column
    # so it is excluded -- meaning index starts from 1
    feature_vals = vals[1:4] + (month_day, primary_use, site_id)

    # pair every feature name with its value
    features = dict(zip(col_names_[1:], feature_vals))

    return features

In [None]:
# function to serve the data set
def load_data2(*filename, fn=_parse_csv_row2):
    '''
    Build the prediction input pipeline for one or more CSV files.

    The files are read without a header line, every row is passed through fn,
    and the rows are served in batches of 64 without shuffling or repeating.
    '''
    rows = tf.data.experimental.CsvDataset(filename, data_types, header=False)
    return rows.map(fn).batch(64)
    
    

In [None]:
    #model.train(input_fn=lambda : load_data('meter1.csv'), steps=100)

In [None]:
# Request predictions for the rows of the test CSV; the result is consumed
# (turned into a list) in the next cell.
# NOTE(review): 'meter1Test.csv' is a bare file name, so it is resolved against
# the current working directory rather than the configured data paths — confirm.
predictions = model.predict(input_fn=lambda : load_data2('meter1Test.csv'))

In [None]:
# Materialise all predictions into a list (the trailing ';' hides the cell output)
results   = list(predictions);
#tf.logging.info(results)
#results

In [None]:
# get predicted values from list: keep only the 'predictions' entry of each result
pred = [result['predictions'] for result in results]



In [None]:
# get true values: the first column (label 0) of the header-less test CSV,
# taken as an array via .values
true_values=pd.read_csv('meter1Test.csv',header=None)[0].values

In [None]:
# Plot the predicted meter readings in the order they were produced
plt.plot(pred, "-g", label='Predictions')
plt.legend(loc="upper left")
plt.title('Model Prediction ')
plt.ylabel('meter readings')
# the trailing ';' keeps the text repr of the last call out of the cell output
plt.xlabel('Iterations');

In [None]:
# Plot the true meter readings. The colour is given once, through the keyword:
# the previous format string "-r" also asked for red, which conflicted with
# color='orange'.
plt.plot(true_values, "-", label='True Values', color='orange')
plt.legend(loc="upper left")
plt.title('True Values ')
plt.ylabel('meter readings')
# the trailing ';' keeps the text repr of the last call out of the cell output
plt.xlabel('Iterations');

#### Plot per Building

In [None]:
from utility import utils
%load_ext autoreload
%autoreload 2

In [None]:
# One row per prediction: building id, timestamp and the predicted meter reading.
# zip() stops at the shortest of its three inputs.
# NOTE(review): test_building_id and test_timestamp are not defined anywhere in
# the visible notebook — this cell fails on a fresh kernel unless they are created first.
test_predicted = pd.DataFrame(list(zip(test_building_id,test_timestamp,pred)),columns= ['building_id','timestamp', 'meter_reading' ,] )

In [None]:
# Same layout as the predictions frame, but holding the true meter readings.
# NOTE(review): test_building_id and test_timestamp are not defined anywhere in
# the visible notebook — this cell fails on a fresh kernel unless they are created first.
true_readings = pd.DataFrame(list(zip(test_building_id,test_timestamp,list(true_values))),columns= ['building_id','timestamp', 'meter_reading' ,] )

In [None]:
# Plot true against predicted readings with the project helper.
# NOTE(review): n_plots=5 presumably limits the number of buildings plotted —
# confirm against utils.plot_meter.
utils.plot_meter(true_readings,test_predicted,n_plots=5)

# save model

In [None]:
# Export directory for the saved model, read from the config
checkpoint_path = cfg['savedModel']['myname']['Dir3']

In [None]:
# Take the first batch of the dataset and keep its features dict (element 0 of
# the (features, targets) pair) as a sample of the model inputs.
first_batch = list(dataset.take(1))[0]
featuresample = first_batch[0]

In [None]:
# Serving input function built from the sample features dict.
# NOTE(review): the name is presumably a typo for "input_receive_fn"; it is
# also referenced by the export cell, so rename both together.
inut_receive_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(featuresample)

In [None]:
# Export the trained estimator under checkpoint_path, using the serving input
# function built above.
model.export_saved_model(export_dir_base=checkpoint_path ,serving_input_receiver_fn=inut_receive_fn)