In [4]:
import datetime as date
import sys
import io
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

### Data Processing

In [5]:
#Load unprocessed data
filename= 'GatesData8-26-29Simple'
df = pd.read_csv(filename+'.csv')

In [6]:
# original data
df.head()

Unnamed: 0,Date/Time,Entry,CHW Flowrate,CHWRT,CHWST,Chilled Water Request,OA Humidity,OA Temperature,Apparent Power,3rd Floor AHU-1|SA Airflow,...,3rd Floor AHU-2|RA Humidity,3rd Floor AHU-3|RA Humidity,3rd Floor AHU-7|RA Humidity,3rd Floor AHU-8|RA Humidity,Level 2 AHU-4|RA Humidity,Level 2 AHU-5|RA Humidity,Roof AHU-10|RA Humidity,Roof AHU-11|RA Humidity,Roof AHU-12|RA Humidity,Roof AHU-9|RA Humidity
0,8/28/19 0:00,0,79.969933,53.282894,46.400002,1,90.519997,72.292,413680,1532.060059,...,45.63213,46.360001,48.876602,53.635258,56.80917,76.444534,55.450722,53.867073,53.043949,54.188179
1,8/28/19 0:01,1,144.692215,53.382935,46.400002,1,90.50808,72.292,404159,1561.681641,...,45.56287,46.351658,48.985153,53.633373,57.092518,76.419258,55.568371,53.880249,52.59951,54.193092
2,8/28/19 0:02,2,11.871678,53.400002,46.234203,1,90.490074,72.310303,408172,1597.395996,...,45.458839,46.333656,48.100964,53.639378,58.625011,76.40126,55.188408,53.900249,51.953472,54.217091
3,8/28/19 0:03,3,75.676659,53.482761,46.282761,1,90.472076,72.331894,412268,1633.110474,...,45.354813,46.315655,47.216778,53.645378,60.157501,76.383255,54.808441,53.920254,51.30743,54.241093
4,8/28/19 0:04,4,40.211014,53.5,46.217072,1,90.454071,72.353485,416364,1668.824951,...,45.250786,46.297657,46.332588,53.651382,61.689991,76.36525,54.428478,53.940254,50.661392,54.265091


In [7]:
#fix date/time column
for timestamp in df.iloc[:,0]:
    date = timestamp[:8]
    time = timestamp[8:]

df = df.drop([df.columns[0]], axis='columns')
df.insert(0,'Date',date)
df.insert(1,'Time', time)

In [8]:
df.shape

(5760, 32)

In [9]:
# drop the columns that contain all zeros
df = df[(df.T!=0).any()]
df = df.dropna(axis='columns')
df.values[:,2:].astype(float) #convert all measured values to floats

array([[0.00000000e+00, 7.99699326e+01, 5.32828941e+01, ...,
        5.38670731e+01, 5.30439491e+01, 5.41881790e+01],
       [1.00000000e+00, 1.44692215e+02, 5.33829346e+01, ...,
        5.38802490e+01, 5.25995102e+01, 5.41930924e+01],
       [2.00000000e+00, 1.18716783e+01, 5.34000015e+01, ...,
        5.39002495e+01, 5.19534721e+01, 5.42170906e+01],
       ...,
       [1.43800000e+03, 8.00000000e+00, 5.33170395e+01, ...,
        5.38490753e+01, 5.37818871e+01, 5.42421608e+01],
       [1.43900000e+03, 7.25741272e+01, 5.32170372e+01, ...,
        5.38550758e+01, 5.35359077e+01, 5.42241669e+01],
       [1.44000000e+03, 7.64735107e+01, 5.32000008e+01, ...,
        5.38610763e+01, 5.32899284e+01, 5.42061729e+01]])

In [10]:
#create new feature for runtime length
request_status = df['Chilled Water Request']
run_time = np.zeros(request_status.shape[0])
t=0

for i in range(request_status.shape[0]):
    status = request_status[i]
    if status == 0:
        t=0
    elif status==1:
        t+=status
    run_time[i] = t
    
df['Run Time'] = run_time
df.head()

Unnamed: 0,Date,Time,Entry,CHW Flowrate,CHWRT,CHWST,Chilled Water Request,OA Humidity,OA Temperature,Apparent Power,...,3rd Floor AHU-3|RA Humidity,3rd Floor AHU-7|RA Humidity,3rd Floor AHU-8|RA Humidity,Level 2 AHU-4|RA Humidity,Level 2 AHU-5|RA Humidity,Roof AHU-10|RA Humidity,Roof AHU-11|RA Humidity,Roof AHU-12|RA Humidity,Roof AHU-9|RA Humidity,Run Time
0,8/27/19,23:59,0,79.969933,53.282894,46.400002,1,90.519997,72.292,413680,...,46.360001,48.876602,53.635258,56.80917,76.444534,55.450722,53.867073,53.043949,54.188179,1.0
1,8/27/19,23:59,1,144.692215,53.382935,46.400002,1,90.50808,72.292,404159,...,46.351658,48.985153,53.633373,57.092518,76.419258,55.568371,53.880249,52.59951,54.193092,2.0
2,8/27/19,23:59,2,11.871678,53.400002,46.234203,1,90.490074,72.310303,408172,...,46.333656,48.100964,53.639378,58.625011,76.40126,55.188408,53.900249,51.953472,54.217091,3.0
3,8/27/19,23:59,3,75.676659,53.482761,46.282761,1,90.472076,72.331894,412268,...,46.315655,47.216778,53.645378,60.157501,76.383255,54.808441,53.920254,51.30743,54.241093,4.0
4,8/27/19,23:59,4,40.211014,53.5,46.217072,1,90.454071,72.353485,416364,...,46.297657,46.332588,53.651382,61.689991,76.36525,54.428478,53.940254,50.661392,54.265091,5.0


In [11]:
#saved processed data file
df.to_csv('Processed_'+filename+'.csv')

In [12]:
df.shape

(5760, 33)

### Define functions used in predictive models

In [13]:
#split into training and test
def prepare(Xraw, yraw, varexplained=None):
    X_train, X_test, y_train, y_test = train_test_split(Xraw, yraw, test_size=0.3, random_state=10)
    #normalize data for gradient descent
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, scaler
    
#PCA to reduce number of features - not used
def pca_func(varexplained, Xtrain, ytrain, Xtest, ytest):
    pca = PCA(n_components=varexplained, random_state=10)
    pca.fit(Xtrain)
    Xtrain2 = pca.transform(Xtrain)
    Xtest2 = pca.transform(Xtest)
    return Xtrain2, Xtest2, pca

# Move new target variable to last column
def target(dataframe, new_label, remove1=None, remove2=None):
    dfnew = dataframe[[c for c in df if c not in [new_label,remove1,remove2]] + [new_label]]
    X_raw = dfnew[dfnew.columns[2:-1]].values
    y_raw = dfnew[dfnew.columns[-1]].values
    return dfnew, X_raw, y_raw

#save model and load model functions
def savemodel(filename, model):
    # save the model to disk
    filename = filename + '.sav'
    pickle.dump(model, open(filename, 'wb'))

def loadmodel(filename):
    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    return loaded_model

### Chilled Water Return Temperature Prediction

In [14]:
#process data and save scale model
df1, X_raw1, y_raw1 = target(df, 'CHWRT','Apparent Power')
X_train1, X_test1, y_train1, y_test1, CHWRT_scaler = prepare(X_raw1, y_raw1)
savemodel('CHWRT_scaler', CHWRT_scaler)

In [15]:
#perform SVR on CHWRT
CHWRT_rbf = SVR(kernel='rbf', C=1500, gamma='auto', epsilon=0.05, tol=1e-7)

CHWRT_rbf.fit(X_train1, y_train1)
CHWRT_rbf.predict(X_train1)
y_predict1 = CHWRT_rbf.predict(X_test1)

CHWRT_rbf.score(X_train1,y_train1), CHWRT_rbf.score(X_test1,y_test1), mean_squared_error(y_test1/y_test1, y_predict1/y_test1)*100

(0.9974565625800099, 0.9823040710154923, 0.0013792618714129328)

In [16]:
#save SVR model for later use
savemodel('CHWRT_prediction', CHWRT_rbf)

### Apparent Power Prediction

In [17]:
#process data and save scale model
df2, X_raw2, y_raw2 = target(df, 'Apparent Power')
X_train2, X_test2, y_train2, y_test2, power_scaler = prepare(X_raw2, y_raw2)
savemodel('Power_scaler', power_scaler)

In [18]:
#perform SVR on Apparent Power
power_rbf = SVR(kernel='rbf', C=5000, gamma='auto', epsilon=0.1, tol=1e-7)

power_rbf.fit(X_train2, y_train2)
power_rbf.predict(X_train2)
power_rbf.predict(X_test2)
y_predict2 = power_rbf.predict(X_test2)


power_rbf.score(X_train2,y_train2), power_rbf.score(X_test2,y_test2), mean_squared_error(y_test2/y_test2, y_predict2/y_test2)

(0.9754681344551837, 0.9735308759968906, 0.0007776544759563472)

In [19]:
#save SVR model for later use
savemodel('Power_prediction', power_rbf)

### Chilled Water Flow Rate Prediction

In [20]:
#process data and save scale model
df3, X_raw3, y_raw3 = target(df,'CHW Flowrate','Apparent Power')
X_train3, X_test3, y_train3, y_test3,CHWF_scaler = prepare(X_raw3, y_raw3)
savemodel('CHWF_scaler', CHWF_scaler)


In [21]:
#perform SVR on Chilled Water Flowrate
CHWF_rbf = SVR(kernel='rbf', C=140, gamma=0.6, epsilon=0.05, tol=1e-7)

CHWF_rbf.fit(X_train3, y_train3)
CHWF_rbf.predict(X_train3)
y_predict3 = CHWF_rbf.predict(X_test3)

CHWF_rbf.score(X_train3,y_train3), CHWF_rbf.score(X_test3,y_test3)

(0.9346222621309754, 0.7224781134125784)

In [22]:
#save svr model for later use
savemodel('flow_prediction', CHWF_rbf)