# Autoregression Model

In [3]:
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt

# Loading Data File

In [4]:
# Location of the pickled datasets and the building whose data is modelled.
path = "../Data/"
building = "avenal-animal-shelter"
interval = 5 # min


# Pickle streams are binary, so the files are opened with "rb"; text mode
# ('r') breaks on Python 3 and on platforms that translate line endings.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load dataset files that come from a trusted source.
with open(path + building + "_training_data.pkl", "rb") as f:
    training_data = pickle.load(f)

with open(path + building + "_test_data.pkl", "rb") as f:
    test_data = pickle.load(f)
    

# Features in Dataset

In [5]:
# Summarise the columns, non-null counts and dtypes of the training frame.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 53187 entries, 2018-04-18 23:16:42+00:00 to 2018-10-25 23:13:42+00:00
Data columns (total 9 columns):
action                                        53187 non-null float64
action_duration                               53187 non-null object
dt                                            53187 non-null float64
previous_action                               42307 non-null float64
t_next                                        53187 non-null float64
t_in                                          53187 non-null float64
t_out                                         53186 non-null float64
occ                                           53186 non-null float64
zone_temperatureHVAC_Zone_Shelter_Corridor    53186 non-null float64
dtypes: float64(8), object(1)
memory usage: 4.1+ MB


# Include previous indoor temperature as t_last

In [6]:
def add_last_temperature_feature(data):
    """Add a "t_last" column holding the indoor temperature of the preceding sample.

    A row only counts as having a predecessor when its timestamp equals the
    previous row's timestamp plus that previous row's "dt" (in minutes). The
    data does not need to be continuous, so every row is checked individually.
    The first row, and every row that follows a gap, has no valid predecessor
    and receives its own "t_in" as "t_last".

    :param data: pd.df with cols "t_in" and "dt" (minutes), sorted by time and
        indexed by timestamp. The frame is modified in place.
    :return: the same pd.df with the col "t_last" added.
    """
    last_temps = []

    prev_temp = None       # t_in of the previous row
    expected_time = None   # timestamp the current row must have to be contiguous
    for index, row in data.iterrows():
        # Decide continuity BEFORE choosing the value, so that a row right
        # after a gap never inherits the temperature from before the gap.
        if expected_time is not None and index == expected_time:
            last_temps.append(prev_temp)
        else:
            # No valid predecessor: use the row's own temperature so that the
            # difference t_in - t_last is zero.
            last_temps.append(row["t_in"])

        # Every row is a candidate predecessor for the next one, including
        # the first row after a gap.
        prev_temp = row["t_in"]
        expected_time = index + datetime.timedelta(minutes=row["dt"])

    data["t_last"] = np.array(last_temps)
    return data

# Add the "t_last" column to both datasets (the function returns the frame it was given).
training_data = add_last_temperature_feature(training_data)
test_data = add_last_temperature_feature(test_data)

# Preview the first rows, now including "t_last".
training_data.head()

Unnamed: 0_level_0,action,action_duration,dt,previous_action,t_next,t_in,t_out,occ,zone_temperatureHVAC_Zone_Shelter_Corridor,t_last
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-04-18 23:16:42+00:00,0.0,300.0s,5.0,,69.5,69.483333,66.4997,0.0,69.499167,69.483333
2018-04-18 23:21:42+00:00,0.0,600.0s,5.0,,69.5,69.5,66.4997,0.0,69.485833,69.483333
2018-04-18 23:26:42+00:00,0.0,900.0s,5.0,,69.5,69.5,66.520546,0.0,69.460833,69.5
2018-04-18 23:31:42+00:00,0.0,1200.0s,5.0,,69.4,69.5,66.583083,0.0,69.45,69.5
2018-04-18 23:36:42+00:00,0.0,1500.0s,5.0,,69.5,69.4,66.666466,0.0,69.450489,69.5


# Preprocessing Data

The action feature takes the values 0, 1, 2, 3, 4 and 5, which are transformed to the values:
0 - no action
1 - heating
2 - cooling

Select the rows in the datasets for which the interval time is constant (5 minutes).

In [7]:
def fix_data(data, interval):
    """Collapse raw action codes into heating/cooling flags and select fixed-interval rows.

    The "action" column is rewritten in place: 0 stays 0 (no action), 1 and 3
    become 1 (heating), 2 and 5 become 2 (cooling). Any other code has no
    mapping and ends up as a missing value, so both indicator columns are 0
    for it. NOTE(review): code 4 falls into this unmapped case; confirm that
    this is intended.

    Two indicator columns are then added to ``data``:
    "action_heating" (1 where action == 1) and "action_cooling"
    (1 where action == 2).

    :param data: pd.df with cols "action" and "dt". Modified in place.
    :param interval: float:minutes. Rows are kept in the second return value
        only when their "dt" equals this value.
    :return: tuple ``(data, filtered)`` where ``data`` is the modified input
        frame and ``filtered`` contains only the rows with dt == interval.
    """
    def collapse_action(code):
        # Several raw codes are folded onto one single-stage action.
        if code == 0:
            return 0
        if code in (2, 5):
            return 2
        if code in (1, 3):
            return 1
        # Unmapped codes (and NaN) yield a missing value.
        return None

    def is_heating(action):
        return 1 if action == 1 else 0

    def is_cooling(action):
        return 1 if action == 2 else 0

    data["action"] = data["action"].map(collapse_action)
    data['action_heating'] = data["action"].map(is_heating)
    data['action_cooling'] = data['action'].map(is_cooling)

    # Function-call form of print works on both Python 2 and Python 3.
    print(data.head())

    return data, data[data["dt"] == interval]

# Get preprocessed training and test data: the org_* names receive the first
# value returned by fix_data (all rows), while training_data / test_data
# receive the second (only rows whose dt equals `interval`).
org_training_data, training_data = fix_data(training_data, interval)
org_test_data, test_data = fix_data(test_data, interval)


#print(test_data.head())

                           action action_duration   dt  previous_action  \
time                                                                      
2018-04-18 23:16:42+00:00       0          300.0s  5.0              NaN   
2018-04-18 23:21:42+00:00       0          600.0s  5.0              NaN   
2018-04-18 23:26:42+00:00       0          900.0s  5.0              NaN   
2018-04-18 23:31:42+00:00       0         1200.0s  5.0              NaN   
2018-04-18 23:36:42+00:00       0         1500.0s  5.0              NaN   

                           t_next       t_in      t_out  occ  \
time                                                           
2018-04-18 23:16:42+00:00    69.5  69.483333  66.499700  0.0   
2018-04-18 23:21:42+00:00    69.5  69.500000  66.499700  0.0   
2018-04-18 23:26:42+00:00    69.5  69.500000  66.520546  0.0   
2018-04-18 23:31:42+00:00    69.4  69.500000  66.583083  0.0   
2018-04-18 23:36:42+00:00    69.5  69.400000  66.666466  0.0   

                         

# Modifying action feature

Action is represented as two separate features, action_heating and action_cooling, each of which holds the value 0 (off) or 1 (on).

# Dropping Features

In [8]:
def filter_data(data):
    """Return a copy of ``data`` without the bookkeeping columns.

    :param data: pd.df containing every column listed below.
    :return: new pd.df with those columns removed; ``data`` itself is untouched.
    """
    dropped_columns = [
        'dt',
        'action',
        'previous_action',
        'action_duration',
        'zone_temperatureHVAC_Zone_Shelter_Corridor',
    ]
    return data.drop(dropped_columns, axis=1)

# Remove the columns listed in filter_data from both datasets.
training_data = filter_data(training_data)
test_data = filter_data(test_data)

# Show the first rows of each remaining frame.
print(training_data.head())
print(test_data.head())

                           t_next       t_in      t_out  occ     t_last  \
time                                                                      
2018-04-18 23:16:42+00:00    69.5  69.483333  66.499700  0.0  69.483333   
2018-04-18 23:21:42+00:00    69.5  69.500000  66.499700  0.0  69.483333   
2018-04-18 23:26:42+00:00    69.5  69.500000  66.520546  0.0  69.500000   
2018-04-18 23:31:42+00:00    69.4  69.500000  66.583083  0.0  69.500000   
2018-04-18 23:36:42+00:00    69.5  69.400000  66.666466  0.0  69.500000   

                           action_heating  action_cooling  
time                                                       
2018-04-18 23:16:42+00:00               0               0  
2018-04-18 23:21:42+00:00               0               0  
2018-04-18 23:26:42+00:00               0               0  
2018-04-18 23:31:42+00:00               0               0  
2018-04-18 23:36:42+00:00               0               0  
                           t_next  t_in      t_out  oc

# Separating Target features 

In [9]:
def getTarget(data, target):
    """Return the column of ``data`` stored under the label ``target``.

    :param data: pd.df (or any mapping-like object indexable by ``target``).
    :param target: column label of the value to predict.
    :return: ``data[target]``.
    """
    return data[target]

# Training target: the "t_next" column of the training frame.
train_y = getTarget(training_data, 't_next')
train_y.head()

time
2018-04-18 23:16:42+00:00    69.5
2018-04-18 23:21:42+00:00    69.5
2018-04-18 23:26:42+00:00    69.5
2018-04-18 23:31:42+00:00    69.4
2018-04-18 23:36:42+00:00    69.5
Name: t_next, dtype: float64

In [10]:
def getFeatures(data, target):
    """Return ``data`` with the ``target`` column removed.

    :param data: pd.df containing the column ``target``.
    :param target: column label to leave out of the feature matrix.
    :return: new pd.df with every column except ``target``.
    """
    return data.drop(labels=[target], axis=1)

# Training features: every column of the training frame except "t_next".
train_X = getFeatures(training_data, 't_next')
train_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,t_last,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-04-18 23:16:42+00:00,69.483333,66.4997,0.0,69.483333,0,0
2018-04-18 23:21:42+00:00,69.5,66.4997,0.0,69.483333,0,0
2018-04-18 23:26:42+00:00,69.5,66.520546,0.0,69.5,0,0
2018-04-18 23:31:42+00:00,69.5,66.583083,0.0,69.5,0,0
2018-04-18 23:36:42+00:00,69.4,66.666466,0.0,69.5,0,0


In [11]:
# Test target: the "t_next" column of the test frame.
test_y = getTarget(test_data, 't_next')
test_y.head()

time
2018-03-20 00:00:00+00:00    72.8
2018-03-20 00:05:00+00:00    73.0
2018-03-20 00:10:00+00:00    73.0
2018-03-20 00:15:00+00:00    73.0
2018-03-20 00:20:00+00:00    73.0
Name: t_next, dtype: float64

In [12]:
# Test features: every column of the test frame except "t_next".
test_X = getFeatures(test_data, 't_next')
test_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,t_last,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-20 00:00:00+00:00,72.8,67.778613,0.0,72.8,0,0
2018-03-20 00:05:00+00:00,72.8,67.807436,0.0,72.8,0,0
2018-03-20 00:10:00+00:00,73.0,67.835998,0.0,72.8,0,0
2018-03-20 00:15:00+00:00,73.0,67.864038,0.0,73.0,0,0
2018-03-20 00:20:00+00:00,73.0,67.891817,0.0,73.0,0,0


# Linear Regression Model

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import time 

# Fit a linear regression on the training set and predict t_next for the test set.
linearModel = LinearRegression().fit(train_X, train_y)
test_pred1 = linearModel.predict(test_X)
# Function-call form of print works on both Python 2 and Python 3.
print(test_pred1)


[72.78808106 72.78814943 73.04479567 ... 72.33896613 72.339024
 72.46731324]


In [14]:
# Score the fitted linear model on the test set (for scikit-learn regressors, score() is R^2).
linearModel.score(test_X, test_y)

0.992678955494629

# Lasso Regression

In [15]:
from sklearn import linear_model

# Fit a Lasso regression with alpha=0.1 on the training set and report its
# score on the test set.
clf = linear_model.Lasso(alpha=0.1)
clf.fit(train_X, train_y)
clf.score(test_X, test_y)

0.9965047187196565

# AutoRegression that uses previous predicted value as current t_in

In [21]:
def autoRegression2(model, data_X):
    """Predict row by row, feeding each prediction back in as the next row's "t_in".

    The first row is predicted from its recorded features. For every later
    row the "t_in" value is replaced by the prediction made for the previous
    row; all other columns keep their recorded values.
    NOTE(review): any lagged-temperature column in ``data_X`` is therefore not
    updated alongside "t_in" -- confirm that this is intended.

    :param model: fitted estimator exposing ``predict`` on a 2-D input.
    :param data_X: pd.df of features with a "t_in" column. Not modified.
    :return: list with one prediction per row of ``data_X``.
    """
    predictions = []
    previous_prediction = None
    for _, row in data_X.iterrows():
        # iterrows() may yield a view on the frame's data, so work on a copy
        # to guarantee the caller's frame is never overwritten.
        features = row.copy()
        if previous_prediction is not None:
            features['t_in'] = previous_prediction
        previous_prediction = model.predict([features])[0]
        predictions.append(previous_prediction)
    return predictions
        

4.453983703435345

# AutoRegression using Lasso Regression model

In [None]:
# Run the feedback prediction with the Lasso model over the training set and
# compute the mean squared error against the true targets.
train_pred2 = autoRegression2(clf, train_X)
mean_squared_error(train_y, train_pred2) 

In [22]:
# Same evaluation with the Lasso model on the test set.
test_pred2 = autoRegression2(clf, test_X)
mean_squared_error(test_y, test_pred2)

7.922641626974842

# AutoRegression using Linear Regression Model (Error: Gives negative infinity values)

In [23]:
# Run the feedback prediction with the linear regression model over the training set.
# NOTE: the recorded run of this cell ended in a ValueError because the input
# to mean_squared_error contained NaN or infinite values.
train_pred2 = autoRegression2(linearModel, train_X)
mean_squared_error(train_y, train_pred2) 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Same evaluation with the linear regression model on the test set.
test_pred2 = autoRegression2(linearModel, test_X)
mean_squared_error(test_y, test_pred2)

# AutoRegression to predict at successive time intervals

At time t1, the temperature for the next 5-minute interval (t1+5) is predicted using the linear model. This predicted temperature is then set as the indoor temperature (t_in) to predict the temperature at the following 5-minute interval (t1+5+5). This process is repeated until the prediction reaches one hour ahead (t1+60).

The model is evaluated using root mean squared error. To obtain the true values at the successive times, the test targets (test_y) are shifted by the corresponding number of 5-minute intervals.

In [None]:
import pandas as pd

def autoRegression(linearModel, test_X, time, true_y=None, interval=5):
    """Forecast ``time`` minutes ahead by iterating one-step predictions, and return the RMSE.

    For every row of ``test_X`` the model is applied ``time // interval``
    times; after each step the prediction replaces the row's "t_in" value.
    The final prediction is compared with the true target shifted by the same
    number of steps, and both series are plotted.
    NOTE(review): only "t_in" is fed back between steps; other columns keep
    their recorded values -- confirm that this is intended.

    :param linearModel: fitted estimator exposing ``predict`` on a 2-D input.
    :param test_X: pd.df of features with a "t_in" column. Not modified.
    :param time: forecast horizon in minutes; must cover at least one interval.
    :param true_y: pd.Series of one-step-ahead targets aligned with
        ``test_X``. Defaults to the notebook-level ``test_y``.
    :param interval: length of one prediction step in minutes (default 5).
    :return: root mean squared error between the shifted targets and the
        multi-step predictions.
    :raises ValueError: if ``time`` is shorter than one interval.
    """
    # Floor division keeps the step count an integer on Python 2 and 3.
    n = int(time // interval)
    if n < 1:
        raise ValueError(
            "time must be at least one interval (%s minutes), got %s" % (interval, time))

    if true_y is None:
        # Backward-compatible fallback to the notebook-level target series.
        true_y = test_y

    next_pred5 = []
    for index, row in test_X.iterrows():
        # Work on a copy: iterrows() may yield a view on the frame's data.
        features = row.copy()
        for _ in range(n):
            t_next5 = linearModel.predict([features])
            features['t_in'] = t_next5[0]
        next_pred5.append(t_next5[0])

    test_pred = pd.DataFrame(next_pred5, index=test_X.index)

    # The target stored at a row is already one step ahead, so an n-step
    # forecast lines up with the target found n - 1 rows later.
    next_test_y = true_y.shift(1 - n, axis=0)
    nan_index = next_test_y.index[next_test_y.isnull()]
    next_test_y = next_test_y.dropna()
    test_pred = test_pred.drop(nan_index)

    # mean_squared_error returns the MSE; take the square root to get the RMSE.
    rmse = np.sqrt(mean_squared_error(next_test_y, test_pred))

    plt.figure(figsize=(15,4))
    plt.plot(next_test_y, 'g', linewidth=1)
    plt.plot(test_pred, 'r', linewidth=1)

    return rmse
    


In [None]:
# Evaluate the linear model with a horizon argument of 10 (minutes) on the test set.
autoRegression(linearModel, test_X, 10)

In [None]:
# Evaluate horizons from 5 to 60 minutes in 5-minute steps and collect the
# value returned by autoRegression for each one.
# NOTE(review): `time` rebinds the name of the `time` module imported earlier
# in this notebook.
time = []
rmse = []
for i in range(5,65,5):
    rmse_i = autoRegression(linearModel, test_X, i)
    rmse.append(rmse_i)
    time.append(i)
    
# One row per horizon, indexed by the horizon in minutes.
result = pd.DataFrame(rmse, index=time)
result
    

# Plot time vs rmse

In [None]:
# Plot the collected error values against the forecast horizon.
result.plot()