In [1]:
import datetime
import pickle
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
class ProcessingData:
    """Holds a pickled dataset and applies the feature-preparation steps to it.

    The unpickled object is stored in ``self.data``. Every method other than
    ``__init__`` treats it as a pandas DataFrame and either adds columns to it
    or rebinds ``self.data`` to a reduced frame.
    """

    def __init__(self, filepath):
        """Load the pickle stored at ``filepath`` into ``self.data``.

        :param filepath: path of the pickle file to read.
        """
        # NOTE(security): unpickling can execute arbitrary code -- only open trusted files.
        with open(filepath, 'rb') as f:
            # 'latin1' decoding is what lets 8-bit strings inside the pickle load;
            # presumably the file was written by Python 2 -- TODO confirm.
            self.data = pickle.load(f, encoding='latin1')

    def fix_data(self, interval):
        """Collapse the action codes and add heating/cooling indicator columns.

        ``action`` is rewritten in place: 0 stays 0, 1 and 3 become 1, 2 and 5
        become 2 (presumably 3/5 are the two-stage variants -- verify against the
        data producer). Any other code gets no value (the mapper returns ``None``).
        ``action_heating`` is 1 where the collapsed action is 1, else 0;
        ``action_cooling`` is 1 where the collapsed action is 2, else 0.

        NOTE(review): the rows with ``dt == interval`` are only *returned*;
        ``self.data`` itself is not filtered, so later steps still see every row.

        :param interval: value compared against the ``dt`` column (float: minutes).
        :return: tuple ``(self.data, rows of self.data where dt == interval)``.
        """
        def collapse_action(code):
            # Falls through to an implicit None for any code not listed here.
            if code == 0:
                return 0
            if code in (2, 5):
                return 2
            if code in (1, 3):
                return 1

        def is_heating(code):
            return 1 if code == 1 else 0

        def is_cooling(code):
            return 1 if code == 2 else 0

        self.data["action"] = self.data["action"].map(collapse_action)
        self.data['action_heating'] = self.data["action"].map(is_heating)
        self.data['action_cooling'] = self.data['action'].map(is_cooling)

        return self.data, self.data[self.data["dt"] == interval]

    def filter_data(self):
        """Drop the columns that are not used as model inputs or target.

        :return: ``self.data`` after the columns were removed.
        """
        unused_columns = [
            'dt',
            'action',
            'previous_action',
            'action_duration',
            'zone_temperatureHVAC_Zone_Shelter_Corridor',
        ]
        self.data = self.data.drop(unused_columns, axis=1)
        return self.data

    def drop_nan(self):
        """Remove every row that contains at least one NaN.

        :return: ``self.data`` after the rows were removed.
        """
        self.data = self.data.dropna()
        return self.data

    def secondOrder(self):
        """Append a one-row-shifted copy ``<col>(t-1)`` of every existing column.

        The shift is positional (previous row), not time-aware. The first row has
        no predecessor, so it is removed by the trailing ``dropna`` together with
        any other row containing NaN.

        :return: ``self.data`` including the lagged columns.
        """
        # Snapshot the labels first: the loop adds columns to the frame it walks over.
        for col in list(self.data.columns):
            self.data[col + '(t-1)'] = self.data[col].shift(1)
        self.data = self.data.dropna()
        return self.data


In [3]:
# Training split: load the pickle, derive the heating/cooling indicator columns,
# drop the columns that are not model inputs, then append one-row-lagged copies.
training = ProcessingData("../../Data/avenal-animal-shelter_training_data.pkl")
training.fix_data(5)    # returned (frame, dt == 5 subset) tuple is not needed here
training.filter_data()
training.secondOrder()
training_data = training.drop_nan()
training_data.head()

Unnamed: 0_level_0,t_next,t_in,t_out,occ,action_heating,action_cooling,t_next(t-1),t_in(t-1),t_out(t-1),occ(t-1),action_heating(t-1),action_cooling(t-1)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-04-18 23:21:42+00:00,69.5,69.5,66.4997,0.0,0,0,69.5,69.483333,66.4997,0.0,0.0,0.0
2018-04-18 23:26:42+00:00,69.5,69.5,66.520546,0.0,0,0,69.5,69.5,66.4997,0.0,0.0,0.0
2018-04-18 23:31:42+00:00,69.4,69.5,66.583083,0.0,0,0,69.5,69.5,66.520546,0.0,0.0,0.0
2018-04-18 23:36:42+00:00,69.5,69.4,66.666466,0.0,0,0,69.4,69.5,66.583083,0.0,0.0,0.0
2018-04-18 23:41:42+00:00,69.4,69.5,66.729007,0.0,0,0,69.5,69.4,66.666466,0.0,0.0,0.0


In [4]:
# Test split: same preparation steps as for the training split, applied to the
# held-out pickle.
testing = ProcessingData("../../Data/avenal-animal-shelter_test_data.pkl")
testing.fix_data(5)    # returned (frame, dt == 5 subset) tuple is not needed here
testing.filter_data()
testing.secondOrder()
testing_data = testing.drop_nan()
testing_data.head()

Unnamed: 0_level_0,t_next,t_in,t_out,occ,action_heating,action_cooling,t_next(t-1),t_in(t-1),t_out(t-1),occ(t-1),action_heating(t-1),action_cooling(t-1)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-03-20 00:05:00+00:00,73.0,72.8,67.807436,0.0,0,0,72.8,72.8,67.778613,0.0,0.0,0.0
2018-03-20 00:10:00+00:00,73.0,73.0,67.835998,0.0,0,0,73.0,72.8,67.807436,0.0,0.0,0.0
2018-03-20 00:15:00+00:00,73.0,73.0,67.864038,0.0,0,0,73.0,73.0,67.835998,0.0,0.0,0.0
2018-03-20 00:20:00+00:00,73.0,73.0,67.891817,0.0,0,0,73.0,73.0,67.864038,0.0,0.0,0.0
2018-03-20 00:25:00+00:00,73.0,73.0,67.865168,0.0,0,0,73.0,73.0,67.891817,0.0,0.0,0.0


In [5]:
# Target is the `t_next` column; the inputs are every other column except the
# target and its lagged copy.
train_y = training_data.loc[:, 't_next']
train_X = training_data.drop(labels=['t_next', 't_next(t-1)'], axis=1)
train_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,action_heating,action_cooling,t_in(t-1),t_out(t-1),occ(t-1),action_heating(t-1),action_cooling(t-1)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-04-18 23:21:42+00:00,69.5,66.4997,0.0,0,0,69.483333,66.4997,0.0,0.0,0.0
2018-04-18 23:26:42+00:00,69.5,66.520546,0.0,0,0,69.5,66.4997,0.0,0.0,0.0
2018-04-18 23:31:42+00:00,69.5,66.583083,0.0,0,0,69.5,66.520546,0.0,0.0,0.0
2018-04-18 23:36:42+00:00,69.4,66.666466,0.0,0,0,69.5,66.583083,0.0,0.0,0.0
2018-04-18 23:41:42+00:00,69.5,66.729007,0.0,0,0,69.4,66.666466,0.0,0.0,0.0


In [6]:
# Same target/input split as for the training frame, applied to the test frame.
test_y = testing_data.loc[:, 't_next']
test_X = testing_data.drop(labels=['t_next', 't_next(t-1)'], axis=1)
test_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,action_heating,action_cooling,t_in(t-1),t_out(t-1),occ(t-1),action_heating(t-1),action_cooling(t-1)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-03-20 00:05:00+00:00,72.8,67.807436,0.0,0,0,72.8,67.778613,0.0,0.0,0.0
2018-03-20 00:10:00+00:00,73.0,67.835998,0.0,0,0,72.8,67.807436,0.0,0.0,0.0
2018-03-20 00:15:00+00:00,73.0,67.864038,0.0,0,0,73.0,67.835998,0.0,0.0,0.0
2018-03-20 00:20:00+00:00,73.0,67.891817,0.0,0,0,73.0,67.864038,0.0,0.0,0.0
2018-03-20 00:25:00+00:00,73.0,67.865168,0.0,0,0,73.0,67.891817,0.0,0.0,0.0


In [7]:
# Baseline: ordinary least squares fitted on the raw (unscaled) input columns,
# then used to predict the test split. LinearRegression comes from the
# notebook's top import cell.
linearModel = LinearRegression()
linearModel.fit(train_X, train_y)
test_pred = linearModel.predict(test_X)
test_pred

array([ 72.78657308,  72.98699123,  72.98487956, ...,  72.3615595 ,
        72.46150607,  72.56056317])

In [8]:
# Root-mean-squared error of the current `test_pred` against the test targets.
rmse = sqrt(mean_squared_error(y_true=test_y, y_pred=test_pred))
rmse

0.8669073691700774

In [9]:
# Standardize the inputs. The scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits. StandardScaler
# comes from the notebook's top import cell.
scaler = StandardScaler().fit(train_X)
train_scaled = scaler.transform(train_X)
test_scaled = scaler.transform(test_X)
test_scaled

array([[-0.07795843, -0.81534504, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291],
       [-0.01783629, -0.81295229, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291],
       [-0.01783629, -0.81060331, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291],
       ..., 
       [-0.22826377,  0.85892151, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291],
       [-0.1982027 ,  0.85892151, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291],
       [-0.16814163,  0.85892151, -0.32733464, ..., -0.32733464,
        -0.09936665, -0.39623291]])

In [10]:
# Project the standardized inputs with PCA. A fractional n_components asks
# scikit-learn to keep enough components to explain that share of the variance.
# The projection is fitted on the training split only and reused for the test
# split. PCA comes from the notebook's top import cell.
pca = PCA(n_components=0.95)
pca.fit(train_scaled)
train_pca = pca.transform(train_scaled)
test_pca = pca.transform(test_scaled)

print("Training Data after PCA: ", train_pca)

Training Data after PCA:  [[-2.09289144  0.19925652 -0.28087607  0.01052161  0.24567367]
 [-2.08991986  0.1975824  -0.28065555  0.0107784   0.24414497]
 [-2.08657535  0.19678646 -0.28008515  0.01069663  0.24764756]
 ..., 
 [-0.94416547  1.19078573 -0.12404079  1.0380168   1.1347173 ]
 [-1.32537296  0.04903533 -0.14013485 -0.01952026  1.21051678]
 [-1.24848008 -0.0041949  -0.13715756 -0.00913848  1.12433743]]


In [11]:
# Refit the linear model on the PCA-projected inputs and score it the same way
# as the baseline. LinearRegression comes from the notebook's top import cell.
# NOTE: this rebinds `linearModel`, `test_pred` and `rmse`, so the baseline
# values computed earlier are no longer available after this cell runs.
linearModel = LinearRegression().fit(train_pca, train_y)
test_pred = linearModel.predict(test_pca)
rmse = sqrt(mean_squared_error(test_y, test_pred))
rmse

0.7599748891618074