In [1]:
import datetime
import pickle
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
class ProcessingData:
    """Load a pickled data table and prepare it for regression.

    The constructor unpickles a file into ``self.data``; the other methods
    transform ``self.data`` step by step. They index ``self.data`` by column
    name and call ``drop``/``dropna`` on it, so the unpickled object is
    expected to be a pandas DataFrame.
    """

    # Raw action code -> collapsed code. The two-stage codes (3 and 5) are
    # folded into their single-stage counterparts (1 and 2). Any value that
    # is not a key of this table becomes missing (NaN) after the mapping.
    _ACTION_CODES = {0: 0, 1: 1, 3: 1, 2: 2, 5: 2}
    _HEATING_CODE = 1
    _COOLING_CODE = 2

    # Columns removed by filter_data().
    _DROPPED_COLUMNS = ['dt', 'action', 'previous_action', 'action_duration',
                        'zone_temperatureHVAC_Zone_Shelter_Corridor']

    def __init__(self, filepath):
        """Unpickle ``filepath`` and keep the result in ``self.data``.

        :param filepath: path of the pickle file to read.
        """
        # NOTE(security): unpickling can execute arbitrary code embedded in
        # the file; only load pickles that come from a trusted source.
        with open(filepath, 'rb') as f:
            # encoding='latin1' is the setting the pickle documentation
            # prescribes for files written by Python 2 -- presumably the
            # origin of these data files (TODO confirm).
            self.data = pickle.load(f, encoding='latin1')

    def fix_data(self, interval):
        """Collapse the action codes and add heating/cooling indicator columns.

        ``self.data["action"]`` is rewritten in place: 0 stays 0, 1 and 3
        become 1, 2 and 5 become 2, and every other value becomes missing.
        Two 0/1 columns are then added: ``action_heating`` (collapsed action
        equals 1) and ``action_cooling`` (collapsed action equals 2). Rows
        whose action is missing get 0 in both.

        The rows whose ``dt`` equals ``interval`` are returned as a separate
        frame. They are NOT removed from ``self.data``, so later calls such
        as ``filter_data`` still operate on every row.

        :param interval: value compared for equality against the ``dt``
            column (minutes, according to the original documentation).
        :return: tuple ``(self.data, rows of self.data where dt == interval)``.
        """
        collapsed = self.data["action"].map(self._ACTION_CODES)

        self.data["action"] = collapsed
        # Comparisons against a missing value are False, hence 0.
        self.data['action_heating'] = collapsed.eq(self._HEATING_CODE).astype('int64')
        self.data['action_cooling'] = collapsed.eq(self._COOLING_CODE).astype('int64')

        return self.data, self.data[self.data["dt"] == interval]

    def filter_data(self):
        """Drop the columns that are not used as model inputs or target.

        Removes ``dt``, ``action``, ``previous_action``, ``action_duration``
        and ``zone_temperatureHVAC_Zone_Shelter_Corridor`` from ``self.data``
        and stores the result back on the instance. All of these columns must
        be present, so this is meant to be called once, after ``fix_data``.

        :return: the reduced ``self.data``.
        """
        self.data = self.data.drop(self._DROPPED_COLUMNS, axis=1)
        return self.data

    def drop_nan(self):
        """Remove every row of ``self.data`` that contains a missing value.

        :return: the reduced ``self.data``.
        """
        self.data = self.data.dropna()
        return self.data


In [3]:
# Training set: load the pickle, recode the actions, drop the bookkeeping
# columns, then discard rows with missing values.
# NOTE(review): fix_data(5) returns (all rows, rows with dt == 5) and that
# return value is not kept, so rows with any other dt remain in the data --
# confirm this is intended.
training = ProcessingData("../../Data/avenal-animal-shelter_training_data.pkl")
training.fix_data(5)
training.filter_data()
training_data = training.drop_nan()
training_data.head()

Unnamed: 0_level_0,t_next,t_in,t_out,occ,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-04-18 23:16:42+00:00,69.5,69.483333,66.4997,0.0,0,0
2018-04-18 23:21:42+00:00,69.5,69.5,66.4997,0.0,0,0
2018-04-18 23:26:42+00:00,69.5,69.5,66.520546,0.0,0,0
2018-04-18 23:31:42+00:00,69.4,69.5,66.583083,0.0,0,0
2018-04-18 23:36:42+00:00,69.5,69.4,66.666466,0.0,0,0


In [4]:
# Test set: same preprocessing steps as the training set.
# NOTE(review): fix_data(5) returns (all rows, rows with dt == 5) and that
# return value is not kept, so rows with any other dt remain in the data --
# confirm this is intended.
testing = ProcessingData("../../Data/avenal-animal-shelter_test_data.pkl")
testing.fix_data(5)
testing.filter_data()
testing_data = testing.drop_nan()
testing_data.head()

Unnamed: 0_level_0,t_next,t_in,t_out,occ,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-20 00:00:00+00:00,72.8,72.8,67.778613,0.0,0,0
2018-03-20 00:05:00+00:00,73.0,72.8,67.807436,0.0,0,0
2018-03-20 00:10:00+00:00,73.0,73.0,67.835998,0.0,0,0
2018-03-20 00:15:00+00:00,73.0,73.0,67.864038,0.0,0,0
2018-03-20 00:20:00+00:00,73.0,73.0,67.891817,0.0,0,0


In [5]:
# Split the training frame: 't_next' is the regression target, every other
# column is a feature.
train_X = training_data.drop(columns=['t_next'])
train_y = training_data['t_next']
train_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-18 23:16:42+00:00,69.483333,66.4997,0.0,0,0
2018-04-18 23:21:42+00:00,69.5,66.4997,0.0,0,0
2018-04-18 23:26:42+00:00,69.5,66.520546,0.0,0,0
2018-04-18 23:31:42+00:00,69.5,66.583083,0.0,0,0
2018-04-18 23:36:42+00:00,69.4,66.666466,0.0,0,0


In [6]:
# Split the test frame the same way: 't_next' is the target, the remaining
# columns are the features.
test_X = testing_data.drop(columns=['t_next'])
test_y = testing_data['t_next']
test_X.head()

Unnamed: 0_level_0,t_in,t_out,occ,action_heating,action_cooling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-03-20 00:00:00+00:00,72.8,67.778613,0.0,0,0
2018-03-20 00:05:00+00:00,72.8,67.807436,0.0,0,0
2018-03-20 00:10:00+00:00,73.0,67.835998,0.0,0,0
2018-03-20 00:15:00+00:00,73.0,67.864038,0.0,0,0
2018-03-20 00:20:00+00:00,73.0,67.891817,0.0,0,0


In [7]:
# Baseline: ordinary least-squares regression on the raw (unscaled) features,
# evaluated by predicting the test split.
linearModel = LinearRegression()
linearModel.fit(train_X, train_y)
test_pred = linearModel.predict(test_X)
test_pred

array([ 72.78514218,  72.78522752,  72.98370678, ...,  72.34841732,
        72.44761467,  72.54681201])

In [8]:
# Root-mean-squared error of the predictions on the test split.
mse = mean_squared_error(test_y, test_pred)
rmse = sqrt(mse)
rmse

0.8619954953985445

In [9]:
# Standardise the features. The scaler is fitted on the training split only
# and the same transform is then applied to both splits.
scaler = StandardScaler().fit(train_X)
train_scaled = scaler.transform(train_X)
test_scaled = scaler.transform(test_X)
test_scaled

array([[-0.0779381 , -0.81774339, -0.32733123, -0.0993657 , -0.3962286 ],
       [-0.0779381 , -0.81532876, -0.32733123, -0.0993657 , -0.3962286 ],
       [-0.01781605, -0.81293601, -0.32733123, -0.0993657 , -0.3962286 ],
       ..., 
       [-0.22824323,  0.85894007, -0.32733123, -0.0993657 , -0.3962286 ],
       [-0.1981822 ,  0.85894007, -0.32733123, -0.0993657 , -0.3962286 ],
       [-0.16812118,  0.85894007, -0.32733123, -0.0993657 , -0.3962286 ]])

In [10]:
# Project the standardised features onto principal components. A fractional
# n_components asks PCA to keep enough components to explain that share of
# the variance (0.95 here). The projection is fitted on the training split
# only.
# NOTE: the recorded output has five values per row, the same as the number
# of input features, so no component was actually dropped.
pca = PCA(0.95).fit(train_scaled)
train_pca = pca.transform(train_scaled)
test_pca = pca.transform(test_scaled)

print("Training Data after PCA: ", train_pca)

Training Data after PCA:  [[ -1.47399678e+00   2.00411700e-01  -2.06632610e-01  -2.02723524e-03
    1.71563409e-01]
 [ -1.47105445e+00   1.98215972e-01  -2.06510609e-01  -1.77789288e-03
    1.68165439e-01]
 [ -1.46988990e+00   1.97873292e-01  -2.06305686e-01  -1.76112974e-03
    1.69403865e-01]
 ..., 
 [ -9.72744169e-01   9.14800489e-02  -1.06950900e-01  -8.14587171e-04
    8.93898314e-01]
 [ -9.02127786e-01   3.87824814e-02  -1.04022814e-01   5.16963439e-03
    8.12347396e-01]
 [ -8.66819594e-01   1.24336976e-02  -1.02558771e-01   8.16174517e-03
    7.71571936e-01]]


In [11]:
# Refit the linear model on the PCA-projected features and score it on the
# test split.
# NOTE: linearModel, test_pred and rmse are rebound here, replacing the
# baseline values computed in the earlier cells.
# NOTE(review): the recorded RMSE equals the baseline RMSE to about 1e-15,
# consistent with the PCA step having kept all five components.
linearModel = LinearRegression().fit(train_pca, train_y)
test_pred = linearModel.predict(test_pca)
rmse = sqrt(mean_squared_error(test_y, test_pred))
rmse

0.8619954953985456