In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
import matplotlib.dates as md
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

In [2]:
fname = 'sfa.csv'

In [3]:
data = pd.read_csv(fname, parse_dates='time', infer_datetime_format=True)

In [4]:
data = pd.concat([data, pd.get_dummies(data.Day)] , axis=1)

In [5]:
data = pd.concat([data, pd.get_dummies(data.Time)] , axis=1)

In [6]:
sp = 60
data['td'] = data.temp - sp

In [7]:
data.head(3)

Unnamed: 0,time,totalKW,subpanels,ahu,temp,Day,Time,Fri,Mon,Sat,...,15,16,17,18,19,20,21,22,23,td
0,9/2/15 21:00,746.116169,379.199472,187.62476,69.66,Wed,21,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.66
1,9/2/15 22:00,831.855288,338.795217,226.147018,68.12,Wed,22,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.12
2,9/2/15 23:00,764.078775,385.585081,238.396111,67.25,Wed,23,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.25


In [8]:
data.loc[data['td'] <= 0, 'td'] = 0

In [9]:
data = data[data['totalKW'] > 400]

In [10]:
x = data[['Fri','Mon', 'Sat',
             'Sun',       'Thu',       'Tue',       'Wed',           0,
                 1,           2,           3,           4,           5,
                 6,           7,           8,           9,          10,
                11,          12,          13,          14,          15,
                16,          17,          18,          19,          20,
                21,          22,          23,        'td']]

In [11]:
y = data['totalKW']

In [12]:
lm = LinearRegression()

In [38]:
lm.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [39]:
lm.score(x,y)

0.50271140575820317

In [13]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.33, random_state=42)

In [14]:
lm.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
lm.score(x_train, y_train)

0.49136508828430303

In [16]:
y_predicted = lm.predict(x_test)

In [None]:
df = pd.DataFrame({'Predicted': y_predicted[:], 'Actuals': y_test, 'Date':x_test})

In [None]:
df.to_csv('SantaFePrediction.csv')

In [17]:
r2_score(y_test, y_predicted)

0.51450658093765911

In [18]:
res = y_test - y_predicted

In [19]:
plt.figure(figsize=(9,9))
stats.probplot(res, dist="norm", plot=plt)

((array([ -3.13048135e+00,  -2.86002412e+00,  -2.70874502e+00,
          -2.60187527e+00,  -2.51838938e+00,  -2.44946088e+00,
          -2.39051505e+00,  -2.33886360e+00,  -2.29278774e+00,
          -2.25111962e+00,  -2.21302827e+00,  -2.17790069e+00,
          -2.14527126e+00,  -2.11477769e+00,  -2.08613238e+00,
          -2.05910305e+00,  -2.03349932e+00,  -2.00916313e+00,
          -1.98596175e+00,  -1.96378263e+00,  -1.94252944e+00,
          -1.92211910e+00,  -1.90247941e+00,  -1.88354725e+00,
          -1.86526708e+00,  -1.84758977e+00,  -1.83047163e+00,
          -1.81387366e+00,  -1.79776085e+00,  -1.78210170e+00,
          -1.76686773e+00,  -1.75203310e+00,  -1.73757430e+00,
          -1.72346989e+00,  -1.70970023e+00,  -1.69624731e+00,
          -1.68309456e+00,  -1.67022668e+00,  -1.65762957e+00,
          -1.64529015e+00,  -1.63319626e+00,  -1.62133665e+00,
          -1.60970078e+00,  -1.59827887e+00,  -1.58706174e+00,
          -1.57604082e+00,  -1.56520807e+00,  -1.554555