In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 20,10

from sklearn import linear_model


## Predictor Variables

In [2]:
def _read_daily_csv(path, date_col):
    """Read a CSV and return it indexed by its parsed date column, in index order.

    path     -- location of the CSV file
    date_col -- name of the column holding the dates; it becomes the index
    """
    frame = pd.read_csv(path)
    frame[date_col] = pd.to_datetime(frame[date_col])
    return frame.set_index(date_col).sort_index()


# Commodity prices
dfCommodity = _read_daily_csv('../data/commodityPrices.csv', 'date')

# Wind generation
dfWind = _read_daily_csv('../data/MISOWindGeneration.csv', 'date')

# Demand: the load file keys its rows by 'Market Day', so rename the index to
# 'date' to line up with the other two frames.
dfLoad = _read_daily_csv('../data/MISOActualLoad.csv', 'Market Day')
dfLoad.index.names = ['date']

# Keep only the actual-load columns.
dfLoadActual = dfLoad[[
    'Central ActualLoad (MWh)',
    'East ActualLoad (MWh)',
    'MISO ActualLoad (MWh)',
    'Midwest ISO ActualLoad (MWh)',
    'North ActualLoad (MWh)',
    'South ActualLoad (MWh)',
    'West ActualLoad (MWh)',
]]
# Missing loads become 0.
# NOTE(review): this makes an absent reading indistinguishable from zero demand
# (several region columns are all 0 in the preview of dfX) -- confirm this is intended.
dfLoadActual = dfLoadActual.fillna(0)

In [3]:
# Combine the three predictor frames on their shared date index.
# Both merges use pandas' default inner join, so only dates present in all three survive.
dfX = (dfCommodity
       .merge(dfWind, left_index=True, right_index=True)
       .merge(dfLoadActual, left_index=True, right_index=True))
dfX.head()

Unnamed: 0_level_0,Central Appalachia,Northern Appalachia,Illinois Basin,Powder River Basin,Uinta Basin,NgPrice,windGenerationMWh,Central ActualLoad (MWh),East ActualLoad (MWh),MISO ActualLoad (MWh),Midwest ISO ActualLoad (MWh),North ActualLoad (MWh),South ActualLoad (MWh),West ActualLoad (MWh)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-07-06,53.333333,51.166667,44,9,44.5,3.355,325.927083,21654.299167,28127.989583,0,62493.1925,0,0,12710.90375
2009-07-07,53.5,52.0,44,9,44.5,3.3,511.05625,22956.574167,27699.694167,0,63508.234167,0,0,12851.965833
2009-07-08,53.5,52.0,44,9,44.5,3.298571,728.637083,22402.2125,27335.640833,0,62295.66125,0,0,12557.807917
2009-07-09,53.5,52.0,44,9,44.5,3.297143,1716.773333,23165.5275,28635.972917,0,64392.903333,0,0,12591.402917
2009-07-10,53.5,52.0,44,9,44.5,3.295714,837.547917,23676.907917,29816.782083,0,66701.584583,0,0,13207.894583


## Response Variable

In [4]:
# Load the LMP table from HDF5. The preview in the next cell shows a (Node, date)
# index with 'Type' and 'meanPrice' columns.
dfMiso = pd.read_hdf('../data/LMP.h5')

In [14]:
# Peek at rows 5 through 9 (skip the first five, show the next five)
dfMiso.iloc[5:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Type,meanPrice
Node,date,Unnamed: 2_level_1,Unnamed: 3_level_1
AEBN,20050616,Interface,-0.393333
AEBN,20050617,Interface,28.97375
AEBN,20050617,Interface,1.207917
AEBN,20050617,Interface,-0.26625
AEBN,20050618,Interface,17.643333


## Data Prep before Regression

In [26]:
%%time

# Collapse MultiIndex of dfY(to prep for merge with dfX)
dfY = dfMiso.dropna()                                   # Drop rows with NA
dfY = dfY.reset_index()                                 # collapse MultiIndex
dfY = dfY[['date', 'meanPrice']].set_index('date')
dfY.index = pd.to_datetime(dfY.index, format='%Y%m%d')  # expensive operation (parsing 20,000,000 dates)

CPU times: user 24.4 s, sys: 6.51 s, total: 30.9 s
Wall time: 33.4 s


In [28]:
# First five rows of the date-indexed response frame
dfY.iloc[:5]

Unnamed: 0_level_0,meanPrice
date,Unnamed: 1_level_1
2005-06-16,29.46375
2005-06-16,-0.947083
2005-06-16,-0.393333
2005-06-17,28.97375
2005-06-17,1.207917


In [33]:
# Attach the predictors to every price row by date (inner join over ~20,000,000 rows, about 3.45 s)
df = dfY.merge(dfX, how='inner', left_index=True, right_index=True)

# Keep only rows dated up to and including 2013-09-01
df = df.loc[:'2013-09-01']

# Deliberately no df.plot() here: it takes too long and looks the same as the regular plots.

## Random Forest Regression (All Nodes)
Get a feel for the training time on 20 million data points.

In [71]:
%%time

from sklearn.cross_validation import train_test_split
from sklearn import grid_search

# Split into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('meanPrice', axis=1), df['meanPrice'], 
                                                    test_size=0.2, random_state=0)

# Gridsearch
params = {
        'n_estimators': [5,10,20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [None,1,2,3],
        'min_samples_split': [2,3]
}
clf = grid_search.GridSearchCV(RandomForestRegressor(n_jobs=3), params, cv=5, n_jobs=3)
fit = clf.fit(df.drop('meanPrice', axis=1), df['meanPrice'])

print 'Best Params: ', fit.best_params_
print 'Best Score: ', fit.best_score_
print 'Grid Scores: ', fit.grid_scores_
print

Best Params:  {'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 10, 'max_depth': 3}
Best Score:  0.00893222494516
Grid Scores:  [mean: -0.00488, std: 0.01117, params: {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 5, 'max_depth': None}, mean: -0.00530, std: 0.01085, params: {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 10, 'max_depth': None}, mean: -0.00700, std: 0.01486, params: {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 20, 'max_depth': None}, mean: -0.00872, std: 0.01326, params: {'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 5, 'max_depth': None}, mean: -0.00791, std: 0.01273, params: {'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 10, 'max_depth': None}, mean: -0.00598, std: 0.01216, params: {'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 20, 'max_depth': None}, mean: 0.00482, std: 0.00642, params: {'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimat

In [72]:
from sklearn import metrics

# Best Estimator
rf = fit.best_estimator_

# Metrics
preds = rf.predict(X_test)
print 'R2 Score: ', metrics.r2_score(y_test.values, preds)
print 'Explained Variance Score: ', metrics.explained_variance_score(y_test.values, preds)
print 'MAE: ', metrics.mean_absolute_error(y_test.values, preds)
print 'MSE: ', metrics.mean_squared_error(y_test.values, preds)
print 'Median AE: ', metrics.median_absolute_error(y_test.values, preds)

# Feature Importances
lcols = X_train.columns
pd.DataFrame(zip(lcols, rf.feature_importances_), columns=['Predictors', 'Feature Importances']).sort('Feature Importances', ascending=False)

R2 Score:  0.0146620784467
Explained Variance Score:  0.0146622420278
MAE:  13.5621520947
MSE:  270.330379536
Median AE:  10.8706173145




Unnamed: 0,Predictors,Feature Importances
7,Central ActualLoad (MWh),0.32834
10,Midwest ISO ActualLoad (MWh),0.243047
13,West ActualLoad (MWh),0.113083
8,East ActualLoad (MWh),0.106345
5,NgPrice,0.054134
6,windGenerationMWh,0.052023
2,Illinois Basin,0.03171
0,Central Appalachia,0.025772
3,Powder River Basin,0.019061
4,Uinta Basin,0.013347


In [45]:
%%time

from sklearn.cross_validation import train_test_split
from sklearn import grid_search

# Split into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('meanPrice', axis=1), df['meanPrice'], 
                                                    test_size=0.2, random_state=0)

# Gridsearch
params = {
    'n_estimators': [10,30,50]
}
clf = grid_search.GridSearchCV(RandomForestRegressor(n_jobs=3), params, cv=5, n_jobs=3)
fit = clf.fit(df.drop('meanPrice', axis=1), df['meanPrice'])

print 'Best Params: ', fit.best_params_
print 'Best Score: ', fit.best_score_
print 'Grid Scores: ', fit.grid_scores_
print

Best Params:  {'n_estimators': 10}
Best Score:  -0.00526127482102
Grid Scores:  [mean: -0.00526, std: 0.00969, params: {'n_estimators': 10}, mean: -0.00611, std: 0.01253, params: {'n_estimators': 30}, mean: -0.00647, std: 0.01391, params: {'n_estimators': 50}]

CPU times: user 4min 54s, sys: 26.9 s, total: 5min 21s
Wall time: 50min 18s


In [70]:
from sklearn import metrics

# Best Estimator
rf = fit.best_estimator_

# Metrics
preds = rf.predict(X_test)
print 'R2 Score: ', metrics.r2_score(y_test.values, preds)
print 'Explained Variance Score: ', metrics.explained_variance_score(y_test.values, preds)
print 'MAE: ', metrics.mean_absolute_error(y_test.values, preds)
print 'MSE: ', metrics.mean_squared_error(y_test.values, preds)
print 'Median AE: ', metrics.median_absolute_error(y_test.values, preds)

# Feature Importances
lcols = X_train.columns
pd.DataFrame(zip(lcols, rf.feature_importances_), columns=['Predictors', 'Feature Importances']).sort('Feature Importances', ascending=False)

R2 Score:  0.0319441113884
Explained Variance Score:  0.031944596342
MAE:  13.5401566805
MSE:  265.589002571
Median AE:  11.3509943695




Unnamed: 0,Predictors,Feature Importances
10,Midwest ISO ActualLoad (MWh),0.344409
13,West ActualLoad (MWh),0.143422
6,windGenerationMWh,0.137001
5,NgPrice,0.105353
0,Central Appalachia,0.075182
7,Central ActualLoad (MWh),0.062301
8,East ActualLoad (MWh),0.039073
3,Powder River Basin,0.032448
4,Uinta Basin,0.022525
1,Northern Appalachia,0.022191


In [34]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn import grid_search

clf = grid_search.GridSearchCV(RandomForestRegressor(n_jobs=3), {
        'n_estimators': [30,50],
#         'max_features': ['auto', 'sqrt', 'log2'],
#         'n_estimators': [30,50,75,100],
#         'max_depth': [None,1,2,3,5],
#         'min_samples_split': [2,3,5]
    })
fit = clf.fit(df.drop('meanPrice', axis=1), df['meanPrice'])

print 'Best Params: ', fit.best_params_
print 'Best Score: ', fit.best_score_
print 'Grid Scores: ', fit.grid_scores_
print

Params:  {'n_estimators': 30}
Score:  -0.0311647480664
CPU times: user 1h 6min 12s, sys: 49.4 s, total: 1h 7min 1s
Wall time: 25min 38s


In [36]:
# Best Estimator
rf = fit.best_estimator_
print 'Score: ', rf.score(df.drop('meanPrice', axis=1), df['meanPrice'])

Score:  0.0322719992296


In [37]:
# Feature importances of the fitted forest, largest first.
# (A random forest exposes importances, not regression coefficients.)
# sort_values replaces the deprecated DataFrame.sort, which is what triggered the
# warning shown in this cell's output; list(...) materialises the pairs so the
# constructor does not depend on zip returning a list.
lcols = df.drop('meanPrice', axis=1).columns
pd.DataFrame(list(zip(lcols, rf.feature_importances_)), columns=['Predictors', 'Feature Importances']).sort_values('Feature Importances', ascending=False)

  app.launch_new_instance()


Unnamed: 0,Predictors,Feature Importances
10,Midwest ISO ActualLoad (MWh),0.349451
13,West ActualLoad (MWh),0.144349
6,windGenerationMWh,0.139516
5,NgPrice,0.117725
0,Central Appalachia,0.065489
7,Central ActualLoad (MWh),0.058968
8,East ActualLoad (MWh),0.038283
3,Powder River Basin,0.030924
4,Uinta Basin,0.019738
1,Northern Appalachia,0.019021


## Random Forest Regressor (Single Node)
*For reference only; do not run.*

In [80]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn import grid_search

clf = grid_search.GridSearchCV(RandomForestRegressor(n_jobs=3), {
        'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [30,50,75,100],
        'max_depth': [None,1,2,3,5],
        'min_samples_split': [2,3,5]
    })
fit = clf.fit(df.drop('meanPrice', axis=1), df['meanPrice'])

print 'Params: ', fit.best_params_
print 'Score: ', fit.best_score_

Params:  {'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 75, 'max_depth': 2}
Score:  0.0150961717169
CPU times: user 2min 22s, sys: 6.85 s, total: 2min 29s
Wall time: 3min 4s


In [81]:
# Best Estimator
rf = fit.best_estimator_a
print 'Score: ', rf.score(df.drop('meanPrice', axis=1), df['meanPrice'])

Score:  0.0327399504348


In [84]:
# Feature importances of the fitted forest, largest first.
# (A random forest exposes importances, not regression coefficients.)
# sort_values replaces the deprecated DataFrame.sort, which is what triggered the
# warning shown in this cell's output; list(...) materialises the pairs so the
# constructor does not depend on zip returning a list.
lcols = df.drop('meanPrice', axis=1).columns
pd.DataFrame(list(zip(lcols, rf.feature_importances_)), columns=['Predictors', 'Feature Importances']).sort_values('Feature Importances', ascending=False)

  app.launch_new_instance()


Unnamed: 0,Predictors,Feature Importances
5,NgPrice,0.583761
7,Central ActualLoad (MWh),0.141623
2,Illinois Basin,0.0968
10,Midwest ISO ActualLoad (MWh),0.089572
1,Northern Appalachia,0.042199
13,West ActualLoad (MWh),0.020678
8,East ActualLoad (MWh),0.014032
0,Central Appalachia,0.007726
6,windGenerationMWh,0.003608
3,Powder River Basin,0.0
