In [1]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns                                   # For pretty plot
from datetime import datetime, timedelta

from sklearn import datasets, linear_model
from sklearn.cross_validation import cross_val_predict

from itertools import product

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)



In [2]:
import pickle

## Load DataFrame of all irradiance means

In [13]:
# Load the pickled frame of irradiance measurements.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle can execute arbitrary code on load -- only use trusted files.
with open('../idaWebData/irradMean_df.p', 'rb') as irrad_file:
    irradMean_df = pickle.load(irrad_file)

In [14]:
# Preview the first ten rows of irradMean_df.
irradMean_df.head(10)

Unnamed: 0_level_0,time,gre000z0
stn,Unnamed: 1_level_1,Unnamed: 2_level_1
TAE,200001010000,0
TAE,200001010010,0
TAE,200001010020,0
TAE,200001010030,0
TAE,200001010040,12
TAE,200001010050,0
TAE,200001010100,0
TAE,200001010110,0
TAE,200001010120,0
TAE,200001010130,-6


In [15]:
# Preview the last rows of irradMean_df.
irradMean_df.tail()

Unnamed: 0_level_0,time,gre000z0
stn,Unnamed: 1_level_1,Unnamed: 2_level_1
KLO,201512312310,2
KLO,201512312320,2
KLO,201512312330,2
KLO,201512312340,2
KLO,201512312350,2


In [16]:
# (rows, columns) of the frame as loaded.
irradMean_df.shape

(86541087, 2)

## Pre-processing

Some rows are repeated header lines (their `time` value is the literal string `time`); remove them.

In [20]:
# Keep only rows whose `time` cell is not the literal header text 'time'.
is_data_row = irradMean_df['time'].ne('time')
irradMean_df = irradMean_df[is_data_row]

In [18]:
# (rows, columns) of irradMean_df, to compare against the earlier shape.
irradMean_df.shape

(86535644, 2)

Convert time column into datetime

In [21]:
# Parse the `time` strings (format YYYYmmddHHMM) into datetimes.
# `assign` returns a new frame instead of writing a column into the filtered
# slice produced earlier, which avoids pandas' SettingWithCopyWarning.
irradMean_df = irradMean_df.assign(
    time=pd.to_datetime(irradMean_df['time'], format='%Y%m%d%H%M')
)

In [None]:
# Preview the first rows of irradMean_df.
irradMean_df.head()

Remove missing measurements ("-")

In [22]:
# Discard rows where the measurement is the placeholder string '-'.
has_measurement = irradMean_df['gre000z0'].ne('-')
irradMean_df = irradMean_df[has_measurement]

In [None]:
# (rows, columns) of irradMean_df at this point.
irradMean_df.shape

Convert the measurement column (`gre000z0`) to a numeric dtype

In [23]:
# Convert the measurement column to a numeric dtype.
# `assign` builds a new frame rather than setting a column on the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
irradMean_df = irradMean_df.assign(
    gre000z0=pd.to_numeric(irradMean_df['gre000z0'])
)

In [None]:
# Type of the first row's gre000z0 value, as a spot check of the conversion.
type(irradMean_df.iloc[0]['gre000z0'])

Remove noise from zero measurements (negative irradiance makes no sense)

In [24]:
# Negative irradiance readings are treated as noise around zero: floor the
# column at 0. `clip` + `assign` replaces the masked `.loc` write so that no
# value is set on a filtered slice (SettingWithCopyWarning); NaN values, if
# any, are left untouched exactly as before.
irradMean_df = irradMean_df.assign(
    gre000z0=irradMean_df['gre000z0'].clip(lower=0)
)

## Grouping

For each station, we want the average for each month (over all years)

Firstly, we transform the time column into month only.

In [25]:
# Overwrite `time` with its month number. The column keeps its name but no
# longer holds datetimes, so this cell cannot be run twice in a row.
month_numbers = irradMean_df['time'].dt.month
irradMean_df['time'] = month_numbers

In [None]:
# Preview the first rows of irradMean_df.
irradMean_df.head()

And then we group by station and month

In [26]:
# Move the current index into an ordinary column and rebind the name to the
# re-indexed frame.
irradMean_df = irradMean_df.reset_index()

In [None]:
# Preview the first rows of irradMean_df.
irradMean_df.head()

In [27]:
# Group rows by station and by the (month-valued) `time` column.
group_keys = ['stn', 'time']
stn_month_irrad = irradMean_df.groupby(group_keys)

In [28]:
# Average within each (stn, time) group.
means = stn_month_irrad.agg('mean')

In [29]:
# Show a bounded preview rather than rendering the whole grouped frame.
means.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,gre000z0
stn,time,Unnamed: 2_level_1
ABO,1,58.652412
ABO,2,97.041357
ABO,3,149.705309
ABO,4,184.841352
ABO,5,202.885559
ABO,6,224.238990
ABO,7,215.667876
ABO,8,190.617565
ABO,9,155.846557
ABO,10,102.394213


In [30]:
# Check which type the grouped mean produced.
type(means)

pandas.core.frame.DataFrame

In [31]:
# Persist the grouped means. The context manager guarantees that the file is
# flushed and closed, instead of leaving the handle to the garbage collector.
with open('groupedIrradMeans.p', 'wb') as means_file:
    pickle.dump(means, means_file)

## Plots

Some plots

In [None]:
# Mean gre000z0 for station ABO, plotted against the month-valued `time`
# index so that the x axis reads as months rather than array positions.
abo_means = means.loc['ABO']['gre000z0']
fig, ax = plt.subplots()
ax.plot(abo_means.index, abo_means.values)
ax.set(title='Station ABO', xlabel='Month', ylabel='Mean gre000z0');

In [None]:
# Mean gre000z0 of every (stn, time) row, in index order.
fig, ax = plt.subplots()
ax.plot(means['gre000z0'].values)
ax.set(
    title='All stations',
    xlabel='Row position in (stn, time) order',
    ylabel='Mean gre000z0',
);

## Add power from ELL model

Load trained model

In [32]:
# Load the trained model; the context manager closes the file handle.
# NOTE(review): pickle can execute arbitrary code on load -- only use trusted files.
with open('model.p', 'rb') as model_file:
    model = pickle.load(model_file)



In [33]:
# Display the loaded model's repr (pipeline steps and hyper-parameters).
model

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

This model returns power in watts, assuming an installation of 14 panels of 255 Wp each. However, the power potential of a station is defined as the DC power that can be produced at that station by an installation of 1 Wp, so the predictions are divided by the installed capacity (14 × 255 Wp).

In [45]:
# Installed capacity used for normalisation: 14 panels of 255 Wp each.
panelCount = 14
panelPeakPower = 255  # Wp per panel
normalizingFactor = panelCount * panelPeakPower  # in Wp

Predict power for avg irradiation in each month

In [42]:
# Mean irradiance per (stn, time) row as a plain NumPy array.
irradiance_s = means['gre000z0'].values

In [43]:
# Inspect the irradiance array (NumPy abbreviates long arrays in its repr).
irradiance_s

array([  58.65241178,   97.04135745,  149.70530885, ...,  117.35456507,
         68.43722767,   54.28122482])

In [46]:
# The model takes a 2-D input, so reshape the 1-D array into one column.
irradiance_column = irradiance_s.reshape(-1, 1)
predicted_power = model.predict(irradiance_column)
# Scale the prediction down by the installed capacity.
power_s = predicted_power / normalizingFactor

In [47]:
# Inspect the normalised power predictions.
power_s

array([ 0.05875999,  0.09631137,  0.14754017, ...,  0.11611023,
        0.06834802,  0.05447302])

Add power column to DataFrame

In [57]:
# Attach the predictions as a `power` column, aligned row by row with `means`.
means = means.assign(power=power_s)

In [60]:
# Preview the first 24 rows of means, now including the power column.
means.head(24)

Unnamed: 0_level_0,Unnamed: 1_level_0,gre000z0,power
stn,time,Unnamed: 2_level_1,Unnamed: 3_level_1
ABO,1,58.652412,0.05876
ABO,2,97.041357,0.096311
ABO,3,149.705309,0.14754
ABO,4,184.841352,0.181535
ABO,5,202.885559,0.198935
ABO,6,224.23899,0.219477
ABO,7,215.667876,0.211238
ABO,8,190.617565,0.187109
ABO,9,155.846557,0.153493
ABO,10,102.394213,0.101533


In [61]:
# Persist means (irradiance + power). The context manager guarantees that the
# file is flushed and closed before it is read back.
with open('groupedStations.p', 'wb') as stations_file:
    pickle.dump(means, stations_file)

In [3]:
# Reload the grouped station frame from its pickle file.
stations_pickle_path = 'groupedStations.p'
station_df = pd.read_pickle(stations_pickle_path)

In [9]:
# First (stn, time) label of the index. An Index is subscripted, not called:
# `station_df.index(0)` raises TypeError.
station_df.index[0]

('ABO', 1)

In [4]:
# NOTE(review): the original filter was left unfinished
# (`station_df[station_df[]]` is a syntax error). Selecting a single station by
# its `stn` index label is a placeholder -- TODO confirm the intended criterion.
nice_station = station_df.loc[['ABO']]
nice_station

Unnamed: 0_level_0,Unnamed: 1_level_0,gre000z0,power
stn,time,Unnamed: 2_level_1,Unnamed: 3_level_1
ABO,1,58.652412,0.05876
ABO,2,97.041357,0.096311
ABO,3,149.705309,0.14754
ABO,4,184.841352,0.181535
ABO,5,202.885559,0.198935


## Using the model with Irradiance, Temp and Wind

In [64]:
# Load the model trained on irradiance, temperature and wind; the context
# manager closes the file handle.
# NOTE(review): pickle can execute arbitrary code on load -- only use trusted files.
with open('model_with_temp_wind.p', 'rb') as model2_file:
    model2 = pickle.load(model2_file)

In [65]:
# Same installed capacity as before: 14 panels of 255 Wp each.
panelCount, panelPeakPower = 14, 255
normalizingFactor = panelCount * panelPeakPower  # in Wp

In [69]:
# Load the per-(stn, time) feature frame; the context manager closes the
# file handle.
# NOTE(review): pickle can execute arbitrary code on load -- only use trusted files.
with open('all_features.p', 'rb') as features_file:
    allFeatures_df = pickle.load(features_file)

In [70]:
# Preview the first rows of the feature frame.
allFeatures_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Irradiance,Temp,Wind
stn,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABO,1,58.652412,1.335902,6.267113
ABO,2,97.041357,1.268299,5.433493
ABO,3,149.705309,3.154173,6.388994
ABO,4,184.841352,5.838967,6.862268
ABO,5,202.885559,9.307222,7.174839


In [71]:
# (rows, columns) of the feature frame before the power column is added.
allFeatures_df.shape

(1573, 3)

In [72]:
# Build the design matrix from the three feature columns only, in the frame's
# original column order. Selecting them by name keeps this cell correct on a
# re-run, after a `power` column has been added to allFeatures_df.
# NOTE(review): assumes the model was trained with this column order -- confirm.
featureColumns = ['Irradiance', 'Temp', 'Wind']
X = allFeatures_df[featureColumns].values

In [75]:
# Predict with model2, the model loaded from 'model_with_temp_wind.p' for this
# section. The original called `model`, the irradiance-only pipeline, which
# does not match the three-column X built from allFeatures_df.
# Dividing by the installed capacity scales the prediction as before.
power = model2.predict(X) / normalizingFactor

In [76]:
# Attach the predictions as a `power` column, aligned row by row.
allFeatures_df = allFeatures_df.assign(power=power)

In [78]:
# (rows, columns) of the feature frame after adding the power column.
allFeatures_df.shape

(1573, 4)

In [80]:
# Persist the feature frame with its power column. The context manager
# guarantees that the file is flushed and closed.
with open('groupedStationsAllFeatures.p', 'wb') as all_features_file:
    pickle.dump(allFeatures_df, all_features_file)

## Conclusion

Now, for each of the coming years, we can predict the average power produced per month at each station, and from that estimate the break-even time.

We can use this as our baseline.

## Next

Instead of using an average of monthly values, we could train a model for each month of each station, and then use it for future predictions.