In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.metrics import mean_absolute_error
from solar_forecasting.data_sources.get_data import download_file
from transform_output_format import get_2D_output, get_4D_output

In [37]:
# Testing out python script: download the training archive to disk using the
# helper's default destination (the log below reports ../raw_data/data.npz).
download_file(download_to_disk = True)

Downloaded storage object X_train_copernicus.npz from bucket train_data_solar_project to local file ../raw_data/data.npz.


In [38]:
# Testing out python script with different arguments: same download, but the
# local target path is given explicitly instead of relying on the default.
download_file(
    download_to_disk = True, 
    destination_file_name = '../raw_data/baseline.npz')

Downloaded storage object X_train_copernicus.npz from bucket train_data_solar_project to local file ../raw_data/baseline.npz.


In [39]:
# Open the archive written by the download cell above; arrays are read by key.
data = np.load('../raw_data/baseline.npz')

In [41]:
# List the names of the arrays stored in the archive.
data.files

['datetime', 'GHI', 'CLS', 'SZA', 'SAA']

In [42]:
# Work on the first 300 entries of each variable only, which keeps the
# exploratory checks in the following cells manageable.
G, C, Z, A = (data[key][:300] for key in ('GHI', 'CLS', 'SZA', 'SAA'))

In [22]:
# Check that the GHI values are np.float64 and collect them, flattened, for
# the statistics dataframe.
#
# A NumPy array has a single dtype shared by all of its elements, so one dtype
# comparison replaces a per-element type() test. ravel() returns the values in
# C (row-major) order, i.e. the same order four nested index loops over
# G[x, y, z, t] would visit them, without millions of Python-level iterations.

# 1-D array of every GHI value; pd.DataFrame accepts it directly.
stat_ghi = G.ravel()

# Number of elements whose type is not np.float64 (0 or all of them).
total_ghi = 0 if G.dtype == np.float64 else G.size
total_ghi

0

In [23]:
# Check that the CLS values are np.float64 and collect them, flattened, for
# the statistics dataframe.
#
# The array's single dtype answers the type question for every element at
# once, and ravel() yields the values in C (row-major) order - the order nested
# loops over C[x, y, z, t] would produce - without a Python-level loop.

# 1-D array of every CLS value; pd.DataFrame accepts it directly.
stat_cls = C.ravel()

# Number of elements whose type is not np.float64 (0 or all of them).
total_cls = 0 if C.dtype == np.float64 else C.size
total_cls

0

In [24]:
# Check that the SZA values are np.float64 and collect them, flattened, for
# the statistics dataframe.
#
# The array's single dtype answers the type question for every element at
# once, and ravel() yields the values in C (row-major) order - the order nested
# loops over Z[x, y, z, t] would produce - without a Python-level loop.

# 1-D array of every SZA value; pd.DataFrame accepts it directly.
stat_sza = Z.ravel()

# Number of elements whose type is not np.float64 (0 or all of them).
total_sza = 0 if Z.dtype == np.float64 else Z.size
total_sza

0

In [25]:
# Check that the SAA values are np.float64 and collect them, flattened, for
# the statistics dataframe.
#
# The array's single dtype answers the type question for every element at
# once, and ravel() yields the values in C (row-major) order - the order nested
# loops over A[x, y, z, t] would produce - without a Python-level loop.

# 1-D array of every SAA value; pd.DataFrame accepts it directly.
stat_saa = A.ravel()

# Number of elements whose type is not np.float64 (0 or all of them).
total_saa = 0 if A.dtype == np.float64 else A.size
total_saa

0

In [26]:
# Create feature dataframes: one single-column frame per variable, with the
# column named after the variable whose flattened values it holds.

df_ghi = pd.DataFrame({'GHI': stat_ghi})
df_cls = pd.DataFrame({'CLS': stat_cls})
df_sza = pd.DataFrame({'SZA': stat_sza})
df_saa = pd.DataFrame({'SAA': stat_saa})

In [27]:
# Row count of the GHI frame: one row per flattened GHI value
# (the displayed 7873200 equals 300 * 4 * 81 * 81).
len(df_ghi)

7873200

In [28]:
# Row count of the CLS frame: the displayed value is twice the GHI count,
# consistent with CLS carrying 8 frames per sequence where GHI carries 4.
len(df_cls)

15746400

In [29]:
# Preview the first two GHI rows.
df_ghi.head(2)

Unnamed: 0,GHI
0,88.29
1,88.66


In [30]:
# Preview the first two CLS rows.
df_cls.head(2)

Unnamed: 0,CLS
0,99.87
1,100.27


In [36]:
# Create lagged dataset: pair every GHI value ('t+1') with the value found one
# row earlier ('t-1'); the first row has no predecessor and gets NaN.
# NOTE(review): df_ghi holds the array flattened element by element, so "one
# row earlier" is the neighbouring entry along the last array axis rather than
# necessarily the previous time step - confirm this is the intended lag.

lagged_ghi = pd.DataFrame({
    't-1': df_ghi['GHI'].shift(1),
    't+1': df_ghi['GHI'],
})
lagged_ghi.head(5)

Unnamed: 0,t-1,t+1
0,,88.29
1,88.29,88.66
2,88.66,89.0
3,89.0,89.38
4,89.38,89.68


In [107]:
# NOTE(review): train_test_split is imported here but is not used by any of the
# visible cells; imports normally belong in the first cell of the notebook.
from sklearn.model_selection import train_test_split

# Split into train and test sets

# Open the input archives and read the training targets from ../raw_data.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can
# execute arbitrary code - keep it only if this archive is trusted and needs it.
X_train = np.load('../raw_data/X_train_copernicus.npz', allow_pickle = True)
X_test = np.load('../raw_data/X_test_copernicus.npz')
# NOTE(review): no delimiter is passed, so genfromtxt splits on whitespace. The
# 1-D (1846,) shape printed further down suggests every CSV line was parsed as
# a single value rather than a row of columns - confirm before using y_train
# (the targets are re-read with pd.read_csv later in the notebook).
y_train = np.genfromtxt('../raw_data/y_train_zRvpCeO_nQsYtKN.csv')

In [105]:
# List the names of the arrays stored in the training archive.
X_train.files

['datetime', 'GHI', 'CLS', 'SZA', 'SAA']

In [13]:
# Compare the number of input sequences with the number of target entries.
# NOTE(review): the printed first dimensions differ by one (1845 vs 1846);
# presumably the extra y_train entry is the CSV header line - confirm.
print(np.shape(X_train['CLS']))
print(y_train.shape)

(1845, 8, 81, 81)
(1846,)


In [14]:
# Values per target sequence: 4 frames of 51 x 51, matching the 10404 value
# columns ('0' .. '10403') of the target CSV.
51 * 51 * 4

10404

In [10]:
# Read the training targets with pandas: per the preview below, one row per
# sequence with an id_sequence column followed by value columns '0'..'10403'.
df = pd.read_csv('../raw_data/y_train_zRvpCeO_nQsYtKN.csv')
df.head()

Unnamed: 0,id_sequence,0,1,2,3,4,5,6,7,8,...,10394,10395,10396,10397,10398,10399,10400,10401,10402,10403
0,0,247.51,261.68,260.16,262.71,264.28,262.55,257.21,228.88,230.02,...,379.39,379.73,380.07,380.4,380.74,381.08,381.41,381.75,382.08,382.42
1,1,471.37,476.41,474.55,477.96,479.87,477.65,476.65,482.58,484.8,...,501.29,501.67,502.06,502.45,502.83,503.22,503.6,503.98,504.37,504.75
2,2,458.39,463.38,461.54,464.88,466.76,464.57,463.57,469.42,471.61,...,386.68,387.06,387.43,387.8,388.18,388.55,388.92,389.3,389.67,390.04
3,3,223.42,228.37,226.91,229.26,230.75,229.11,228.31,232.8,234.44,...,92.52,92.76,93.0,93.24,93.48,93.73,93.97,94.21,94.45,94.69
4,4,170.23,137.61,136.96,84.42,54.23,53.53,50.55,70.92,74.24,...,382.01,382.16,382.3,382.44,382.59,382.73,382.88,383.03,383.17,383.32


In [19]:
# Convert the flat target table into a 4-D array with the project helper; the
# displayed shape is (1845, 4, 51, 51).
# NOTE(review): despite the _df suffix the result exposes an array-style 4-D
# shape - presumably a NumPy array; confirm against get_4D_output.
y_train_df = get_4D_output(df)
y_train_df.shape

(1845, 4, 51, 51)

In [53]:
# Scratch check of np.vstack on a tiny example: stacking a length-3 row onto a
# (2, 3) array gives a (3, 3) array.
# The example uses its own name so it does not overwrite `A`, which holds the
# SAA slice used by the dtype-check cell earlier in the notebook.
demo = np.array([[0, 1, 2], [0, 2, 0]])
newrow = [1, 2, 3]
demo = np.vstack([demo, newrow])
print(np.shape(demo))
print(np.shape(newrow))

(3, 3)
(3,)


In [59]:
# Inspect the first observation: the stack of frames, then a single frame.
# The observation is fetched once because every X_train['GHI'] lookup reads
# the whole array from the .npz archive again.
first_ghi = X_train['GHI'][0]
print(first_ghi.shape)
print(first_ghi[3].shape)

(4, 81, 81)
(81, 81)


In [77]:
# We want to add a layer that is identical to the previous layer
# Current: X_train['GHI'][0].shape = 4, 81, 81
# Target:  8, 81, 81 with layers 5 - 8 being identical to layer 4

# Fetch the first observation once: every X_train['GHI'] lookup re-reads the
# whole array from the .npz archive.
first_obs = X_train['GHI'][0]

# Temporary variable that will equal the new layer. It has a real name instead
# of `_`, which IPython reserves for the output of the previous cell.
last_layer = np.expand_dims(first_obs[3], axis = 0)

# Checking structure of where we are
print(np.shape(first_obs))
print(np.shape(last_layer))

# Stacking new layer onto original array
_new_layer = np.vstack((first_obs, last_layer))

# Double checking that new layer was added
print(np.shape(_new_layer))
print(np.shape(_new_layer[0]))

# Checking the contents
# print(_new_layer[2] - _new_layer[4])

(4, 81, 81)
(1, 81, 81)
(5, 81, 81)
(81, 81)


In [114]:
# Create a function to get 8 timestamps for each observation

def add_dumb_layers(observation, ghi=None, n_copies=4):
    '''Return the last GHI frame of one observation repeated ``n_copies`` times.

    This is the "dumb" persistence extension: the frames that would follow the
    observed ones are all set equal to the last observed frame. For the
    4-frame GHI observations used in this notebook the last frame is the frame
    at index 3.

    Parameters
    ----------
    observation : int
        Index of the observation along the first axis of the GHI array.
    ghi : array-like, optional
        GHI data indexed as ``ghi[observation]`` -> stack of 2-D frames.
        Defaults to ``X_train['GHI']``. Pass an already-loaded array when
        calling this in a loop: each ``X_train['GHI']`` lookup reads the whole
        array from the .npz archive again.
    n_copies : int, optional
        Number of repeated frames to return (default 4).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_copies, height, width)`` whose frames are all
        equal to the last frame of the selected observation.
    '''
    if ghi is None:
        # Single archive read per call.
        ghi = X_train['GHI']

    last_frame = np.asarray(ghi[observation])[-1]

    # Add a leading frame axis, then repeat along it.
    return np.repeat(last_frame[np.newaxis, ...], n_copies, axis=0)

In [115]:
# Smoke-test the helper on the first observation and inspect the result shape.
z = add_dumb_layers(0)
z.shape

(4, 81, 81)

In [116]:
# Array names available in the training archive.
X_train.files

['datetime', 'GHI', 'CLS', 'SZA', 'SAA']

In [122]:
# Build the persistence prediction for all observations at once: for every
# observation, the last GHI frame repeated four times.
#
# X_train is an .npz archive, and each X_train['GHI'] lookup reads the whole
# array from disk again, so the array is loaded a single time here. Slicing
# with -1: keeps the frame axis, which lets np.repeat expand it in one
# vectorised step with no per-observation loop. The output shape follows the
# data instead of a hard-coded observation count.

ghi_all = X_train['GHI']

# float64 matches the zero-initialised buffer this cell used to fill.
y_train_pred = np.repeat(ghi_all[:, -1:], 4, axis=1).astype(np.float64, copy=False)

np.shape(y_train_pred)

(1845, 4, 81, 81)

In [113]:
# Shape of the full GHI input array, for comparison with the prediction shape.
np.shape(X_train['GHI'])

(1845, 4, 81, 81)

In [21]:
# from sklearn.metrics import mean_squared_error

# def model_persistence(X):
#     return X

# # Validation

# predictions = list()

# for x in X_test:
# 	y_hat = model_persistence(x)
# 	predictions.append(y_hat)
 
# test_score = mean_squared_error(y_train_df, predictions)
# print('Test MSE: %.3f' % test_score)