# Visualize input and output tensors

In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
# from torchvision.utils import make_grid
import pandas as pd

from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = (12,4)
# matplotlib.rcParams['figure.facecolor'] = '#ffffff'

# Logging ML
from torch.utils.tensorboard import SummaryWriter
import wandb

In [2]:
# Run configuration: optimisation settings, model description and data selection.
config = {
    "learning_rate": 1e-3,
    # "batch_size": 128,
    "epochs": int(3e3),
    "model": "nn3",
    "layers": [256, 128, 128, 64],
    "training_loss": "MSE+tP",
    "alpha": 1e-6,
    # "scheduler": "one-cycle-lr",
    "years": ["2011", "2013", "2014"],  # years whose files form the training set
    "years_val": ["2012"],  # years whose files form the validation set
    "nodes": "37",  # inserted into the project and network names
}

# Notebook-level switches.
use_tb = False
use_wandb = True
manual_logging = False
check_data_with_plots = False  # enables the overlay plot at the end of the notebook

project_name = f"phd-ph5x-02-power_prediction_{config['nodes']}"

# Fix the torch RNG so that runs are repeatable.
random_seed = 746435
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7fd785cd9c30>

In [3]:
# Folder layout, expressed relative to the notebook location.
dir_root = Path("../")  # go to root of git repo
dir_data = dir_root / "data"
dir_data_ml = dir_data / "ml"
dir_models = dir_root / "models"
dir_runs = dir_root / "runs"
dir_runs_tb = dir_runs / "tb"
dir_runs_wandb = dir_root / "wandb"
param_save = "002_01_simplest"

network_name = f"elec_s_{config['nodes']}_ec_lcopt_Co2L-3H"

# Training files: one "3M" folder per training year, each with an inputs and an outputs pickle.
dir_training_set = [dir_data_ml / year / "3M" for year in config["years"]]
filenames_inputs_tr = [folder / f"{network_name}_inputs.P" for folder in dir_training_set]
filenames_outputs_tr = [folder / f"{network_name}_outputs_p.P" for folder in dir_training_set]

# Validation files: same layout, taken from the validation years.
dir_val_set = [dir_data_ml / year / "3M" for year in config["years_val"]]
filenames_inputs_val = [folder / f"{network_name}_inputs.P" for folder in dir_val_set]
filenames_outputs_val = [folder / f"{network_name}_outputs_p.P" for folder in dir_val_set]

# List every expected file that is absent; the closing line is printed in all cases.
all_expected_files = (
    filenames_inputs_tr
    + filenames_outputs_tr
    + filenames_inputs_val
    + filenames_outputs_val
)
for fn in all_expected_files:
    if fn.exists():
        continue
    print(f"{fn}: Missing")
print("Otherwise all files present")

Otherwise all files present


### Load data

In [4]:
def read_all_dfs(filenames):
    """Read every pickle in ``filenames`` and concatenate the results.

    Parameters
    ----------
    filenames : iterable of path-like
        Pickle files, loaded one by one with ``pd.read_pickle`` in the
        order given.

    Returns
    -------
    The result of ``pd.concat`` on the loaded objects. The original index
    values are kept (no ``ignore_index``).

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    frames = []
    for filename in filenames:
        frames.append(pd.read_pickle(filename))
    return pd.concat(frames)

# Load the pickled inputs/outputs of the training and the validation years.
df_input_tr, df_output_tr, df_input_val, df_output_val = (
    read_all_dfs(files)
    for files in (
        filenames_inputs_tr,
        filenames_outputs_tr,
        filenames_inputs_val,
        filenames_outputs_val,
    )
)

# Training and validation frames must expose the same columns in the same order.
assert (df_input_val.columns==df_input_tr.columns).all(), "Mismatch in input columns"
assert (df_output_val.columns==df_output_tr.columns).all(), "Mismatch in output columns"
input_features, output_features = df_input_val.columns, df_output_val.columns

# Convert each frame to a float32 tensor (rows = samples, columns = features).
x_train, y_train, x_val, y_val = (
    torch.from_numpy(frame.values.astype("float32"))
    for frame in (df_input_tr, df_output_tr, df_input_val, df_output_val)
)

# Problem dimensions.
n_samples_tr, n_input = x_train.shape
n_output = y_train.shape[1]
n_samples_val = x_val.shape[0]

# Normalisation statistics come from the training data only and are reused
# unchanged for the validation data.
# NOTE(review): a feature that is constant over the training set has std 0 and
# would turn into NaN/inf when divided below — confirm no such column exists.
x_mean = torch.mean(x_train, dim=0)
x_std = torch.std(x_train, dim=0)
y_mean = torch.zeros(n_output)  # centered already
y_std = torch.std(y_train, dim=0)


def x_norm(x):
    """Standardise inputs: subtract ``x_mean`` and divide by ``x_std``."""
    return (x-x_mean)/x_std


def y_norm(y):
    """Standardise outputs: subtract ``y_mean`` (all zeros) and divide by ``y_std``."""
    return (y-y_mean)/y_std


def x_renorm(x):
    """Undo ``x_norm``: scale by ``x_std`` and add ``x_mean``."""
    return x*x_std+x_mean


def y_renorm(y):
    """Undo ``y_norm``: scale by ``y_std`` and add ``y_mean``."""
    return y*y_std+y_mean


# Standardise all four tensors with the training-set statistics.
x_train = x_norm(x_train)
y_train = y_norm(y_train)

x_val = x_norm(x_val)
y_val = y_norm(y_val)

# Sanity check: the first 100 validation rows must not coincide with the first
# 100 training rows. The comparison uses the magnitude of the difference; a
# signed difference would treat every negative gap as "identical".
assert not ((x_val[0:100] - x_train[0:100]).abs() < 1e-5).all(), "Training data identical to validation data"


# Displayed as the cell output: feature counts and sample counts.
(n_input,n_output,n_samples_tr,n_samples_val)
# train_loader = load

(219, 37, 8760, 2928)

In [76]:
# Build a column permutation that groups the generator inputs by technology.
col = df_input_tr.columns

# Technology name = 4th space-separated token of each "generators ..." column.
generator_technology = col[col.str.startswith("generators")].str.split(" ").str[3]
# argsort returns positions *within the generator subset*, ordered by technology name.
# NOTE(review): the default sort is not guaranteed to be stable, so the order of
# columns that share a technology may be arbitrary — confirm this is acceptable.
sorted_gen = generator_technology.argsort()

# Number of columns whose name starts with "gen".
ngen = col.str.startswith("gen").sum()

# NOTE(review): appending range(ngen, len(col)) assumes the generator columns
# occupy the first ngen positions of the array this index is applied to — verify.
remaining_positions = list(range(ngen, len(col)))
index_sorted_technologies = np.concatenate([sorted_gen, remaining_positions])

In [77]:
# Show the permutation: generator positions ordered by technology, then the remaining positions.
index_sorted_technologies 

array([  0,  94,  47,  89,  84,  25,  76,  98,  71,  66,  61,  34,  56,
        38,  51,  30, 106,  42, 111,  14, 117, 148, 121,   9, 126, 143,
       131, 135,  99,  48, 144,  52,  57, 136, 127, 107,  67,  72, 122,
        85, 118,  62,  90,  77,  26,  10,  31,  39,  15,  35,  43,  81,
       140,  86,  22, 137,  91,   4,  95,  19,  44,  78,  16, 108, 132,
       112, 115, 119, 128,  11, 123, 100, 103,   7,   1,  32,  63,  58,
       145,  36,  68,  53,  73, 149,  40,  27,  49, 152,  54, 109,  59,
       113,  17, 133, 104, 150, 101,  74,  96,   5,  45, 153,  28,  79,
       141,  82,  69, 124,  23,  87, 138,   2,  64,  92,  20, 129, 146,
        12, 151, 147,   6, 134,   3, 142, 130, 139,  18, 125,  46,  41,
        50,  37,  55,  60,  33,  65,  29,  70,  75,  80,  83,  24,  88,
        21,  93,  97, 102, 105, 110, 114, 116,  13, 120,   8, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [None]:
import matplotlib.cm as cm
import matplotlib.gridspec as grd


# Window of training samples shown in the heat map: N rows starting at row 3000.
N = 3000
x = x_train[3000:3000 + N]
plt.rcParams['figure.dpi'] = 300

# Figure with one wide panel and one narrow side panel (width ratio 20:1).
fig = plt.figure(figsize=(14, 9))
gs = grd.GridSpec(1, 2, width_ratios=[20, 1])  # , wspace=0.1)

# One Axes per grid cell; ax[0] is the wide (image) panel.
ax = list(map(plt.subplot, gs))


In [80]:


# Last space-separated token of every input column name, in plotting order:
# columns are first put in sorted name order, then permuted by index_sorted_technologies.
sorted_name_order = df_input_tr.columns.argsort()
columns_in_plot_order = col[sorted_name_order][index_sorted_technologies]
input_tech = columns_in_plot_order.str.split(" ").str[-1]
input_tech


Index(['offwind-ac', 'offwind-ac', 'offwind-ac', 'offwind-ac', 'offwind-ac',
       'offwind-ac', 'offwind-ac', 'offwind-ac', 'offwind-ac', 'offwind-ac',
       ...
       'hydro', 'hydro', 'hydro', 'hydro', 'hydro', 'hydro', 'hydro', 'hydro',
       'hydro', 'hydro'],
      dtype='object', length=219)

In [None]:
import copy  # local import: only needed to copy the colormap below

# Heat map of the selected training window. Columns (x-axis) are the input
# features in plotting order, rows (y-axis) are the samples of `x`.

# Number of columns per label, in the order the labels appear in `input_tech`.
a = input_tech.value_counts(sort=False).values
x_ticks_major = a.cumsum()            # right edge of every label group
x_ticks_minor = x_ticks_major - a/2   # centre of every label group

# Human-readable group names; '0' and "hydro" are relabelled for the plot.
x_labels_major = input_tech.to_series().replace({'0':'load',"hydro":"hydro:inflow"}).unique()

# Work on a copy: set_bad() would otherwise modify the globally registered
# "magma" colormap for every later plot in this kernel.
cmap = copy.copy(cm.magma)
cmap.set_bad('k',1.)

# imshow maps array rows to the y-axis and array columns to the x-axis. `x` is
# (samples, features), so it is passed untransposed: the features then run
# along the x-axis, which is where the technology ticks below are placed.
features_in_plot_order = x[:,df_input_tr.columns.argsort()][:,index_sorted_technologies]
ax[0].imshow(features_in_plot_order, cmap=cmap, aspect='auto')
ax[0].xaxis.tick_top()

# Minor ticks: group sizes, printed at the group centres.
ax[0].set_xticks(x_ticks_minor, minor=True)
ax[0].set_xticklabels(a, minor=True,size=8)

# Major ticks: group names, anchored at the right edge of each group.
ax[0].set_xticks(x_ticks_major, minor=False)
ax[0].set_xticklabels(x_labels_major, minor=False,rotation=-20)
for label in ax[0].get_xticklabels():
    label.set_horizontalalignment('right')
ax[0].set_ylabel("Sample index within the plotted window")
ax[0].tick_params('x', length=5, width=2, which='major')
ax[0].tick_params('x', length=0, width=1, which='minor')

# ax[1], the narrow side panel, is not drawn on in this cell.
fig.tight_layout()

# The figure was created in an earlier cell; with the inline backend it is
# presumably no longer shown automatically, so display it explicitly.
fig


In [None]:
# Optional visual sanity check: overlay the normalised training/validation
# inputs and outputs, each in its own colour.
if check_data_with_plots:
    # Use the same number of rows for every series.
    n_min = min(n_samples_val, n_samples_tr)
    for series, colour in ((x_train, "r"), (y_train, "b"), (x_val, "m"), (y_val, "c")):
        _ = plt.plot(series[:n_min], colour, alpha=0.1)  # [:,38:192]
