In [7]:
import argparse
import os
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as mlt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sp
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from torch import autograd
from torch.autograd import Variable

In [5]:
class PrepareDatForInput():
    """Build sliding-window inputs and forecasting targets from an indexable sequence.

    ``get_train_test_dataset_forecasting`` reads the window length from
    ``model_parameter.lag_window`` and the forecast horizon from
    ``model_parameter.future_step``.
    """

    def __init__(self, model_parameter):
        """Store the configuration object and select a torch device.

        The device is ``cuda:0`` when CUDA is available, otherwise ``cpu``.
        """
        use_gpu = torch.cuda.is_available()
        self.device = torch.device("cuda:0" if use_gpu else "cpu")
        self.model_parameter = model_parameter

    def filter_data_by_lag_size(self, dataset, lag):
        """Return, in order, the entries of ``dataset`` whose length equals ``lag``."""
        entries = (dataset[pos] for pos in range(len(dataset)))
        return [entry for entry in entries if len(entry) == lag]

    def input_transform(self, real, lag, future_step = 0):
        """Stack every run of ``lag`` consecutive items of ``real`` into one array.

        Window start positions are ``range(len(real) - lag - future_step)``.
        The windows are passed through ``filter_data_by_lag_size`` and then
        combined with ``np.stack``.
        """
        n_windows = len(real) - lag - future_step
        windows = [
            [real[pos] for pos in range(start, start + lag)]
            for start in range(n_windows)
        ]
        return np.stack(self.filter_data_by_lag_size(windows, lag))

    def train_test_split(self, X, train_ratio):
        """Split ``X`` in order at index ``int(len(X) * train_ratio)``.

        Returns:
            ``(X_train, X_test)`` where ``X_train`` is the leading part and
            ``X_test`` is the remainder; no shuffling is performed.
        """
        cut = int(len(X) * train_ratio)
        return X[:cut], X[cut:]

    def get_forecasting_ground_truth_data(self, load, window, num):
        """Stack, for each window start, the ``num`` items that follow the window.

        For start ``i`` in ``range(len(load) - window - num)`` the target is
        ``load[i + window] ... load[i + window + num - 1]``.
        """
        n_targets = len(load) - window - num
        horizons = [
            [load[pos] for pos in range(start + window, start + window + num)]
            for start in range(0, n_targets)
        ]
        return np.stack(horizons)

    def get_train_test_dataset_forecasting(self, df, train_test_ratio):
        """Create windowed inputs and targets from ``df`` and split both.

        Returns:
            ``(data_train, data_test, ground_truth_train, ground_truth_test)``,
            each pair produced by ``train_test_split`` with ``train_test_ratio``.
        """
        lag = self.model_parameter.lag_window
        windows = self.input_transform(df, lag, self.model_parameter.future_step)
        targets = self.get_forecasting_ground_truth_data(
            df, self.model_parameter.lag_window, self.model_parameter.future_step
        )
        windows_train, windows_test = self.train_test_split(windows, train_test_ratio)
        targets_train, targets_test = self.train_test_split(targets, train_test_ratio)

        return windows_train, windows_test, targets_train, targets_test

In [11]:
class GetPcaTransformedData():
    """Min-max scale a table, project it with PCA and save explained-variance plots."""

    def __init__(self):
        """Select ``cuda:0`` when CUDA is available, otherwise ``cpu``."""
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def transform_data(self, df, columns, n_components=5, target_column='Energy'):
        """Scale ``df``, fit PCA on it and return the components plus the target column.

        Every column of ``df`` (including ``target_column``) is min-max scaled
        to ``(0, 1)`` and fed to PCA. The explained-variance plots are written
        by ``visualize_pca_components`` before the reduced PCA is fitted, and
        the eigenvalues / eigenvectors of the reduced PCA are printed.

        Args:
            df: Table accepted by ``MinMaxScaler.fit_transform``.
            columns: Column labels for the scaled data; must contain
                ``target_column``.
            n_components: Passed to ``PCA(n_components=...)``. Defaults to 5,
                the value that was previously hard-coded.
            target_column: Name of the (scaled) column appended to the
                component columns. Defaults to ``'Energy'``.

        Returns:
            A DataFrame with columns ``PC1 .. PCk`` followed by the scaled
            ``target_column``, where ``k`` is the number of components PCA
            actually produced.

        Raises:
            KeyError: If ``target_column`` is not among ``columns``.
        """
        scaler = MinMaxScaler(feature_range=(0,1))
        scaled = scaler.fit_transform(df)

        self.visualize_pca_components(scaled)

        pca = PCA(n_components=n_components)
        pca.fit(scaled)

        eigenvalues = pca.explained_variance_

        # Get eigenvectors
        eigenvectors = pca.components_
        print(eigenvalues)
        print(eigenvectors)
        pca_transformed_data = pca.transform(scaled)

        # Derive the labels from the transformed width so they can never get
        # out of sync with the requested number of components.
        component_labels = [
            'PC{}'.format(k) for k in range(1, pca_transformed_data.shape[1] + 1)
        ]

        scaled_frame = pd.DataFrame(scaled, columns=columns)
        df_transformed = pd.DataFrame(data=pca_transformed_data, columns=component_labels)
        df_final = pd.concat([df_transformed, scaled_frame[target_column]], axis=1)

        return df_final

    def _annotate_values(self, values):
        """Label each point of ``values`` on the current axes with its value rounded to 3 decimals."""
        for position, label in enumerate(np.round(values, 3), start=1):
            plt.annotate(label, (position, values[position - 1]), fontsize=8, ha='center', va='bottom')

    def visualize_pca_components(self, df, output_dir='../Results/pca_analysis'):
        """Fit a full PCA on ``df`` and save a Pareto plot and a scree plot.

        Args:
            df: Data accepted by ``PCA().fit``.
            output_dir: Directory that receives ``pareto.png`` and
                ``scree.png``. It is created if it does not exist. Defaults
                to the previously hard-coded ``'../Results/pca_analysis'``.
        """
        pca = PCA()
        pca.fit(df)

        # savefig does not create missing directories, so make sure the
        # target exists before any figure is written.
        os.makedirs(output_dir, exist_ok=True)

        explained_variance_ratio = pca.explained_variance_ratio_
        cumulative_explained_variance = np.cumsum(explained_variance_ratio)
        component_numbers = range(1, len(explained_variance_ratio) + 1)

        # Pareto Plot
        plt.figure(figsize=(10, 6))
        plt.bar(component_numbers, explained_variance_ratio, alpha=0.5, align='center', label='Individual Explained Variance')
        plt.plot(component_numbers, cumulative_explained_variance, marker='o', color='r', label='Cumulative Explained Variance')
        plt.xlabel('Principal Components')
        plt.ylabel('Explained Variance Ratio')
        plt.title('Pareto Plot')
        plt.grid(True)
        self._annotate_values(explained_variance_ratio)
        self._annotate_values(cumulative_explained_variance)
        plt.legend()
        plt.savefig(os.path.join(output_dir, 'pareto.png'))

        # Scree Plot
        # Figures are left open after saving, exactly as before, so callers
        # that display them afterwards are unaffected.
        plt.figure(figsize=(10, 6))
        plt.plot(component_numbers, explained_variance_ratio, marker='o')
        plt.xlabel('Principal Components')
        plt.ylabel('Explained Variance Ratio')
        plt.title('Scree Plot')
        plt.grid(True)
        self._annotate_values(explained_variance_ratio)
        plt.savefig(os.path.join(output_dir, 'scree.png'))