# Functions

## Imports

In [2]:
from pathlib import Path
import os
import re

from datetime import datetime

from math import sqrt

import pandas as pd
from pandas import DataFrame , concat
import numpy as np
from numpy import mean , concatenate
from numpy import array , hstack
from math import sqrt

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import figure as fig
from matplotlib.pylab import rcParams
import plotly.express as px
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import pmdarima as pm


from sklearn import decomposition
from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Activation, Dropout, Bidirectional, TimeDistributed

from keras.wrappers.scikit_learn import KerasRegressor

## Files & Directories

In [None]:
def get_files(directory, pattern='*', recursive=True):
    """Collect the files below ``directory`` whose path matches ``pattern``.

    Parameters
    ----------
    directory : pathlib.Path or str
        Folder to scan. Strings are converted to ``Path``.
    pattern : str, default '*'
        Glob-style pattern tested with ``Path.match`` against every file.
    recursive : bool, default True
        When truthy, sub-directories are scanned as well.

    Returns
    -------
    list of pathlib.Path
        The matching files, sorted.
    """
    directory = Path(directory)
    files = []
    for obj in directory.iterdir():
        if obj.is_file():
            if obj.match(pattern):
                files.append(obj)
        # Logical ``and`` (not bitwise ``&``) so that any truthy ``recursive``
        # value enables recursion; ``True & 2`` would evaluate to 0.
        elif recursive and obj.is_dir():
            files.extend(get_files(obj, pattern, recursive))
    return sorted(files)

In [None]:
def read_files(filedata):
    """Load several CSV files and return their hourly mean ``kw`` values.

    Each file is read with ``;`` as delimiter and ``,`` as decimal mark; the
    ``timestamp`` column is parsed as day-first dates. The rows of every file
    are grouped into one-hour bins on ``timestamp`` and the ``kw`` column is
    averaged per bin. The per-file results are then stacked in input order.

    Parameters
    ----------
    filedata : iterable
        Paths (or anything ``pandas.read_csv`` accepts) of the files to load.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` and ``kw`` with a fresh integer index.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when ``filedata`` is empty.
    """
    hourly_frames = []
    for filename in filedata:
        raw = pd.read_csv(
            filename,
            parse_dates=['timestamp'],
            dayfirst=True,
            delimiter=';',
            decimal=',',
        )
        # Lower-case 'h' is the non-deprecated spelling of the hourly alias;
        # the upper-case 'H' emits a FutureWarning on recent pandas versions.
        hourly = (
            raw.groupby([pd.Grouper(freq='1h', key='timestamp')])
            .kw.mean()
            .reset_index()
        )
        hourly_frames.append(hourly)
    return pd.concat(hourly_frames, axis=0, ignore_index=True)

In [None]:
def clean_c_names(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    For every column label, in this order:
      1. a whitespace character followed by a ``(...)`` or ``[...]`` group
         is removed,
      2. a decimal number followed by ``°``, one letter and a whitespace
         character is removed,
      3. each remaining run of whitespace is replaced by one underscore.
    """
    bracket_group = re.compile(r'\s[\(\[].*?[\)\]]')
    degree_prefix = re.compile(r'[0-9]*\.[0-9]+°[a-zA-Z]\s')
    blank_run = re.compile(r'\s+')
    substitutions = (
        (bracket_group, ""),
        (degree_prefix, ""),
        (blank_run, "_"),
    )

    def _tidy(label):
        # Apply the substitutions one after another to a single label.
        for regex, replacement in substitutions:
            label = regex.sub(replacement, label)
        return label

    df.columns = [_tidy(label) for label in df.columns.values]
    return df

In [None]:
def get_df_name(df):
    """Return the first module-level name that is bound to the object ``df``.

    The lookup is by identity (``is``) over this module's globals. An
    ``IndexError`` is raised when no global name refers to ``df``.
    """
    matching_names = [
        global_name
        for global_name, bound_object in globals().items()
        if bound_object is df
    ]
    return matching_names[0]

In [None]:
def printInfo(dataframe):
    """Print a short overview of ``dataframe`` to standard output.

    The overview lists the row count, the column count, the column labels,
    a per-column flag telling whether any value is null, and the number of
    unique values per column.
    """
    dims = dataframe.shape
    print("Rows     :", dims[0])
    print("Columns  :", dims[1])

    column_labels = dataframe.columns.to_list()
    print("\n Features \n", column_labels)

    has_missing = dataframe.isnull().any()
    print("\n Missing Values \n", has_missing)

    unique_counts = dataframe.nunique()
    print("\n Unique Values \n", unique_counts)

In [None]:
def test_stationarity(timeseries, window, cutoff):
    """Plot rolling statistics and report an augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries :
        Series to analyse; it must offer ``.rolling(window)`` and be
        accepted by ``adfuller`` and ``plt.plot``.
    window :
        Window passed to ``timeseries.rolling`` for the rolling mean and
        rolling standard deviation.
    cutoff :
        Threshold for the p-value; a p-value below it is reported as
        "likely stationary", otherwise "likely non-stationary".

    The function shows one figure, prints the verdict followed by the test
    summary, and returns nothing.
    """
    # Rolling statistics over the requested window.
    rolling_view = timeseries.rolling(window)
    rolling_mean = rolling_view.mean()
    rolling_std = rolling_view.std()

    # Original series together with its rolling mean and rolling std.
    plt.figure(figsize=(20, 10))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Dickey-Fuller test: AIC-based lag selection, at most 20 lags.
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC', maxlag=20)
    summary = pd.Series(
        adf_result[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)'%level] = critical_value

    pvalue = adf_result[1]
    if pvalue < cutoff:
        verdict = 'p-value = %.4f. The series is likely stationary.'
    else:
        verdict = 'p-value = %.4f. The series is likely non-stationary.'
    print(verdict % pvalue)

    print(summary)

In [None]:
def check_stationarity(y, lags_plots, figsize=(22,8)):
    """Plot diagnostics for a series and print an augmented Dickey-Fuller test.

    Parameters
    ----------
    y :
        Values to analyse; converted with ``pd.Series(y)``.
    lags_plots :
        Passed as ``lags`` to both ``plot_acf`` and ``plot_pacf``.
    figsize : tuple, default (22, 8)
        Passed to ``Series.plot`` for the top panel.

    The figure holds four panels: the series itself, its ACF, its PACF and
    a distribution plot. The test summary (rounded to four decimals) is
    printed, followed by a verdict comparing the test statistic with the
    5% critical value. Nothing is returned.
    """
    # Creating plots of the DF
    y = pd.Series(y)
    plt.figure()

    ax1 = plt.subplot2grid((3, 3), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((3, 3), (1, 0))
    ax3 = plt.subplot2grid((3, 3), (1, 1))
    ax4 = plt.subplot2grid((3, 3), (2, 0), colspan=2)

    y.plot(ax=ax1, figsize=figsize)
    ax1.set_title('Power Plant kw-Variation')
    plot_acf(y, lags=lags_plots, zero=False, ax=ax2)
    plot_pacf(y, lags=lags_plots, zero=False, ax=ax3)
    # NOTE(review): seaborn marks ``distplot`` as deprecated; kept as-is so the
    # rendered chart does not change.
    sns.distplot(y, bins=int(sqrt(len(y))), ax=ax4)
    ax4.set_title('Distribution Chart')

    plt.tight_layout()

    print('Results of Dickey-Fuller Test:')
    adfinput = adfuller(y)
    adftest = pd.Series(adfinput[0:4], index=['Test Statistic','p-value','Lags Used','Number of Observations Used'])
    adftest = round(adftest,4)

    for key, value in adfinput[4].items():
        adftest["Critical Value (%s)"%key] = value.round(4)

    print(adftest)

    # Look the two values up by label. Integer keys on a string-labelled
    # Series rely on positional fallback, which pandas has deprecated.
    test_statistic = adftest['Test Statistic']
    critical_5pct = adftest['Critical Value (5%)']

    if test_statistic.round(2) < critical_5pct.round(2):
        print('\nThe Test Statistics is lower than the Critical Value of 5%.\nThe serie seems to be stationary')
    else:
        print("\nThe Test Statistics is higher than the Critical Value of 5%.\nThe serie isn't stationary")