This notebook contains the experimental pipeline for temporal data of MS stations. The main goal is to train regression models based on the features of current and voltage. The prediction goal is the active and reactive power. The first half of the notebook contains pre-processing. Ranging from importing the code, removing excess features, converting date times to an interger time stamp, imputing NaNs, and adding lagged measurements for the current. 

In [15]:
import pyarrow.feather as feather
import pandas as pd
from datetime import datetime
import numpy as np
import math
import time

import sklearn.gaussian_process as gp
from sklearn.gaussian_process import GaussianProcessRegressor
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

In [2]:
df = feather.read_feather('ems_metingen_pqi.feather')
df['DATUM_TIJD'] = pd.to_datetime(df['DATUM_TIJD'])

# preprocessing
# uncomment this part of the code if you want to add the in_uitgaand features to the ems dataset

# df_excel = pd.read_excel("ems_metingen_in_uitgaand.xlsx")
# remove_lst = [3,4,5]
# df_excel = df_excel.drop(df_excel.columns[remove_lst], axis = 1)
# df_excel = df_excel.replace("Uit", 1)
# df_excel = df_excel.replace("In", -1)
# df_excel = df_excel.replace("nvt", 0)
# df_excel = df_excel.replace("defect", np.NAN)
# df = pd.merge(df, df_excel, on=['TA_B1_NAME', 'TA_B2_NAME', 'TA_B3_NAME'], how='left')


# Dropping columns
remove_list = [0,1,3,5,6,7,8,10,11,12,13,14,15,16]
for index in remove_list:
    print('column removed ->', df.columns[index])
df = df.drop(df.columns[remove_list], axis=1)

column removed -> ROUTE_ID
column removed -> ROUTE_NAAM
column removed -> TA_B2_NAME
column removed -> M_POINT_P
column removed -> M_POINT_Q
column removed -> M_POINT_I
column removed -> M_TIMESTAMP
column removed -> FLAG_P
column removed -> FLAG_Q
column removed -> FLAG_I
column removed -> FLAG_MEETFOUT
column removed -> FLAG_SCHAKEL_EVENT
column removed -> SCHEMA_MS_VELD_ID
column removed -> UPDATE_DATUMTIJD


In [13]:
temp_df = df[df["TA_B1_NAME"]=="HrvH"]
temp_df = temp_df[temp_df["DATUM_TIJD"]=="2021-05-9 16:50:00"]
lst = temp_df["M_VALUE_I"].to_list()
print(df["TA_B1_NAME"].unique())
lst = [abs(x) for x in lst]
print(sum(lst))
temp_df

['HrvH' 'Nk' 'Grd' 'Dtn' 'HFDP' 'Dvd-RS' 'Ns' 'Tex' 'Lw' 'Hby']
463.96000000000004


Unnamed: 0,TA_B1_NAME,TA_B3_NAME,DATUM_TIJD,M_VALUE_P,M_VALUE_Q,M_VALUE_I,BEDRIJFSSPANNING
3990289,HrvH,V103,2021-05-09 16:50:00,0.0,0.0,0.0,0.021
4962775,HrvH,V104,2021-05-09 16:50:00,0.0,0.0,0.0,0.021
5769635,HrvH,V105,2021-05-09 16:50:00,-2.83,-0.4,78.51,0.021
5914731,HrvH,V108,2021-05-09 16:50:00,-2.83,-0.4,80.1,0.021
6562145,HrvH,V101,2021-05-09 16:50:00,0.0,0.0,0.0,0.021
6693795,HrvH,V107,2021-05-09 16:50:00,0.0,0.0,0.0,0.021
7025049,HrvH,V111,2021-05-09 16:50:00,-2.82,-0.4,78.08,0.021
7687412,HrvH,V110,2021-05-09 16:50:00,2.52,0.4,71.53,0.021
8279185,HrvH,V109,2021-05-09 16:50:00,1.55,0.26,42.79,0.021
8320851,HrvH,V112,2021-05-09 16:50:00,1.55,0.27,43.74,0.021


In [32]:
''' Calculates current from P, Q, and U'''
def calculate_I(P,Q,U):
    return np.sqrt(pow(P,2) + pow(Q,2)/(U*np.sqrt(3)))

''' boolean to check if only I is NaN '''
def only_I_is_nan(P, Q, I):
    value = (not np.isnan(P) and not np.isnan(Q)) and np.isnan(I)
    return value

''' 
    fills the NaN values in the dataframe.
    First compute I where able (If P, Q, and U are known),
    then impute the rest of the NaNs with given means for P, Q, and I
'''
def fill_nan(temp_df, mean_P, mean_Q, mean_I):
    # Calculate I from P and Q only if I is NaN and P and Q are not NaNs
    temp_df["M_VALUE_I"] = temp_df.apply(lambda x: calculate_I(x["M_VALUE_P"], x["M_VALUE_Q"], x["BEDRIJFSSPANNING"]) if only_I_is_nan(x["M_VALUE_P"], x["M_VALUE_Q"], x["M_VALUE_I"]) else x["M_VALUE_I"], axis=1)

    temp_df["M_VALUE_P"] = temp_df["M_VALUE_P"].fillna(value= mean_P)
    temp_df["M_VALUE_Q"] = temp_df["M_VALUE_Q"].fillna(value= mean_Q)
    temp_df["M_VALUE_I"] = temp_df["M_VALUE_I"].fillna(value= mean_I)
    return temp_df

In [33]:
''' converts a time amount in 'nanoseconds' to an amount in '5 minutes ''' 
def ns_to_5m(x):
    return x/(pow(10,9)*60*5)

In [34]:
df = df.rename(columns={"TA_B1_NAME": "STATION"})
df = df.rename(columns={"TA_B3_NAME": "FIELD"})

# sort values on "DATUM_TIJD"
df = df.sort_values("DATUM_TIJD")

# .value returns time in nanoseconds, starting form unix time.
# Get starting timestamp and convert this to '5 minutes'
start_time = ns_to_5m(df["DATUM_TIJD"].iloc[1].value)

# Convert each DATETIME timestamp to a float value representing the amount of 5 minutes since start time
df["DATUM_TIJD"] = df['DATUM_TIJD'].apply(lambda x: ns_to_5m(x.value)-start_time)

df = df.reset_index(drop=True)

In [35]:
df

Unnamed: 0,STATION,FIELD,DATUM_TIJD,M_VALUE_P,M_VALUE_Q,M_VALUE_I,BEDRIJFSSPANNING
0,Tex,Tr1,0.0,0.00,0.00,,0.0105
1,Nk,2.11,0.0,0.00,0.00,0.56,0.0105
2,Dvd-RS,V309,0.0,0.92,0.53,58.63,0.0105
3,Dvd-RS,V301,0.0,0.23,0.01,12.10,0.0105
4,Dvd-RS,V307,0.0,0.27,0.02,14.37,0.0105
...,...,...,...,...,...,...,...
14023790,Ns,TR1,105119.0,-8.14,1.96,1026.41,0.0105
14023791,Nk,2.07,105119.0,0.00,0.00,0.16,0.0105
14023792,Nk,2.08,105119.0,0.29,-0.10,18.38,0.0105
14023793,HFDP,V301,105119.0,0.01,-0.23,0.00,0.0210


In [22]:
mean_P = df["M_VALUE_P"].mean()
mean_Q = df["M_VALUE_Q"].mean()
mean_I = df["M_VALUE_I"].mean()
df = fill_nan(df, mean_P, mean_Q, mean_I)

In [23]:
df

Unnamed: 0,STATION,FIELD,DATUM_TIJD,M_VALUE_P,M_VALUE_Q,M_VALUE_I,BEDRIJFSSPANNING
0,Tex,Tr1,0.0,0.00,0.00,0.00,0.0105
1,Nk,2.11,0.0,0.00,0.00,0.56,0.0105
2,Dvd-RS,V309,0.0,0.92,0.53,58.63,0.0105
3,Dvd-RS,V301,0.0,0.23,0.01,12.10,0.0105
4,Dvd-RS,V307,0.0,0.27,0.02,14.37,0.0105
...,...,...,...,...,...,...,...
14023790,Ns,TR1,105119.0,-8.14,1.96,1026.41,0.0105
14023791,Nk,2.07,105119.0,0.00,0.00,0.16,0.0105
14023792,Nk,2.08,105119.0,0.29,-0.10,18.38,0.0105
14023793,HFDP,V301,105119.0,0.01,-0.23,0.00,0.0210


In [24]:
# df = df[df["IN_UITGAAND"].notna()]

# df_x = df[["STATION", "FIELD", "DATUM_TIJD", "BEDRIJFSSPANNING", "IN_UITGAAND", "M_VALUE_I"]]
df_x = df[["STATION", "FIELD", "DATUM_TIJD", "BEDRIJFSSPANNING", "M_VALUE_I"]]
df_y = df[["STATION", "FIELD", "DATUM_TIJD", "M_VALUE_P", "M_VALUE_Q"]]

In [25]:
''' 
    df: pandas dataframe, amount_of_lag: int
    Adds amount_of_lag lagged columns to the dataframe df,
    such that each lagged column holds the field values from a previous timestamp.
    It is possible to create leading columns instead of lagging columns by passing
    amount_of_lag a negative value.

'''
def add_lag(df, amount_of_lag):
    lst = []
    
    for station in df["STATION"].unique():
        temp_station_df = df[df["STATION"] == station].copy(deep=True)
        
        for field in temp_station_df["FIELD"].unique():
            temp_field_df = temp_station_df[temp_station_df["FIELD"] == field].copy(deep=True)
            
            for lag in range(1,amount_of_lag):
                temp_field_df["lag_%s" % (lag)] = temp_field_df["M_VALUE_I"].shift(lag)

            lst.append(temp_field_df.copy(deep=True))
            lagged_df = pd.concat(lst)
           
    return lagged_df


df_x = add_lag(df_x, 12)

In [26]:
df_x

Unnamed: 0,STATION,FIELD,DATUM_TIJD,BEDRIJFSSPANNING,M_VALUE_I,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11
0,Tex,Tr1,0.0,0.0105,0.000000,,,,,,,,,,,
183,Tex,Tr1,1.0,0.0105,0.000000,0.000000,,,,,,,,,,
359,Tex,Tr1,2.0,0.0105,0.000000,0.000000,0.000000,,,,,,,,,
501,Tex,Tr1,3.0,0.0105,0.000000,0.000000,0.000000,0.000000,,,,,,,,
609,Tex,Tr1,4.0,0.0105,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14023194,Lw,INSTAL1,105115.0,0.0105,62.470628,61.484648,60.960535,60.323847,59.439029,58.776076,58.497215,57.222672,55.735906,57.342389,53.629411,54.951889
14023316,Lw,INSTAL1,105116.0,0.0105,63.444266,62.470628,61.484648,60.960535,60.323847,59.439029,58.776076,58.497215,57.222672,55.735906,57.342389,53.629411
14023463,Lw,INSTAL1,105117.0,0.0105,63.478481,63.444266,62.470628,61.484648,60.960535,60.323847,59.439029,58.776076,58.497215,57.222672,55.735906,57.342389
14023559,Lw,INSTAL1,105118.0,0.0105,62.741949,63.478481,63.444266,62.470628,61.484648,60.960535,60.323847,59.439029,58.776076,58.497215,57.222672,55.735906


In [27]:
# Calculates the dead space, or the amount of 0s present in the dataset
def calc_percentage_dead_space(Xs):
    dead_space = np.zeros(len(Xs))
    total = Xs[0].stack().value_counts().sum()
    for index, df in enumerate(Xs):
        zeros = df.stack().value_counts()[0]
        dead_space[index] = zeros/total
    return dead_space



# input - df: a Dataframe, chunkSize: the chunk size
# output - a list of DataFrame
# purpose - splits the DataFrame into smaller chunks
def split_dataframe(df, chunk_size, limit): 
    chunks = list()
    df = df[df["DATUM_TIJD"] <= limit]
    entries_per_5_min = df[df["DATUM_TIJD"]==0].shape[0]
    print(chunk_size)
    num_chunks = len(df) // (chunk_size*entries_per_5_min) + 1
    print(num_chunks)
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks
    

# splits the data on stations, creating a data subset for each station
# amount of data taken from each station specified with weeks
# limit is the amount of samples desired per split. Useful for quick test runs
def split_on_stations(df_IU, df_PQ, weeks, limit=0):
    if limit == 0:
        limit = df_IU.shape[0]
    
    df_IU = df_IU[df_IU["DATUM_TIJD"] <= limit]
    df_PQ = df_PQ[df_PQ["DATUM_TIJD"] <= limit]
    
    df_IU = reduce_timescale(df_IU, weeks)
    df_PQ = reduce_timescale(df_PQ, weeks)
    
    stations = df_IU["STATION"].unique()

    dfs_IU = []
    dfs_PQ = []

    for station in stations:
        temp_IU = df_IU[df_IU["STATION"]==station]
        temp_PQ = df_PQ[df_PQ["STATION"]==station]

        del temp_PQ["DATUM_TIJD"]
        del temp_IU["STATION"], temp_PQ["STATION"]
        del temp_IU["FIELD"], temp_PQ["FIELD"]
        
        temp_PQ#.reset_index(drop = True, inplace = True)
        temp_IU#.reset_index(drop = True, inplace = True)

        dfs_IU.append(temp_IU)
        dfs_PQ.append(temp_PQ)
    
    return dfs_IU, dfs_PQ


# splits the data in equal parts of depending on the desired amount of weeks
# limit is the amount of samples desired per split. Useful for quick test runs
def split_on_weeks(df_IU, df_PQ, weeks, limit=0):   
    if limit == 0:
        limit = df_IU.shape[0]
    
    dfs_IU = split_dataframe(df_IU, weeks_to_5_mins(weeks), limit)
    dfs_PQ = split_dataframe(df_PQ, weeks_to_5_mins(weeks), limit)
    
    for index in range(len(dfs_IU)):
        temp_IU = dfs_IU[index]#.reset_index(drop=True)
        temp_PQ = dfs_PQ[index]#.reset_index(drop=True)
        
        del temp_PQ["DATUM_TIJD"]
        del temp_IU["STATION"], temp_PQ["STATION"]
        del temp_IU["FIELD"], temp_PQ["FIELD"]
        
        dfs_IU[index] = temp_IU
        dfs_PQ[index] = temp_PQ
    
    return dfs_IU, dfs_PQ
    

# performs cross validation on the split datasets of Xs and ys.
# Metrics preserved are sign accuracy and mse, which are returned in a 2D result matrix
def cross_validation(Xs, ys):
    matrix = np.zeros((len(ys),len(ys),2))
    alpha = 10
    for x_index, X_train in enumerate(Xs):
        y_train = ys[x_index]
        for y_index, y_test in enumerate(ys):
            print("X: {0}, Y: {1}".format(x_index, y_index))
            X_test = Xs[y_index]
            df_predict = GaussianProcess_regression(X_train, y_train, X_test)
            
            P,_ = predict_sign(df_predict, y_test)
            mse = mean_squared_error(y_test, df_predict)
            
            matrix[x_index, y_index, 0] = P
            matrix[x_index, y_index, 1] = mse
            
    return matrix

In [28]:
df_x.sort_index(inplace=True)
df_x.fillna(0, inplace=True)

In [29]:
# Reduce timescale

# calculates the amount of 5 minutes present in an amount of weeks
def weeks_to_5_mins(weeks):
    return 12*24*7*weeks

# reduce the timescale of the data set to be less than the specified amount of weeks
def reduce_timescale(df, weeks):
    weeks = weeks_to_5_mins(weeks)
    df = df[df["DATUM_TIJD"] <= weeks]
    return df

In [30]:
# alpha = 10
def GaussianProcess_regression(X_train, y_train, X_test, alpha=10):
    model = GaussianProcessRegressor(alpha = alpha)
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_train.columns, dtype = float)
    return df_predict


def XGboost_regression(X_train, y_train, X_test):
    model = XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_train.columns, dtype = float)
    return df_predict


In [31]:
'''
    df_predict: pandas dataframe, y_test: pandas dataframe
    Compares the dataframe of predictions with the dataframe containing the true
    values. Computes the accuracy based sign prediction (+ or -) of P and Q and returns it.
'''

def predict_sign(df_predict, y_test):
    df_sign = df_predict.copy()
    df_y_sign = y_test.copy()
    for col in df_sign.columns:
        df_sign[col] = df_sign[col].apply(lambda x: -1 if x<0 else 1)
        df_y_sign[col] = df_y_sign[col].apply(lambda x: -1 if x<0 else 1)

    df_new_sign = df_sign == df_y_sign
    
    field_accuracies = []
    for col in df_new_sign.columns:
        field_accuracies.append(df_new_sign[col].value_counts(normalize=True).values[0])

    P_accuracies = field_accuracies[::2]
    Q_accuracies = field_accuracies[1::2]
    
    P_avg = sum(P_accuracies)/len(P_accuracies)
    Q_avg = sum(Q_accuracies)/len(Q_accuracies)
    
    return P_avg, Q_avg

## Cross validation

cross validation is done by selecting the train-test split (either split_on_weeks or split_on_stations) and then within the function cross_validation to change your machine learning model (either XGboost_regression or GaussianProcess_regression). This will perform cross validation and save the results of each train-test combination in a result matrix.

In [23]:
weeks = 2
st = time.time()
limit = 0

Xs, ys = split_on_weeks(df_x, df_y, weeks, limit)
matrix = cross_validation(Xs, ys)

et = time.time()
print("Time ran: ", et - st)


X: 0, Y: 0
X: 0, Y: 1
X: 0, Y: 2
X: 0, Y: 3
X: 0, Y: 4
X: 0, Y: 5
X: 0, Y: 6
X: 0, Y: 7
X: 0, Y: 8
X: 0, Y: 9
X: 1, Y: 0
X: 1, Y: 1
X: 1, Y: 2
X: 1, Y: 3
X: 1, Y: 4
X: 1, Y: 5
X: 1, Y: 6
X: 1, Y: 7
X: 1, Y: 8
X: 1, Y: 9
X: 2, Y: 0
X: 2, Y: 1
X: 2, Y: 2
X: 2, Y: 3
X: 2, Y: 4
X: 2, Y: 5
X: 2, Y: 6
X: 2, Y: 7
X: 2, Y: 8
X: 2, Y: 9
X: 3, Y: 0
X: 3, Y: 1
X: 3, Y: 2
X: 3, Y: 3
X: 3, Y: 4
X: 3, Y: 5
X: 3, Y: 6
X: 3, Y: 7
X: 3, Y: 8
X: 3, Y: 9
X: 4, Y: 0
X: 4, Y: 1
X: 4, Y: 2
X: 4, Y: 3
X: 4, Y: 4
X: 4, Y: 5
X: 4, Y: 6
X: 4, Y: 7
X: 4, Y: 8
X: 4, Y: 9
X: 5, Y: 0
X: 5, Y: 1
X: 5, Y: 2
X: 5, Y: 3
X: 5, Y: 4
X: 5, Y: 5
X: 5, Y: 6
X: 5, Y: 7
X: 5, Y: 8
X: 5, Y: 9
X: 6, Y: 0
X: 6, Y: 1
X: 6, Y: 2
X: 6, Y: 3
X: 6, Y: 4
X: 6, Y: 5
X: 6, Y: 6
X: 6, Y: 7
X: 6, Y: 8
X: 6, Y: 9
X: 7, Y: 0
X: 7, Y: 1
X: 7, Y: 2
X: 7, Y: 3
X: 7, Y: 4
X: 7, Y: 5
X: 7, Y: 6
X: 7, Y: 7
X: 7, Y: 8
X: 7, Y: 9
X: 8, Y: 0
X: 8, Y: 1
X: 8, Y: 2
X: 8, Y: 3
X: 8, Y: 4
X: 8, Y: 5
X: 8, Y: 6
X: 8, Y: 7
X: 8, Y: 8
X: 8, Y: 9
X: 9, Y: 0

In [24]:
# add column and row names for crossvalidation
lst = np.arange(matrix.shape[0])
cols = ["trained_on_" + str(number) for number in lst]
rows = ["tested_on_" + str(number) for number in lst]

pd_crossval_stations = pd.DataFrame(matrix[:,:,0].T, index = rows, columns = cols,dtype = float)
pd_crossval_stations

Unnamed: 0,trained_on_0,trained_on_1,trained_on_2,trained_on_3,trained_on_4,trained_on_5,trained_on_6,trained_on_7,trained_on_8,trained_on_9
tested_on_0,0.939356,0.738416,0.521516,0.748477,0.501089,0.874794,0.665017,0.796535,0.622153,0.95297
tested_on_1,0.689769,0.840891,0.846915,0.55198,0.740594,0.557756,0.662541,0.643564,0.733045,1.0
tested_on_2,0.686469,0.850891,0.939452,0.548743,0.756535,0.596122,0.666667,0.65297,0.741213,1.0
tested_on_3,0.705171,0.87901,0.863861,0.890137,0.668713,0.718028,0.729373,0.65,0.651238,1.0
tested_on_4,0.636139,0.663564,0.724105,0.692308,0.954455,0.799917,0.660066,0.561881,0.660644,1.0
tested_on_5,0.773927,0.898317,0.818355,0.697639,0.62604,0.934818,0.593234,0.674257,0.74505,0.95297
tested_on_6,0.606436,0.655545,0.629094,0.681835,0.53901,0.579414,0.997525,0.833168,0.510396,0.634901
tested_on_7,0.642739,0.833861,0.898324,0.734577,0.588614,0.873969,0.646865,1.0,0.781064,0.543317
tested_on_8,0.531903,0.602277,0.777609,0.7123,0.804455,0.801155,0.666667,0.730693,0.790223,0.795792
tested_on_9,0.777778,0.999208,0.923077,0.769231,0.639604,0.875825,0.666667,0.8,0.8,1.0


In [28]:
# metric 0 = sign accuracy, metric 1 = mse
'''
    matrix: np.array, metric: int, file_name: string
    Exports the specific metrics of the crossvalidation to a .feater file with.
    Matrix stores the validation results. Metric = 0 returns sign accuracy and metric = 1 the mse.
'''
def export_results(matrix, metric, file_name):
    # # add column and row names for crossvalidation
    lst = np.arange(matrix.shape[0])
    cols = ["trained_on_" + str(number) for number in lst]
    rows = ["tested_on_" + str(number) for number in lst]
    
    pd_crossval_stations = pd.DataFrame(matrix[:,:,metric].T, index = rows, columns = cols,dtype = float)
    dead_space = calc_percentage_dead_space(Xs)

    pd_crossval_stations["dead_space"] = dead_space
    pd_crossval_weeks = pd_crossval_stations.reset_index()
    pd_crossval_weeks.to_feather("results/"+file_name)

    return pd_crossval_weeks

In [26]:
file_name_accuracy = "gp_station_single_field_crossval_accuracy.feather"

export_results(matrix, 0, file_name_accuracy)

Unnamed: 0,index,trained_on_0,trained_on_1,trained_on_2,trained_on_3,trained_on_4,trained_on_5,trained_on_6,trained_on_7,trained_on_8,trained_on_9,dead_space
0,tested_on_0,0.939356,0.738416,0.521516,0.748477,0.501089,0.874794,0.665017,0.796535,0.622153,0.95297,0.152758
1,tested_on_1,0.689769,0.840891,0.846915,0.55198,0.740594,0.557756,0.662541,0.643564,0.733045,1.0,0.157365
2,tested_on_2,0.686469,0.850891,0.939452,0.548743,0.756535,0.596122,0.666667,0.65297,0.741213,1.0,0.087351
3,tested_on_3,0.705171,0.87901,0.863861,0.890137,0.668713,0.718028,0.729373,0.65,0.651238,1.0,0.290026
4,tested_on_4,0.636139,0.663564,0.724105,0.692308,0.954455,0.799917,0.660066,0.561881,0.660644,1.0,0.348637
5,tested_on_5,0.773927,0.898317,0.818355,0.697639,0.62604,0.934818,0.593234,0.674257,0.74505,0.95297,0.397631
6,tested_on_6,0.606436,0.655545,0.629094,0.681835,0.53901,0.579414,0.997525,0.833168,0.510396,0.634901,0.001974
7,tested_on_7,0.642739,0.833861,0.898324,0.734577,0.588614,0.873969,0.646865,1.0,0.781064,0.543317,0.050261
8,tested_on_8,0.531903,0.602277,0.777609,0.7123,0.804455,0.801155,0.666667,0.730693,0.790223,0.795792,0.201045
9,tested_on_9,0.777778,0.999208,0.923077,0.769231,0.639604,0.875825,0.666667,0.8,0.8,1.0,0.001316


In [27]:
file_name_mse = "gp_station_single_field_crossval_mse.feather"

export_results(matrix, 1, file_name_mse)

Unnamed: 0,index,trained_on_0,trained_on_1,trained_on_2,trained_on_3,trained_on_4,trained_on_5,trained_on_6,trained_on_7,trained_on_8,trained_on_9,dead_space
0,tested_on_0,2.43091,5.059704,1.486741,1.646578,2.339413,5.330394,93.630222,12.866848,10.674727,116.793449,0.152758
1,tested_on_1,3.284367,3.83089,1.485882,1.646626,2.382576,5.330394,93.769929,12.852976,10.677602,116.775249,0.157365
2,tested_on_2,3.285033,5.055241,1.207248,1.646636,2.353106,5.330394,93.623672,12.851954,10.671304,116.79445,0.087351
3,tested_on_3,3.283032,5.058764,1.479241,1.329651,2.351463,5.330456,93.659032,12.852924,10.671982,116.794582,0.290026
4,tested_on_4,3.261536,5.076645,1.48656,1.647019,1.778517,5.332614,93.644452,12.861403,10.678176,116.757658,0.348637
5,tested_on_5,3.283068,5.05877,1.479235,1.646675,2.352099,4.163205,93.622021,12.852919,10.671816,116.794582,0.397631
6,tested_on_6,3.284776,5.076019,1.479896,1.659445,2.359245,5.330394,69.41082,12.857916,10.672186,116.795411,0.001974
7,tested_on_7,3.287488,5.058834,1.478967,1.646633,2.354886,5.330394,93.63073,9.679976,10.631824,116.93803,0.050261
8,tested_on_8,3.284162,5.057208,1.47837,1.646983,2.353927,5.330579,93.623639,12.697833,7.73125,116.925779,0.201045
9,tested_on_9,3.282953,5.057611,1.479198,1.646636,2.353566,5.330394,93.622564,12.909884,10.685709,89.676537,0.001316


## Plotting

Plotting specific train-test combinations for display purposes.


In [40]:
def predict(train_on, test_on, weeks, split_on, df_x, df_y):
    limit = 0

    Xs, ys = split_on(df_x, df_y, weeks, limit)

    x_train = Xs[train_on]
    y_train = ys[train_on]
    x_test = Xs[test_on]
    y_test = ys[test_on]

#     df_xg_predict = XGboost_regression(x_train, y_train, x_test)
    df_gp_predict = GaussianProcess_regression(x_train, y_train, x_test)
    
    return df_xg_predict, df_gp_predict, y_test

df_xg_predict, df_gp_predict, df_true_P = predict(4, 0, 2, split_on_stations, df_x, df_y)

MemoryError: Unable to allocate 75.7 GiB for an array with shape (100800, 100800) and data type float64

In [38]:
P,_ = predict_sign(df_xg_predict, df_true_P)
mse = mean_squared_error(df_true_P, df_xg_predict)

print(P, mse)
df_xg_predict

0.6263227513227513 12.16585012768159


Unnamed: 0,M_VALUE_P,M_VALUE_Q
0,-0.022493,0.020927
1,-0.009423,-0.012755
2,0.408871,-0.612381
3,0.429829,0.032674
4,0.027117,0.032784
...,...,...
6043,0.001077,-0.001037
6044,-2.652431,-0.011984
6045,7.646673,1.102760
6046,-0.022493,0.020927


In [17]:
'''
    Because the training and testing strips the data of the name labels of the fields it is required to recombine 
    the model predictions with the field and station labels, as shown here below.

''' 

# First combine on index, so that the labels and predictions get properly reassigned
comp_df = pd.merge(df_true_P, df_y, left_index=True, right_index=True)
comp_df = comp_df.drop(columns=['M_VALUE_P_x', 'M_VALUE_Q_x'])
comp_df = comp_df.rename(columns={"M_VALUE_P_y": "M_VALUE_P", "M_VALUE_Q_y": "M_VALUE_Q"})

# set indices of predictions to be same as indices of true value
df_xg_predict = df_xg_predict.set_index(comp_df.index)
df_gp_predict = df_gp_predict.set_index(comp_df.index)

# then add predictions on index to the dataframe
comp_df = pd.merge(df_xg_predict, comp_df, left_index=True, right_index=True)
comp_df = pd.merge(df_gp_predict, comp_df, left_index=True, right_index=True)
comp_df

station_comp_df = comp_df[comp_df["STATION"]=="Dtn"]
final_comp_df = station_comp_df[station_comp_df["FIELD"]=="INSTAL2"]
final_comp_df

limit=60
xg_pred_P = final_comp_df["M_VALUE_P_x"].to_list()[:limit]
gp_pred_P = final_comp_df["M_VALUE_P"].to_list()[:limit]
true_P = final_comp_df["M_VALUE_P_y"].to_list()[:limit]

plt.figure(figsize=(12,6))
plt.title("Comparing power prediction for Dtn, INSTAL2 trained on 3 weeks data from different substations")
plt.plot(xg_pred_P, label='XGBoost power prediction')
plt.plot(gp_pred_P, label='Gaussian Process power prediction')
plt.plot(true_P, label = 'True active power')
plt.legend()
plt.xlabel("Timestep (per 5 min)")
plt.ylabel("Power (MW)")
plt.savefig('temporal_2weeks.png')
plt.show()

NameError: name 'df_true_P' is not defined