In [15]:
import pyarrow.feather as feather
import pandas as pd
from datetime import datetime
import numpy as np
import math

import sklearn.gaussian_process as gp
import matplotlib.pyplot as plt

In [16]:
df_IU = feather.read_feather('df_IU.feather')
df_PQ = feather.read_feather('df_PQ.feather')

In [17]:
df_PQ

Unnamed: 0,DATUM_TIJD,STATION,P_0,Q_0,P_1,Q_1,P_2,Q_2,P_3,Q_3,...,P_20,Q_20,P_21,Q_21,P_22,Q_22,P_23,Q_23,P_24,Q_24
0,2021-01-01 00:00:00,Tex,0.00,0.00,1.29,-0.53,0.75,0.10,0.87,-0.43,...,,,,,,,,,,
1,2021-01-01 00:05:00,Tex,-0.07,0.07,0.00,0.00,0.71,-0.28,0.75,-0.09,...,,,,,,,,,,
2,2021-01-01 00:10:00,Tex,0.67,-0.28,0.00,0.00,-8.07,1.88,-0.07,0.07,...,,,,,,,,,,
3,2021-01-01 00:15:00,Tex,1.24,-0.52,1.06,-0.20,0.40,0.01,0.86,-0.43,...,,,,,,,,,,
4,2021-01-01 00:20:00,Tex,1.24,-0.52,-0.07,0.07,-3.79,0.84,0.40,0.01,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051014,2021-12-31 23:35:00,Lw,3.68,-8.41,16.93,-3.69,,,,,...,,,,,,,,,,
1051015,2021-12-31 23:40:00,Lw,3.87,-8.54,16.73,-3.70,,,,,...,,,,,,,,,,
1051016,2021-12-31 23:45:00,Lw,3.15,-8.55,16.84,-3.70,,,,,...,,,,,,,,,,
1051017,2021-12-31 23:50:00,Lw,3.23,-8.45,16.60,-3.75,,,,,...,,,,,,,,,,


In [18]:
df_IU.fillna(0, inplace=True)
df_PQ.fillna(0, inplace=True)

In [19]:
# converts a time amount in 'nanoseconds' to an amount in '5 minutes'
def ns_to_5m(x):
    return x/(pow(10,9)*60*5)

In [20]:
# sort values on "DATUM_TIJD"
df_IU = df_IU.sort_values("DATUM_TIJD")
df_PQ = df_PQ.sort_values("DATUM_TIJD")

# .value returns time in nanoseconds, starting form unix time.
# Get starting timestamp and convert this to '5 minutes'
start_time = ns_to_5m(df_IU["DATUM_TIJD"].iloc[1].value)

# Convert each DATETIME timestamp to a float value representing the amount of 5 minutes since start time
df_IU["DATUM_TIJD"] = df_IU['DATUM_TIJD'].apply(lambda x: ns_to_5m(x.value)-start_time)
df_PQ["DATUM_TIJD"] = df_PQ['DATUM_TIJD'].apply(lambda x: ns_to_5m(x.value)-start_time)

In [21]:
# Reduce timescale
two_weeks = 12*24*7
df_IU = df_IU[df_IU["DATUM_TIJD"] <= two_weeks]
df_PQ = df_PQ[df_PQ["DATUM_TIJD"] <= two_weeks]

In [22]:
# Split df_IU and df_PQ in train and test, rename them to X and y.
stations = df_IU["STATION"].unique()
half_point = int(0.5*len(stations))
train_stations = stations[:half_point]
test_stations = stations[half_point:]

# Subsetting
X_train = df_IU.loc[df_IU["STATION"].isin(train_stations)]
X_test = df_IU.loc[df_IU["STATION"].isin(test_stations)]

y_train = df_PQ.loc[df_PQ["STATION"].isin(train_stations)]
y_test = df_PQ.loc[df_PQ["STATION"].isin(test_stations)]

# Reset indices
X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)

y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)

# Delete STATION column
del y_train["DATUM_TIJD"], y_test["DATUM_TIJD"] 
del X_train["STATION"], X_test["STATION"], y_train["STATION"], y_test["STATION"] 

In [37]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn import svm

from sklearn.multioutput import MultiOutputRegressor
    
# Accuracy P sign: 0.626
def linear_regression(X_train, y_train, X_test):
    model = MultiOutputRegressor(LinearRegression())
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict

# Accuracy P sign: 0.748
def KNN_regression(X_train, y_train, X_test):
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_train, y_train)
    
    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict
    
# Accuracy P sign: 0.906, alpha = 10
def GaussianProcess_regression(X_train, y_train, x_test, alpha):
    model = GaussianProcessRegressor(alpha = alpha)
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict

# Accuracy P sign: 0.674
def RF_regression(X_train, y_train, x_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict


# Accuracy P sign: 0.672 with alpha = 200
def ridge_regression(X_train, y_train, x_test, alpha):
    model = Ridge(alpha)
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict

# With multiOutputRegression (So one regressor per target)
# Accuracy P sign: 
def SVM_regression(X_train, y_train, x_test):
    model = MultiOutputRegressor(svm.SVR())
    model.fit(X_train, y_train)

    predict = model.predict(X_test)
    df_predict = pd.DataFrame(predict, columns = y_test.columns, dtype = float)
    return df_predict


In [24]:
def predict_sign(df_predict, y_test):
    df_sign = df_predict.copy()
    df_y_sign = y_test.copy()
    for col in df_sign.columns:
        df_sign[col] = df_sign[col].apply(lambda x: -1 if x<0 else 1)
        df_y_sign[col] = df_y_sign[col].apply(lambda x: -1 if x<0 else 1)

    df_new_sign = df_sign == df_y_sign
    
    field_accuracies = []
    for col in df_new_sign.columns:
        field_accuracies.append(df_new_sign[col].value_counts(normalize=True).values[0])

    P_accuracies = field_accuracies[::2]
    Q_accuracies = field_accuracies[1::2]
    
    P_avg = sum(P_accuracies)/len(P_accuracies)
    Q_avg = sum(Q_accuracies)/len(Q_accuracies)
    
    return P_avg, Q_avg

In [25]:
def plot_alpha(X_train,y_train, X_test, y_test):
    alpha_range = np.arange(0,10,0.1)
    Ps = []
    for alpha in alpha_range:
        print(alpha)
        df_predict = ridge_regression(X_train, y_train, X_test, alpha)
        P, _ = predict_sign(df_predict, y_test)
        Ps.append(P)
    
    plt.plot(alpha_range, Ps)
    plt.show()

In [26]:
# Without MultiOutputRegressor()
df_predict = linear_regression(X_train, y_train, X_test)
P,Q = predict_sign(df_predict, y_test)
print(P, Q)

0.6635954387704511 0.6231036192364898


In [46]:
# With MultiOutputRegressor()
alpha = 10
df_predict = GaussianProcess_regression(X_train, y_train, X_test, alpha)
P,Q = predict_sign(df_predict, y_test)
print(P, Q)

0.9069231531978186 0.783341596430342


In [43]:
print(P, Q)

0.9069866137828458 0.7834129895884978
