Preprocessing:
    
    - Timestamp, merging dataframes
    - Handling the missing data   
    - Feature scaling
    - Feature encoding
    
Modeling:

    - Train,test,split, kfold
    - Linear Regression
    - column transformer
    - Pipeline
    
Evaluation:

    - Evaluation metrics
    - Visualization
    


In [12]:
####################################################
## Getting the data

from influxdb import InfluxDBClient # install via "pip install influxdb"
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

client = InfluxDBClient(host='influxus.itu.dk', port=8086, username='lsda', password='icanonlyread')
client.switch_database('orkney')


def get_df(results):
    values = results.raw["series"][0]["values"]
    columns = results.raw["series"][0]["columns"]
    df = pd.DataFrame(values, columns=columns).set_index("time")
    df.index = pd.to_datetime(df.index) # Convert to datetime-index
    return df


In [13]:
######################################## fetching the data again ########################################
generation = client.query(
    "SELECT * FROM Generation where time > now()-90d and time <= now()-7"
    ) # Query written in InfluxQL

# Get the last 90 days of weather forecasts with the shortest lead time
wind  = client.query(
    "SELECT * FROM MetForecasts where time > now()-90d and time <= now()-7 and Lead_hours = '1'"
    ) # Query written in InfluxQL

future = client.query(
    "SELECT * FROM MetForecasts where time > now()-7d and time <= now() and Lead_hours = '1'"
    ) 


gen_df = get_df(generation)
wind_df = get_df(wind)
future_df = get_df(future)

gen_df = pd.DataFrame(gen_df['Total'])
wind_df = wind_df[['Direction', 'Speed']]
future_df = future_df[['Direction', 'Speed']]

In [14]:
wind_df.head()

Unnamed: 0_level_0,Direction,Speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-22 21:00:00+00:00,WSW,8.04672
2022-09-23 00:00:00+00:00,W,5.81152
2022-09-23 03:00:00+00:00,W,4.91744
2022-09-23 06:00:00+00:00,WSW,3.12928
2022-09-23 09:00:00+00:00,W,4.02336


In [15]:
future_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 56 entries, 2022-12-14 21:00:00+00:00 to 2022-12-21 18:00:00+00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Direction  56 non-null     object 
 1   Speed      56 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.3+ KB


# Load the dataframes

# convert data metrics

In [16]:
def windVectorX(compass):
    arr=["NNE","NE","ENE","E","ESE", "SE", "SSE","S","SSW","SW","WSW","W","WNW","NW","NNW","N"]
    num = arr.index(compass)
    value =  (num+1)*22.5*np.pi / 180
    x_value = np.sin(value)
    return x_value

def windVectorY(compass):
    arr=["NNE","NE","ENE","E","ESE", "SE", "SSE","S","SSW","SW","WSW","W","WNW","NW","NNW","N"]
    num = arr.index(compass)
    value =  (num+1)*22.5*np.pi / 180
    y_value = np.cos(value)
    return y_value
    

In [17]:
tempx = [windVectorX(i) for i in wind_df['Direction']]
tempy = [windVectorY(i) for i in wind_df['Direction']]
wind_df['windVx'] = tempx
wind_df['windVy'] = tempy
wind_df = wind_df.drop(['Direction'], axis=1)
gen_df = gen_df.resample('3h').mean()
wind_df = pd.concat([wind_df, gen_df], axis=1).dropna()
print(f"Null values:\n {wind_df.isnull().sum()}")
wind_df.to_csv("dataBy09132022.csv")

Null values:
 Speed     0
windVx    0
windVy    0
Total     0
dtype: int64


In [18]:
ftempx = [windVectorX(i) for i in future_df['Direction']]
ftempy = [windVectorY(i) for i in future_df['Direction']]
future_df['windVx'] = ftempx
future_df['windVy'] = ftempy
future_df = future_df.drop(['Direction'], axis=1)
future_df = pd.concat([future_df, gen_df], axis=1).dropna()
print(f"Null values:\n {future_df.isnull().sum()}")
future_df.to_csv("future_df.csv")

Null values:
 Speed     0
windVx    0
windVy    0
Total     0
dtype: int64


In [19]:
wind_df = pd.read_csv('dataBy09132022.csv')
wind_df.head()
future_df = pd.read_csv('future_df.csv')
future_df.head()

Unnamed: 0,time,Speed,windVx,windVy,Total
0,2022-12-14 21:00:00+00:00,13.85824,-0.3826834,0.92388,22.442169
1,2022-12-15 00:00:00+00:00,11.176,-0.3826834,0.92388,9.536772
2,2022-12-15 03:00:00+00:00,11.176,-2.449294e-16,1.0,6.730763
3,2022-12-15 06:00:00+00:00,8.9408,-0.3826834,0.92388,5.817125
4,2022-12-15 09:00:00+00:00,5.81152,-2.449294e-16,1.0,10.927672


## Costum transformer

In [20]:
from sklearn.base import TransformerMixin, BaseEstimator
class transformerX(TransformerMixin, BaseEstimator):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        transformed = x*5
        return transformed
    

In [278]:
    
# # testing the transformer
# # here I make an example
# x_train = pd.DataFrame(np.array([[1, 1], [1, 2], [2, 2], [2, 3]]), columns=['col1', 'col2'])
# y_train = np.dot(x_train, np.array([1, 2])) + 3 ############## no need atm ##############
# x_test = pd.DataFrame(np.array([[3, 5]]), columns=['col1', 'col2']) ############## no need atm ############## 

# # instantiating 
# test = my_basic_transformer()

# # transforming
# test.fit_transform(x_train)

## Column transformer from sklearn

Perform the preprocessing here.
Feature scaling is added here as well.

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler # check the documents what it does
ct = ColumnTransformer([
     # if dataframe is passed you need to specify what column should be transformed.
    ('mms', MinMaxScaler(), ['Speed']),
    ('windvectorx', MinMaxScaler(), ['windVx']),
    ('windvectory', MinMaxScaler(), ['windVy'])  
])

# ct.fit_transform(x_train)

## Pipeline from sklearn
Pass your column transformer and model in the pipeline

In [22]:
wind_df.head()

Unnamed: 0,time,Speed,windVx,windVy,Total
0,2022-09-22 21:00:00+00:00,8.04672,-0.92388,-0.3826834,11.872393
1,2022-09-23 00:00:00+00:00,5.81152,-1.0,-1.83697e-16,8.839425
2,2022-09-23 03:00:00+00:00,4.91744,-1.0,-1.83697e-16,5.877813
3,2022-09-23 06:00:00+00:00,3.12928,-0.92388,-0.3826834,4.722575
4,2022-09-23 09:00:00+00:00,4.02336,-1.0,-1.83697e-16,5.408131


In [23]:
rawY = wind_df['Total']
rawX = wind_df.drop('Total',axis=1)
raw_futureY = future_df['Total']
raw_futureX = future_df.drop('Total',axis=1)

In [73]:
numSplit = 5
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=numSplit)


In [74]:
x_train_list = []
y_train_list= []
x_test_list= []
y_test_list = []
for train_index, test_index in tscv.split(rawX.values):
    #print("TRAIN:", train_index, "TEST:", test_index)
    # x_train = rawX[train_index]
    # x_test = rawX[test_index]
    x_train, x_test = rawX.values[train_index], rawX.values[test_index]
    y_train, y_test = rawY.values[train_index], rawY.values[test_index]
    x_train_list.append(pd.DataFrame(x_train, columns = ['time','Speed','windVx','windVy']))
    y_train_list.append(pd.DataFrame(y_train, columns = ['Total']))
    x_test_list.append(pd.DataFrame(x_test, columns = ['time','Speed','windVx','windVy']))
    y_test_list.append(pd.DataFrame(y_test, columns = ['Total']))

    
    

In [284]:
# rawY = wind_df['Total']
# rawX = wind_df.drop('Total',axis=1)
# x_train, x_test, y_train, y_test = train_test_split(rawX, rawY, test_size=0.2, shuffle=False)

# x_train = np.array(x_train).reshape(-1,1)
# y_train = np.array(y_train).reshape(-1,1)
# x_test = np.array(x_test).reshape(-1,1)

In [26]:
x_train[0]

array(['2022-09-22 21:00:00+00:00', 8.04672, -0.9238795325112868,
       -0.3826834323650895], dtype=object)

In [286]:
ttt = pd.DataFrame(y_train)
ttt.head()

Unnamed: 0,0
0,5.797828
1,6.709056
2,8.390839
3,10.180039
4,11.2906


In [75]:
from sklearn.pipeline import Pipeline

pipeline_linear = Pipeline([
    ('my_ct', ct),
    ('lr', LinearRegression())
])

# print(y_pred)

In [80]:
scores = []
from sklearn.metrics import mean_squared_error as mse
for i in range(len(x_train_list)):
    pipeline_linear.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_linear = pipeline_linear.predict(pd.DataFrame(x_test_list[i]))
    scores.append (pipeline_linear.score(x_train_list[i], y_train_list[i]))
print("the mean of the score for split number: "+str(numSplit))
print(sum(scores)/len(scores))
    # if i == 4:
    #     print(y_pred_linear)

the mean of the score for split number: 5
0.5567132150326446


In [289]:
print()




In [44]:
from sklearn.svm import SVR
pipeline_SVR = Pipeline([
    ('my_ct', ct),
    ('lr', SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1))
])


In [83]:
scores = []
for i in range(len(x_train_list)):
    pipeline_SVR.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_SVR = pipeline_SVR.predict(pd.DataFrame(x_test_list[i]))
    scores.append (pipeline_SVR.score(x_train_list[i], y_train_list[i]))
print("the mean of the score for split number: "+str(numSplit))
print(sum(scores)/len(scores))



the mean of the score for split number: 5
0.5857491387634028


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [62]:
from sklearn.neighbors import KNeighborsRegressor
pipeline_KNN = Pipeline([
    ('my_ct', ct),
    ('lr', KNeighborsRegressor(n_neighbors=5))
])



In [84]:
scores = []
for i in range(len(x_train_list)):
    pipeline_KNN.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_KNN = pipeline_KNN.predict(pd.DataFrame(x_test_list[i]))
    print(f"Score: {round(pipeline_KNN.score(x_train_list[i], y_train_list[i]),2)}")
    scores.append (pipeline_KNN.score(x_train_list[i], y_train_list[i]))
print("the mean of the score for split number: "+str(numSplit))
print(sum(scores)/len(scores))
    


Score: 0.79
Score: 0.73
Score: 0.7
Score: 0.72
Score: 0.72
the mean of the score for split number: 5
0.7341739927301764


In [33]:
from sklearn.tree import DecisionTreeRegressor
pipeline_tree = Pipeline([
    ('my_ct', ct),
    ('lr', DecisionTreeRegressor(random_state=0))
])



In [81]:
scores = []

for i in range(len(x_train_list)):
    pipeline_tree.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_tree = pipeline_tree.predict(pd.DataFrame(x_test_list[i]))
    scores.append (pipeline_tree.score(x_train_list[i], y_train_list[i]))
print("the mean of the score for split number: "+str(numSplit))
print(sum(scores)/len(scores))

the mean of the score for split number: 5
0.8450472335824426


In [296]:
from sklearn.ensemble import RandomForestRegressor
pipeline_random_forest = Pipeline([
    ('my_ct', ct),
    ('lr', RandomForestRegressor(max_depth=10, random_state=0))
])


In [297]:


for i in range(len(x_train_list)):
    pipeline_random_forest.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_random_forest = pipeline_random_forest.predict(pd.DataFrame(x_test_list[i]))
    print(f"Score: {round(pipeline_random_forest.score(x_train_list[i], y_train_list[i]),2)}")
    print(f"\tMean Square Error: {round(mse(y_test_list[i], y_pred_random_forest),2)}")
    print(f"\tRoot Mean Square Error: {round(mse(y_test_list[i], y_pred_random_forest, squared=False),2)}")

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score: 0.85
	Mean Square Error: 24.57
	Root Mean Square Error: 4.96
Score: 0.85
	Mean Square Error: 8.47
	Root Mean Square Error: 2.91


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score: 0.85
	Mean Square Error: 26.07
	Root Mean Square Error: 5.11
Score: 0.85
	Mean Square Error: 7.55
	Root Mean Square Error: 2.75
Score: 0.85
	Mean Square Error: 25.2
	Root Mean Square Error: 5.02


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [298]:
from sklearn.neural_network import MLPRegressor
pipeline_NN = Pipeline([
    ('my_ct', ct),
    ('lr', MLPRegressor(solver='adam',random_state=None, max_iter=5000,early_stopping=False))
])



In [299]:
for i in range(len(x_train_list)):
    pipeline_NN.fit(pd.DataFrame(x_train_list[i]), pd.DataFrame(y_train_list[i]))
    y_pred_NN = pipeline_NN.predict(pd.DataFrame(x_test_list[i]))
    print(f"Score: {round(pipeline_NN.score(x_train_list[i], y_train_list[i]),2)}")
    print(f"\tMean Square Error: {round(mse(y_test_list[i], y_pred_NN),2)}")
    print(f"\tRoot Mean Square Error: {round(mse(y_test_list[i], y_pred_NN, squared=False),2)}")

  y = column_or_1d(y, warn=True)


Score: 0.72
	Mean Square Error: 14.25
	Root Mean Square Error: 3.77


  y = column_or_1d(y, warn=True)


Score: 0.75
	Mean Square Error: 10.51
	Root Mean Square Error: 3.24


  y = column_or_1d(y, warn=True)


Score: 0.75
	Mean Square Error: 20.24
	Root Mean Square Error: 4.5


  y = column_or_1d(y, warn=True)


Score: 0.77
	Mean Square Error: 8.17
	Root Mean Square Error: 2.86


  y = column_or_1d(y, warn=True)


Score: 0.77
	Mean Square Error: 27.58
	Root Mean Square Error: 5.25


In [300]:
raw_futureX.shape

(54, 4)

In [301]:
model = [pipeline_linear,pipeline_SVR,pipeline_tree,pipeline_random_forest,pipeline_KNN,pipeline_NN]
name = ["y_pred_linear","y_pred_SVR","y_pred_tree","y_pred_random_forest","y_pred_KNN","y_pred_NN"]
i=0

for m in model:
    
    future_pred = m.predict(raw_futureX)
    print(f"\t{name[i]}")
    i+=1
    print(f"\tMean Square Error: {round(mse(raw_futureY, future_pred),2)}")
    print(f"\tRoot Mean Square Error: {round(mse(raw_futureY, future_pred, squared=False),2)}")
    
    x_index = np.array(raw_futureX['Speed'])
    y_pred = np.array(future_pred)
    y_real = np.array(raw_futureY)
    # plt.scatter(x_index, y_real, color='b')
    # plt.plot(x_index,y_pred, color='r')
    #plt.show()
    

	y_pred_linear
	Mean Square Error: 19.95
	Root Mean Square Error: 4.47
	y_pred_SVR
	Mean Square Error: 14.92
	Root Mean Square Error: 3.86
	y_pred_tree
	Mean Square Error: 10.86
	Root Mean Square Error: 3.3
	y_pred_random_forest
	Mean Square Error: 10.69
	Root Mean Square Error: 3.27
	y_pred_KNN
	Mean Square Error: 12.02
	Root Mean Square Error: 3.47
	y_pred_NN
	Mean Square Error: 16.09
	Root Mean Square Error: 4.01
