In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core import display as ICD
import seaborn as sns
import glob
import scipy
import os
pd.set_option('display.max_columns', 100)

In [2]:
# import scikit learn packages

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline

In [3]:
# Relative path (from the notebook's working directory) to the data-extractor
# scripts folder. NOTE(review): presumably the regression-matrix CSV is read from
# here — confirm, since no visible cell uses this constant.
DATA_FOLDER = './data_extractor_scripts/'

### Regression matrix manipulation

Here we handle infinite values in the albedo column (`Albedo [-]`), if needed.

In [None]:
# tot_df[tot_df['Albedo [-]']==-np.inf]
#tot_df.index[np.isinf(tot_df).any(1)]
#tot_df.columns.to_series()[np.isinf(tot_df).any()]
#plt.plot(tot_df['Albedo [-]'].drop(np.inf))
#tot_df[~tot_df.isin([np.inf, -np.inf]).any(1)]['Albedo [-]'].plot()

Importing regression matrix

In [None]:
# Load the regression matrix into a DataFrame.
# NOTE(review): pd.read_csv() is called without a file path, so this cell raises
# a TypeError as written. Pass the path of the regression-matrix CSV here
# (presumably a file under DATA_FOLDER — TODO confirm the file name).
tot_df=pd.read_csv()

Transform the velocity magnitude and direction into Cartesian vector components

In [None]:
# Output velocity: turn magnitude 'u' and angle 'direction' (converted from
# degrees to radians) into the Cartesian components 'u_x' and 'u_y'
direction_rad = np.radians(tot_df['direction'])
tot_df['u_x'] = tot_df['u'] * np.cos(direction_rad)
tot_df['u_y'] = tot_df['u'] * np.sin(direction_rad)

In [None]:
# Input velocity from the top mast anemometer: turn magnitude 'u_top' and angle
# 'direction_top' (converted from degrees to radians) into 'u_top_x' and 'u_top_y'
direction_top_rad = np.radians(tot_df['direction_top'])
tot_df['u_top_x'] = tot_df['u_top'] * np.cos(direction_top_rad)
tot_df['u_top_y'] = tot_df['u_top'] * np.sin(direction_top_rad)

In [None]:
# The magnitude/direction columns are no longer used once the Cartesian
# components exist, so remove them from the regression matrix
polar_columns = ['u', 'u_top', 'direction', 'direction_top']
tot_df = tot_df.drop(columns=polar_columns)

### Splitting Data

In [None]:
# Targets are the two output velocity components; every other column is a feature
target_columns = ['u_x', 'u_y']
X = np.array(tot_df.drop(columns=target_columns))
y = np.array(tot_df[target_columns])

In [None]:
# Hold out half of the samples for testing; the fixed seed keeps the split reproducible
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

### Regression

In [None]:
#plot functions
def plot_ys(y_pred, y_te, interval=(100, 200)):
    """Plot predicted against test velocity components over a window of rows.

    Two panels are drawn side by side: column 0 of both arrays as ``u_x`` and
    column 1 as ``u_y``. Predictions are drawn in red, test values in blue.

    Parameters
    ----------
    y_pred : 2-D array supporting ``[start:stop, column]`` indexing
        Predicted targets (column 0 -> u_x, column 1 -> u_y).
    y_te : 2-D array with the same layout as ``y_pred``
        Test targets to compare against.
    interval : sequence of two ints, optional
        ``(start, stop)`` row slice to plot; ``stop`` is exclusive.
        Defaults to rows 100-200. An immutable tuple is used as the default
        so the default can never be altered between calls.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    start, stop = interval[0], interval[1]

    fig = plt.figure(figsize=(16, 12))

    # Positions 1 and 2 of a 2x2 grid, i.e. only the top row is used.
    for column, name in enumerate(('u_x', 'u_y')):
        ax = fig.add_subplot(2, 2, column + 1)
        ax.set_title(name)
        ax.plot(y_pred[start:stop, column], 'r-', label=name + '_pred')
        ax.plot(y_te[start:stop, column], 'b-', label=name + '_test')
        ax.set_xlabel('t')
        ax.set_ylabel(name)
        ax.legend()

    plt.show()

Optimize the ridge regression parameter

In [None]:
# Candidate regularization strengths, log-spaced from 1e-1 to 1e10
alphas = np.logspace(-1, 10, 100)

# cv=None selects RidgeCV's leave-one-out scheme; the per-sample errors are
# stored so that the MSE-versus-alpha curves can be plotted below.
# NOTE(review): newer scikit-learn releases rename store_cv_values / cv_values_
# to store_cv_results / cv_results_ — check against the installed version.
clf_cv = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)
clf_cv.fit(X_tr, y_tr)

# cv_values_ is indexed (datapoint, target, alpha) and already holds the
# squared errors; averaging over datapoints leaves one MSE curve per target
mse_all_folds = clf_cv.cv_values_
mse_means = mse_all_folds.mean(axis=0)

ax = plt.gca()
u_x, = ax.plot(alphas, mse_means[0], 'r-', label='MSE u_x')
u_y, = ax.plot(alphas, mse_means[1], 'b-', label='MSE u_y')
ax.set_xscale('log')
ax.set_xlabel('Alphas')
ax.set_ylabel('MSE')
ax.legend()

Optimize the ridge regression parameter with standardization

In [None]:
# Candidate regularization strengths, log-spaced from 1e-1 to 1e10
alphas = np.logspace(-1, 10, 100)

# Standardize the features: mean and standard deviation are estimated on the
# TRAINING split only and then applied to both the training and the test split
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

# Leave-one-out cross-validation (cv=None) over the alpha grid
clf_cv = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)
clf_cv.fit(X_tr_scaled, y_tr)

# cv_values_ is indexed (datapoint, target, alpha) and already holds the
# squared errors; averaging over datapoints leaves one MSE curve per target
mse_all_folds = clf_cv.cv_values_
mse_means = mse_all_folds.mean(axis=0)
ax = plt.gca()

# Cross-validated curves on the training split: per target and their mean
u_x_mse_train, = ax.plot(alphas, mse_means[0], 'r-', label='MSE u_x train')
u_y_mse_train, = ax.plot(alphas, mse_means[1], 'b-', label='MSE u_y train')
u_mean_mse_train, = ax.plot(alphas, mse_means.mean(axis=0), 'g-', label='MSE u mean train')

# Refit a plain Ridge for every alpha and score it on the held-out test split
mse_u_test = []

for alpha in alphas:
    clf = Ridge(alpha=alpha)
    clf.fit(X_tr_scaled, y_tr)
    y_te_pred = clf.predict(X_te_scaled)
    mse_u_test.append(mean_squared_error(y_te, y_te_pred))

u_mse_test, = ax.plot(alphas, mse_u_test, 'g--', label='MSE u mean test')

ax.set_xscale('log')
ax.set_xlabel('Alphas')
ax.set_ylabel('MSE')
ax.legend()

In [None]:
# Predictions of the RidgeCV model fitted on standardized features versus the
# test targets, over plot_ys' default row window
plot_ys(y_pred=clf_cv.predict(X_te_scaled), y_te=y_te)

Optimize the ridge regression parameter with standardization and polynomial feature expansion

In [None]:
# Wider alpha grid (1e-10 to 1e10) and the degree of the polynomial expansion
alphas = np.logspace(-10, 10, 100)
degree = 5

# Pipeline: standardize -> polynomial feature expansion -> ridge with the
# regularization strength picked by RidgeCV from `alphas`
model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree),
    RidgeCV(alphas),
)

model.fit(X_tr, y_tr)
y_pred3 = model.predict(X_te)

# Mean squared error on the test split; shown as the cell output
mse = mean_squared_error(y_te, y_pred3)
mse

In [None]:
# Polynomial-pipeline predictions versus the test targets, rows 200 to 300
plot_ys(y_pred3, y_te, interval=[200, 300])

Ridge regression with PCA

In [None]:
# Alpha grid log-spaced from 1e-10 to 1e10
alphas = np.logspace(-10, 10, 100)

# Pipeline: project the features onto 3 principal components, then ridge with
# the regularization strength picked by RidgeCV from `alphas`
model = make_pipeline(
    PCA(n_components=3),
    RidgeCV(alphas),
)

model.fit(X_tr, y_tr)
y_pred4 = model.predict(X_te)

# Mean squared error on the test split; shown as the cell output
mse = mean_squared_error(y_te, y_pred4)
mse
# presumably the value recorded from an earlier run: 0.31212712287932676

In [None]:
# PCA-pipeline predictions versus the test targets, over plot_ys' default row window
plot_ys(y_pred=y_pred4, y_te=y_te)

Get the mean squared error