In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, r_regression
from sklearn.model_selection import train_test_split
import numpy as np

from src.dataloader import DataLoader


In [4]:
# set output to just 2 sig digits
np.set_printoptions(precision=2)

# Import the dataframe from our dataloader class
df = DataLoader().get_data_frame(date="Jun22_2020")

feature_names = list(df.columns)[1:]

# X and y
X = df.iloc[:, 1:].values
y = df['yield'].values

# train test Split (0.7/0.3)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=26, shuffle=True)

# test validate split (0.3 split into 0.15/0.15)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=0.5, random_state=26, shuffle=True)
# Important Note! Feature selection should be done AFTER test/train split, to avoid information leakage


# Standard scaling features
Standardization of features according to: x'=(x-µ)/ σ. Sets mean to 0 and unit variance to 1. <br>

Z-score Feature Scaling: https://developers.google.com/machine-learning/data-prep/transform/normalization

Greek symbols in ascii are produced like so:<br>
µ = Alt + 230 <br>
σ = Alt + 229 <br>
Φ = Alt + 232 <br>
<br>
https://www.keynotesupport.com/internet/special-characters-greek-letters-symbols.shtml

In [5]:
#  train the scaler ONLY on the training set. Then use it to scale train/test/val
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train) # trains the scaler using fit on X_train, then transforms X_train as well
X_test = scaler.transform(X_test) # no fit, transforms using data from fit() stored in the scaler
X_val = scaler.transform(X_val)


### Univariate Selection, SelectKBest
[sklearn.feature_selection.SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest) will select features according to the k highest scores. <br><br>

<br>
[sklearn.feature_selection.r_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.r_regression.html#sklearn.feature_selection.r_regression) will compute Pearson's correlation coefficient for each feature and the target. This is a scoring function to be fed into a feature selection procedure, and does not stand on its own. Use with SelectKBest or similar method. The cross correlation between each regressor and the target is computed as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).

In [6]:
print("SelectKBest, using an Pearson's correlation coefficient scoring function:")

test = SelectKBest(score_func=r_regression, k=4)
fit = test.fit(X_train, y_train)
features = fit.transform(X_train)

print("correlation coefficients:\n",fit.scores_)

print("best 4 features:\n",test.get_feature_names_out(feature_names))

SelectKBest, using an Pearson's correlation coefficient scoring function:
correlation coefficients:
 [ 0.11 -0.1  -0.11 -0.04  0.06  0.01 -0.05 -0.11 -0.02 -0.15 -0.18 -0.06
  0.03 -0.01 -0.08 -0.19 -0.04  0.01 -0.07  0.07  0.06 -0.05 -0.01 -0.04
 -0.06  0.08  0.06 -0.04 -0.01 -0.03  0.03  0.03  0.1   0.06  0.3  -0.15
 -0.04 -0.07 -0.02 -0.04 -0.06 -0.02  0.04 -0.05 -0.04]
best 4 features:
 ['blue-blue-444' 'green-531-red' 'nir-red-edge' 'nir-red-edge-740']


In [None]:

# convert variables to PyTorch tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
print(X_train.shape)