In [1]:
import copy

from sklearn.linear_model import LinearRegression
import numpy as np
from typing import Callable
import torch
import torch.nn as nn
import torch.nn.functional as F

In [37]:
# pyTorch neural network class

class myPytorchNetwork(nn.Module):
    def __init__(self, layers):
        super(myPytorchNetwork, self).__init__()
        self.fc_layers = nn.ModuleList([nn.Linear(layers[i], layers[i + 1]) for i in range(len(layers) - 1)])

    def forward(self, x):
        x = torch.flatten(x, 1)

        # TODO: I'm using relus everywhere for now, we need to change it to be adjustable
        for layer in self.fc_layers[:-1]:
            x = F.relu(layer(x))

        # No need for activation in the last layer
        return self.fc_layers[-1](x)


def train_torch_network(network: myPytorchNetwork, dataset, max_epochs, batch_size, train_ratio=0.7,
                        learning_rate=0.001, momentum=0.9, silent=False):
    loss_fn = torch.nn.MSELoss()
    # TODO: think of making the learning rate adaptive here, e.g. by using pytorch LR scheduler
    optimizer = torch.optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)
    split_index = int(dataset.shape[0] * train_ratio)
    train, validation = dataset[:split_index, :], torch.from_numpy(dataset[split_index:, :])

    best_validation_loss = float("inf")

    for epoch in range(max_epochs):
        np.random.shuffle(train)
        torch_train = torch.from_numpy(train)
        batch_start_idx = 0

        batches_loss = []
        # This way we skip last samples if there are less than batch_size of them
        while batch_start_idx + batch_size <= torch_train.shape[0]:
            optimizer.zero_grad()
            outputs = network(torch_train[batch_start_idx:batch_start_idx + batch_size, :-1])
            loss = loss_fn(torch.flatten(outputs), torch_train[batch_start_idx:batch_start_idx + batch_size, -1])
            batches_loss.append(loss.item())

            loss.backward()

            optimizer.step()
            batch_start_idx += batch_size

        # Now, check the loss on validation dataset
        validation_output = torch.flatten(network(validation[:, :-1]))
        validation_loss = loss_fn(validation_output, validation[:, -1])

        # # If the validation loss starts to increase, stop training
        # if validation_loss > best_validation_loss:
        #     print(f"Epoch: {epoch}. \nValidation loss increased from {best_validation_loss} to {validation_loss}. Stopping training")
        #     return

        best_validation_loss = validation_loss

        if not silent:
            print(
                f"Epoch: {epoch}. \nLoss on training: {np.mean(batches_loss)} \nLoss on validation: {validation_loss} \n##########")


In [85]:
def run_linear_regression(train_dataset, test_inputs) -> np.array:
    """
    Train a linear regression on train_inputs, and predict data for test_inputs
    """
    reg = LinearRegression().fit(train_dataset[:, :-1], train_dataset[:, -1])
    prediction = reg.predict(test_inputs)
    return prediction


def run_pytorch_network(network, train_dataset, test_inputs) -> np.array:
    """
    Train a pytorch network and predict outputs for test_inputs
    """
    # TODO: make this adjustable from the outside
    train_torch_network(network, train_dataset, 50, 10, learning_rate=0.01, silent=True)
    prediction = network(torch.from_numpy(test_inputs)).detach().numpy().flatten()
    return prediction


def measure_model_error_multiple_times(model: Callable, dataset: np.array, train_ratio=0.85, num_runs=5):
    """
    :param model: a callable that accepts train dataset, test inputs, and produces the prediction for test inputs
    :param dataset: dataset to train on of shape (n_samples, n_features), where the last column is the value to be predicted
    :param train_ratio: how much data to put into the training dataset
    :param num_runs: number of runs with reshuffled dataset
    :return: mean MSE through all the runs
    """
    split_index = int(dataset.shape[0] * train_ratio)
    error_sum = 0
    for _ in range(num_runs):
        np.random.shuffle(dataset)
        train = dataset[:split_index, :]
        test = dataset[split_index:, :]

        prediction = model(train, test[:, :-1])
        mse = np.mean((prediction - test[:, -1].flatten()) ** 2)
        print(f"Model MSE on test: {mse}")
        error_sum += mse
    return error_sum / num_runs

In [137]:
# Load the data. TODO: add all the datasets here
turbine = np.genfromtxt("processed_datasets/turbine.csv", dtype=np.float32, delimiter=',', skip_header=1)
turbine_input, turbine_output = turbine[:, :4], turbine[:, 4]

In [138]:
# Trying a couple of times with different splits into training and test data for a better understanging
measure_model_error_multiple_times(run_linear_regression, turbine)

Model MSE on test: 0.0011528179747983813
Model MSE on test: 0.002424795413389802
Model MSE on test: 0.001376274973154068
Model MSE on test: 0.001286662882193923
Model MSE on test: 0.0014885530108585954


0.001545820850878954

In [139]:
# Now trying, maybe applying some functions to features can make the situation better. There is no obvious relation here, but after trying, we found this gives better results (not always, but mostly better)

modified_turbine = turbine.copy()
modified_turbine[:, 1] = modified_turbine[:, 1] ** 2
modified_turbine[:, 0] = modified_turbine[:, 0] ** 2
measure_model_error_multiple_times(run_linear_regression, modified_turbine)

Model MSE on test: 0.0014316097367554903
Model MSE on test: 0.001650119898840785
Model MSE on test: 0.0013921308564022183
Model MSE on test: 0.0017579963896423578
Model MSE on test: 0.0012379592517390847


0.0014939632266759873

In [135]:
turbine.shape

(451, 5)

In [140]:
torch_nn = myPytorchNetwork([4, 8, 10, 1])
measure_model_error_multiple_times(lambda *args: run_pytorch_network(torch_nn, *args), turbine, num_runs=5)

Model MSE on test: 0.0003147642419207841
Model MSE on test: 0.00028935971204191446
Model MSE on test: 0.00017892543110065162
Model MSE on test: 0.00016714140656404197
Model MSE on test: 0.00020081765251234174


0.00023020168882794678