In [52]:
import csv
import os
from CustomGDRegression import CustomGDRegressor

import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model

In [53]:
def loadData(fileName, inputVariabName, outputVariabName):
    """Read one input column and one output column from a CSV file.

    The file is looked up as ``<current working directory>/data/<fileName>``.
    Its first row is treated as the header; every following row is a record.

    A record is kept only when both selected cells are non-empty and both
    parse to a non-zero float. Blank lines and rows that are too short to
    contain both selected columns are skipped.

    Parameters:
        fileName: name of the CSV file inside the ``data`` directory.
        inputVariabName: header name of the input (feature) column.
        outputVariabName: header name of the output (target) column.

    Returns:
        A tuple ``(inputs, outputs)`` of two equally long lists of floats.

    Raises:
        ValueError: if a requested column name is not in the header, or if a
            non-empty selected cell cannot be parsed as a float.
        OSError: if the file cannot be opened.
    """
    filePath = os.path.join(os.getcwd(), 'data', fileName)
    inputs = []
    outputs = []
    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(filePath, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        dataNames = next(csv_reader, [])
        selectedVariable = dataNames.index(inputVariabName)
        selectedOutput = dataNames.index(outputVariabName)
        requiredWidth = max(selectedVariable, selectedOutput) + 1

        for row in csv_reader:
            # csv.reader yields [] for blank lines; indexing such a row would
            # raise IndexError, so incomplete rows are skipped.
            if len(row) < requiredWidth:
                continue

            # The checks stay lazy and in the original order (input first,
            # then output) so a row rejected on its input never has its
            # output cell parsed.
            rawInput = row[selectedVariable]
            if not rawInput:
                continue
            inputValue = float(rawInput)
            if inputValue == 0:
                continue

            rawOutput = row[selectedOutput]
            if not rawOutput:
                continue
            outputValue = float(rawOutput)
            if outputValue == 0:
                continue

            inputs.append(inputValue)
            outputs.append(outputValue)

    return inputs, outputs

In [54]:
def plotDataHistogram(x, variableName):
    """Display a 10-bin histogram of ``x``.

    Parameters:
        x: the values to bin.
        variableName: text appended to ``'Histogram of '`` to form the title.
    """
    binCount = 10
    plt.hist(x, binCount)
    plt.title('Histogram of ' + variableName)
    plt.show()

In [55]:
def plotCheckLinearity(inputs, outputs, inputLabel, outputLabel):
    """Scatter ``outputs`` against ``inputs`` as red dots and show the figure.

    Parameters:
        inputs: x coordinates of the points.
        outputs: y coordinates of the points.
        inputLabel: label for the x axis (also used in the title).
        outputLabel: label for the y axis (also used in the title).
    """
    chartTitle = f'{inputLabel} vs. {outputLabel}'
    plt.plot(inputs, outputs, 'ro')
    plt.title(chartTitle)
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.show()

In [56]:
def splitDataTrainingValidation(inputs, outputs):
    """Split paired samples into an 80% training part and a 20% validation part.

    NumPy's global random generator is re-seeded with 1 on every call, so the
    same input length always yields the same partition (and the global random
    state is reset as a side effect).

    Parameters:
        inputs: sequence of input values.
        outputs: sequence of output values, indexed in parallel with ``inputs``.

    Returns:
        ``(trainInputs, trainOutputs, validationInputs, validationOutputs)``
        as four lists. Training samples appear in the randomly drawn order;
        validation samples keep their original relative order.
    """
    np.random.seed(1)
    indexes = list(range(len(inputs)))
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)

    # Membership tests against a NumPy array are linear scans; a set keeps the
    # validation selection O(n) overall instead of O(n^2).
    trainIndexSet = set(trainSample.tolist())
    validationSample = [i for i in indexes if i not in trainIndexSet]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]

    validationInputs = [inputs[i] for i in validationSample]
    validationOutputs = [outputs[i] for i in validationSample]
    return trainInputs, trainOutputs, validationInputs, validationOutputs

In [57]:
def plotTrainingValidationData(trainInputs, trainOutputs, validationInputs, validationOutputs, inputLabel, outputLabel):
    """Plot the training and validation samples on one labelled figure.

    Training points are drawn as red dots, validation points as green
    triangles; a legend distinguishes the two series.

    Parameters:
        trainInputs, trainOutputs: coordinates of the training points.
        validationInputs, validationOutputs: coordinates of the validation points.
        inputLabel: label for the x axis.
        outputLabel: label for the y axis.
    """
    series = (
        (trainInputs, trainOutputs, 'ro', 'training data'),
        (validationInputs, validationOutputs, 'g^', 'validation data'),
    )
    for xs, ys, markerStyle, seriesName in series:
        plt.plot(xs, ys, markerStyle, label=seriesName)

    plt.title('train and validation data')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [59]:
def plotLearntModel(trainInputs, trainOutputs, w0, w1, inputLabel, outputLabel):
    """Plot the training samples together with the line ``y = w0 + w1 * x``.

    The line is evaluated at 1000 evenly spaced x values covering the closed
    interval ``[min(trainInputs), max(trainInputs)]``.

    Parameters:
        trainInputs: x coordinates of the training points (must be non-empty).
        trainOutputs: y coordinates of the training points.
        w0: intercept of the learnt model.
        w1: slope of the learnt model.
        inputLabel: label for the x axis.
        outputLabel: label for the y axis.

    Raises:
        ValueError: if ``trainInputs`` is empty (raised by ``min``).
    """
    noOfPoints = 1000
    # linspace includes both end points, so the line spans the whole data
    # range, and it avoids the rounding drift of repeatedly adding a step.
    xref = np.linspace(min(trainInputs), max(trainInputs), noOfPoints).tolist()
    yref = [w0 + w1 * el for el in xref]

    plt.plot(trainInputs, trainOutputs, 'ro', label='training data')
    plt.plot(xref, yref, 'b-', label='learnt model')
    plt.title('train data and the learnt model')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [58]:
def getModelRegressorAndParameters(trainInputs, trainOutputs):
    """Fit a ``CustomGDRegressor`` on a single feature and expose its parameters.

    Each training input is wrapped in a one-element list before being handed
    to ``fit``.

    Parameters:
        trainInputs: sequence of scalar feature values.
        trainOutputs: sequence of target values.

    Returns:
        ``(regressor, w0, w1)`` where ``w0`` is ``regressor.intercept_`` and
        ``w1`` is ``regressor.coef_[0]``.
    """
    featureRows = []
    for value in trainInputs:
        featureRows.append([value])

    model = CustomGDRegressor()
    model.fit(featureRows, trainOutputs)

    intercept = model.intercept_
    slope = model.coef_[0]
    return model, intercept, slope

In [None]:
def computeValidations(regressor, validationInputs):
    """Return the regressor's predictions for the validation inputs.

    Each scalar input is wrapped in a one-element list before being passed to
    ``regressor.predict``; whatever ``predict`` returns is returned unchanged.

    Parameters:
        regressor: object exposing a ``predict(rows)`` method.
        validationInputs: iterable of scalar feature values.
    """
    featureRows = []
    for value in validationInputs:
        featureRows.append([value])
    return regressor.predict(featureRows)


def plotCompareRealAndComputedData(validationInputs, validationOutputs, computedValidationOutputs, inputLabel,
                                   outputLabel):
    """Plot predicted versus real outputs for the validation inputs.

    Predictions are drawn as yellow dots and real values as green triangles,
    both against ``validationInputs``.

    Parameters:
        validationInputs: x coordinates shared by both series.
        validationOutputs: real output values.
        computedValidationOutputs: outputs predicted by the model.
        inputLabel: label for the x axis.
        outputLabel: label for the y axis.
    """
    series = (
        (computedValidationOutputs, 'yo', 'computed test data'),
        (validationOutputs, 'g^', 'real test data'),
    )
    for ys, markerStyle, seriesName in series:
        plt.plot(validationInputs, ys, markerStyle, label=seriesName)

    plt.title('computed validation and real validation data')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [60]:
from sklearn.metrics import mean_squared_error


def printError(computedTestOutputs, validationOutputs):
    """Print the mean squared error twice: computed by hand and by scikit-learn.

    The manual value sums the squared differences of the zipped pairs and
    divides by ``len(validationOutputs)``; the second value comes from
    ``mean_squared_error(validationOutputs, computedTestOutputs)``.

    Parameters:
        computedTestOutputs: predicted values.
        validationOutputs: real values.
    """
    squaredErrorTotal = 0.0
    for predicted, actual in zip(computedTestOutputs, validationOutputs):
        residual = predicted - actual
        squaredErrorTotal += residual ** 2
    manualError = squaredErrorTotal / len(validationOutputs)
    print("prediction error (manual): ", manualError)

    toolError = mean_squared_error(validationOutputs, computedTestOutputs)
    print("prediction error (tool): ", toolError)

In [61]:
def predictLiniarRegression(fileName, inputVariabName, outputVariabName, inputsLabel, outputLabel):
    """Run the full univariate regression workflow for one CSV file.

    Steps, in order:
      1. load the two columns and plot their histograms and a scatter plot;
      2. split the raw samples into training and validation parts;
      3. standardise both parts with the mean and standard deviation of the
         TRAINING part only, and plot histograms of the standardised values;
      4. plot the split, fit the regressor and plot the learnt line;
      5. predict the validation outputs, plot them against the real ones and
         print the prediction error (in standardised units).

    Parameters:
        fileName: CSV file name, forwarded to ``loadData``.
        inputVariabName: header name of the input column.
        outputVariabName: header name of the output column.
        inputsLabel: axis label used for the input variable in plots.
        outputLabel: axis label used for the output variable in plots.

    Raises:
        ValueError: if the training part is empty, or if the training inputs
            or outputs are constant (zero standard deviation), since they
            cannot be standardised.
    """
    inputs, outputs = loadData(fileName, inputVariabName, outputVariabName)
    plotDataHistogram(inputs, inputVariabName)
    plotDataHistogram(outputs, outputVariabName)

    plotCheckLinearity(inputs, outputs, inputsLabel, outputLabel)

    # Split BEFORE computing any statistics: deriving the mean/std from the
    # whole dataset would let validation rows influence the training data.
    trainInputs, trainOutputs, validationInputs, validationOutputs = splitDataTrainingValidation(inputs, outputs)
    if len(trainInputs) == 0:
        raise ValueError("not enough samples to build a training set")

    input_mean, input_std = np.mean(trainInputs), np.std(trainInputs)
    output_mean, output_std = np.mean(trainOutputs), np.std(trainOutputs)
    # Dividing by a zero std would silently fill the data with NaN/inf.
    if input_std == 0 or output_std == 0:
        raise ValueError("cannot standardise a constant column (standard deviation is 0)")

    trainInputs = [(x - input_mean) / input_std for x in trainInputs]
    trainOutputs = [(y - output_mean) / output_std for y in trainOutputs]
    validationInputs = [(x - input_mean) / input_std for x in validationInputs]
    validationOutputs = [(y - output_mean) / output_std for y in validationOutputs]

    # Histograms are order-independent, so concatenating the two parts shows
    # the distribution of every standardised sample.
    plotDataHistogram(list(trainInputs) + list(validationInputs), f"Normalized {inputVariabName}")
    plotDataHistogram(list(trainOutputs) + list(validationOutputs), f"Normalized {outputVariabName}")

    plotTrainingValidationData(trainInputs, trainOutputs, validationInputs, validationOutputs, inputsLabel, outputLabel)

    regressor, w0, w1 = getModelRegressorAndParameters(trainInputs, trainOutputs)
    plotLearntModel(trainInputs, trainOutputs, w0, w1, inputsLabel, outputLabel)

    computedValidationsOutputs = computeValidations(regressor, validationInputs)
    plotCompareRealAndComputedData(validationInputs, validationOutputs, computedValidationsOutputs, inputsLabel,
                                   outputLabel)
    printError(computedValidationsOutputs, validationOutputs)