In [14]:
import csv
import os

import matplotlib.pyplot as plt

In [15]:
def _parseNonZero(rawValue):
    """Return ``float(rawValue)``, or None when the cell is empty or equals zero."""
    if not rawValue:
        return None
    value = float(rawValue)
    return None if value == 0 else value


def loadData(fileName, inputVariabName, outputVariabName):
    """Read two numeric columns from ``<current working dir>/data/<fileName>``.

    The first CSV row is taken as the header. A data row is kept only when
    both selected cells are non-empty and parse to a non-zero float.
    Blank lines and rows too short to contain both selected columns are
    skipped instead of raising IndexError.

    Args:
        fileName: name of the CSV file inside the ``data`` directory.
        inputVariabName: header name of the input column.
        outputVariabName: header name of the output column.

    Returns:
        A pair ``(inputs, outputs)`` of equally long lists of floats.

    Raises:
        ValueError: if a requested column name is not in the header, or a
            non-empty cell cannot be parsed as a float.
        OSError: if the file cannot be opened.
    """
    filePath = os.path.join(os.getcwd(), 'data', fileName)
    # newline='' leaves newline handling to the csv module, as its docs recommend.
    with open(filePath, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        dataNames = next(csv_reader, [])
        data = list(csv_reader)

    selectedVariable = dataNames.index(inputVariabName)
    selectedOutput = dataNames.index(outputVariabName)
    lastNeededColumn = max(selectedVariable, selectedOutput)

    inputs = []
    outputs = []
    for row in data:
        # csv.reader yields [] for blank lines; truncated rows are short too.
        if len(row) <= lastNeededColumn:
            continue
        # Input is parsed first and the output only if the input qualifies,
        # so a rejected input never triggers parsing of the output cell.
        inputValue = _parseNonZero(row[selectedVariable])
        if inputValue is None:
            continue
        outputValue = _parseNonZero(row[selectedOutput])
        if outputValue is None:
            continue
        inputs.append(inputValue)
        outputs.append(outputValue)

    return inputs, outputs

In [16]:
def plotDataHistogram(x, variableName):
    """Display a 10-bin histogram of ``x``, titled with ``variableName``."""
    plt.hist(x, bins=10)
    heading = 'Histogram of ' + variableName
    plt.title(heading)
    plt.show()

In [17]:
def plotCheckLinearity(inputs, outputs, inputLabel, outputLabel):
    """Scatter ``outputs`` against ``inputs`` as red dots.

    Meant for eyeballing whether the relationship between the two
    variables looks linear. The axis labels are also used for the title.
    """
    heading = f'{inputLabel} vs. {outputLabel}'

    plt.plot(inputs, outputs, 'ro')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.title(heading)
    plt.show()

In [18]:
import random


def splitDataTrainingValidation(inputs, outputs, trainFraction=0.8):
    """Randomly split paired samples into a training and a validation part.

    ``int(trainFraction * len(inputs))`` distinct positions are drawn without
    replacement for training; every remaining position goes to validation.

    Args:
        inputs: sequence of input values.
        outputs: sequence of output values, indexed in parallel with ``inputs``.
        trainFraction: share of the samples used for training, in [0, 1].
            Defaults to 0.8.

    Returns:
        ``(trainInputs, trainOutputs, validationInputs, validationOutputs)``.
        Training samples are in random order; validation samples keep their
        original relative order.

    Raises:
        ValueError: if ``trainFraction`` is outside [0, 1].
    """
    if not 0 <= trainFraction <= 1:
        raise ValueError('trainFraction must be between 0 and 1')

    sampleCount = len(inputs)
    trainSample = random.sample(range(sampleCount), int(trainFraction * sampleCount))

    # Set lookup keeps the complement computation linear instead of quadratic.
    trainLookup = set(trainSample)
    validationSample = [i for i in range(sampleCount) if i not in trainLookup]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]

    validationInputs = [inputs[i] for i in validationSample]
    validationOutputs = [outputs[i] for i in validationSample]
    return trainInputs, trainOutputs, validationInputs, validationOutputs

In [19]:
def plotTrainingValidationData(trainInputs, trainOutputs, validationInputs, validationOutputs, inputLabel, outputLabel):
    """Plot the training and validation samples together on one figure.

    Training points are drawn as red circles and validation points as
    green triangles, each with a legend entry.
    """
    series = (
        (trainInputs, trainOutputs, 'ro', 'training data'),
        (validationInputs, validationOutputs, 'g^', 'validation data'),
    )
    for xs, ys, marker, caption in series:
        plt.plot(xs, ys, marker, label=caption)

    plt.title('train and validation data')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [20]:
class LinearRegressionCustom:
    """Single-feature least-squares linear regression: ``y = w0 + w1 * x``.

    Samples are passed as rows whose first element is the feature value
    (e.g. ``[[1.0], [2.0]]``); only that first element is used.
    """

    def __init__(self):
        # Intercept and slope; both stay 0 until fit() is called.
        self.w0 = 0
        self.w1 = 0

    def fit(self, X, y):
        """Estimate ``w0`` and ``w1`` with the closed-form least-squares formulas.

        Args:
            X: sequence of rows; ``row[0]`` is the feature value.
            y: sequence of target values, one per row of ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        x_values = [point[0] for point in X]

        n = len(x_values)
        if n == 0:
            raise ValueError('cannot fit a regression on an empty data set')
        if len(y) != n:
            # A longer y would otherwise silently skew mean_y (sum(y) / n).
            raise ValueError('X and y must contain the same number of samples')

        mean_x = sum(x_values) / n
        mean_y = sum(y) / n

        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x_values, y))
        denominator = sum((xi - mean_x) ** 2 for xi in x_values)

        # All x equal: the slope is undefined, so fall back to a horizontal
        # line through mean_y.
        if denominator == 0:
            self.w1 = 0
        else:
            self.w1 = numerator / denominator
        self.w0 = mean_y - self.w1 * mean_x

        return self

    def predict(self, X):
        """Return ``w0 + w1 * row[0]`` for every row of ``X`` as a list."""
        return [self.w0 + self.w1 * x[0] for x in X]


In [21]:
def getModelRegressorAndParameters(trainInputs, trainOutputs):
    """Fit a ``LinearRegressionCustom`` on the training samples.

    Each scalar input is wrapped in a one-element list, because the
    regressor reads the feature from ``row[0]``.

    Returns:
        ``(regressor, w0, w1)``: the fitted model with its intercept and slope.
    """
    model = LinearRegressionCustom()
    model.fit([[value] for value in trainInputs], trainOutputs)
    return model, model.w0, model.w1

In [22]:
def plotLearntModel(trainInputs, trainOutputs, w0, w1, inputLabel, outputLabel):
    """Plot the training samples together with the fitted line ``w0 + w1 * x``.

    The line is sampled at evenly spaced points covering the full range
    from ``min(trainInputs)`` to ``max(trainInputs)``, both ends included.

    Args:
        trainInputs: non-empty sequence of training input values.
        trainOutputs: training output values, parallel to ``trainInputs``.
        w0: intercept of the learnt model.
        w1: slope of the learnt model.
        inputLabel: x-axis label.
        outputLabel: y-axis label.
    """
    noOfPoints = 1000
    lowest = min(trainInputs)
    highest = max(trainInputs)
    step = (highest - lowest) / noOfPoints
    # Index-based sampling over noOfPoints + 1 positions reaches the upper
    # end of the range and avoids the drift of repeatedly adding `step`.
    xref = [lowest + i * step for i in range(noOfPoints + 1)]
    yref = [w0 + w1 * el for el in xref]

    plt.plot(trainInputs, trainOutputs, 'ro', label='training data')
    plt.plot(xref, yref, 'b-', label='learnt model')
    plt.title('train data and the learnt model')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [23]:
def computeValidations(regressor, validationInputs):
    """Return the regressor's predictions for the validation inputs.

    Each scalar input is wrapped in a one-element list before being
    passed to ``regressor.predict``.
    """
    featureRows = [[value] for value in validationInputs]
    return regressor.predict(featureRows)


def plotCompareRealAndComputedData(validationInputs, validationOutputs, computedValidationOutputs, inputLabel,
                                   outputLabel):
    """Overlay predicted and actual outputs for the validation inputs.

    Predictions are drawn first as yellow circles, then the real values
    as green triangles, so the two can be compared visually.
    """
    layers = (
        (computedValidationOutputs, 'yo', 'computed test data'),
        (validationOutputs, 'g^', 'real test data'),
    )
    for ys, marker, caption in layers:
        plt.plot(validationInputs, ys, marker, label=caption)

    plt.title('computed validation and real validation data')
    plt.xlabel(inputLabel)
    plt.ylabel(outputLabel)
    plt.legend()
    plt.show()

In [24]:
def predictLiniarRegressionCustom(fileName, inputVariabName, outputVariabName, inputsLabel, outputLabel):
    """Run the whole single-variable regression workflow for one CSV file.

    Steps, in order: load the two columns, show a histogram of each, show
    the input/output scatter plot, split into training and validation
    sets and plot them, fit the custom regressor and plot the learnt
    line, then plot predicted against real validation outputs.
    Nothing is returned; the results are the displayed figures.
    """
    # Load and explore the raw data.
    xs, ys = loadData(fileName, inputVariabName, outputVariabName)
    plotDataHistogram(xs, inputVariabName)
    plotDataHistogram(ys, outputVariabName)
    plotCheckLinearity(xs, ys, inputsLabel, outputLabel)

    # Split the data and visualise the partition.
    trainX, trainY, validX, validY = splitDataTrainingValidation(xs, ys)
    plotTrainingValidationData(trainX, trainY, validX, validY, inputsLabel, outputLabel)

    # Fit the model and draw the learnt line over the training data.
    model, intercept, slope = getModelRegressorAndParameters(trainX, trainY)
    plotLearntModel(trainX, trainY, intercept, slope, inputsLabel, outputLabel)

    # Compare predictions with the real validation outputs.
    predictedY = computeValidations(model, validX)
    plotCompareRealAndComputedData(validX, validY, predictedY, inputsLabel, outputLabel)