In [13]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
from CustomGDRegression import CustomGDRegressor

In [14]:
def loadDataMulti(fileName, firstInputVariabName, secondInputVariabName, outputVariabName):
    """Load two numeric features and a binary label from ``data/<fileName>``.

    The file is read as comma-separated text from the ``data`` directory
    located under the current working directory. Columns are picked by fixed
    position (indices 2 and 3 for the two features, index 1 for the diagnosis),
    so the three ``*VariabName`` arguments are accepted for interface
    compatibility but are not used to locate columns.

    Rows whose feature cells are missing or cannot be parsed as floats (for
    example a header row or a blank line) are skipped.

    Args:
        fileName: Name of the CSV file inside the ``data`` directory.
        firstInputVariabName: Unused; kept for interface compatibility.
        secondInputVariabName: Unused; kept for interface compatibility.
        outputVariabName: Unused; kept for interface compatibility.

    Returns:
        tuple: ``(inputs, outputs)`` where ``inputs`` is a list of
        ``[feature1, feature2]`` float pairs and ``outputs`` is a parallel
        list holding 1 when the diagnosis cell equals ``'M'`` and 0 otherwise.
    """
    filePath = os.path.join(os.getcwd(), 'data', fileName)

    radiusIdx = 2
    textureIdx = 3
    diagnosisIdx = 1

    inputs = []
    outputs = []

    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to csv.reader.
    with open(filePath, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            try:
                radius = float(row[radiusIdx])
                texture = float(row[textureIdx])
            except (ValueError, IndexError):
                # Only "not a number" and "row too short" mean "skip this row";
                # any other error is a real problem and should surface.
                continue

            inputs.append([radius, texture])
            outputs.append(1 if row[diagnosisIdx] == 'M' else 0)

    return inputs, outputs

In [16]:
def plotDataHistogram(x, variableName):
    """Show a 10-bin histogram of ``x`` titled ``'Histogram of <variableName>'``."""
    bin_count = 10
    plt.hist(x, bin_count)
    plt.title('Histogram of ' + variableName)
    plt.show()

In [17]:
def plotCheckLinearityMulti(inputs, outputs, firstInputLabel, secondInputLabel, outputLabel):
    """Scatter the two input features against the output in a 3D plot.

    Each element of ``inputs`` is indexed at positions 0 and 1 to obtain the
    x and y coordinates; ``outputs`` supplies the z coordinates. The three
    label arguments are used as the x, y and z axis labels respectively.
    """
    def column(position):
        # Pull one coordinate out of every data point.
        return [point[position] for point in inputs]

    figure = plt.figure(figsize=(10, 8))
    axes3d = figure.add_subplot(111, projection='3d')

    xs = column(0)
    ys = column(1)
    axes3d.scatter(xs, ys, outputs, c='r', marker='o')

    axes3d.set_xlabel(firstInputLabel)
    axes3d.set_ylabel(secondInputLabel)
    axes3d.set_zlabel(outputLabel)
    axes3d.set_title('3D Plot of Data')
    plt.show()


In [18]:
def splitDataTrainingValidation(inputs, outputs, trainFraction=0.8, seed=1):
    """Split parallel ``inputs``/``outputs`` sequences into train and validation sets.

    ``int(trainFraction * len(inputs))`` indices are drawn without replacement
    for training; every remaining index, in ascending order, goes to
    validation. The draw uses a private ``np.random.RandomState(seed)``, so the
    split is reproducible and the global NumPy random state is left untouched.
    With the default seed the split matches the one obtained from
    ``np.random.seed(1)`` followed by ``np.random.choice``.

    Args:
        inputs: Indexable sequence of samples.
        outputs: Indexable sequence of targets, parallel to ``inputs``.
        trainFraction: Share of the samples used for training (default 0.8).
        seed: Seed for the private random generator (default 1).

    Returns:
        tuple: ``(trainInputs, trainOutputs, validationInputs, validationOutputs)``,
        each a list.
    """
    sampleCount = len(inputs)
    if sampleCount == 0:
        # Nothing to split; also avoids sampling from an empty population.
        return [], [], [], []

    indexes = list(range(sampleCount))
    trainSize = int(trainFraction * sampleCount)

    # A local generator keeps the split deterministic without reseeding the
    # process-wide RNG that other cells may rely on.
    rng = np.random.RandomState(seed)
    trainSample = rng.choice(indexes, trainSize, replace=False).tolist()

    # Set membership keeps the complement computation linear in the sample count.
    trainSet = set(trainSample)
    validationSample = [i for i in indexes if i not in trainSet]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]

    validationInputs = [inputs[i] for i in validationSample]
    validationOutputs = [outputs[i] for i in validationSample]
    return trainInputs, trainOutputs, validationInputs, validationOutputs

In [19]:
def plotTrainingValidationDataMulti(trainInputs, trainOutputs, validationInputs, validationOutputs,
                                    firstInputLabel, secondInputLabel, outputLabel):
    """Show training and validation samples together in one 3D scatter plot.

    Training points are drawn as red circles and validation points as green
    triangles. For both sets, positions 0 and 1 of each input give the x and y
    coordinates and the matching outputs give z. The label arguments are used
    as the x, y and z axis labels.
    """
    figure = plt.figure(figsize=(10, 8))
    axes3d = figure.add_subplot(111, projection='3d')

    groups = (
        (trainInputs, trainOutputs, dict(c='r', marker='o', label='Training Data')),
        (validationInputs, validationOutputs, dict(c='g', marker='^', label='Validation Data')),
    )

    # Extract every coordinate list up front, then draw each group in turn.
    prepared = [
        ([point[0] for point in points], [point[1] for point in points], targets, style)
        for points, targets, style in groups
    ]
    for xs, ys, zs, style in prepared:
        axes3d.scatter(xs, ys, zs, **style)

    axes3d.set_xlabel(firstInputLabel)
    axes3d.set_ylabel(secondInputLabel)
    axes3d.set_zlabel(outputLabel)
    axes3d.set_title('3D Plot of Training and Validation Data')
    axes3d.legend()
    plt.show()


In [20]:
def getModelRegressorAndParametersMulti(trainInputs, trainOutputs):
    """Fit a ``CustomGDRegressor`` on the training data and expose its parameters.

    Returns:
        tuple: ``(regressor, w0, w1, w2)`` where ``w0`` is the fitted model's
        ``intercept_`` and ``w1``/``w2`` are the first two entries of its
        ``coef_``.
    """
    model = CustomGDRegressor()
    model.fit(trainInputs, trainOutputs)

    bias = model.intercept_
    first_weight, second_weight = model.coef_[0], model.coef_[1]
    return model, bias, first_weight, second_weight


In [21]:
def plotLearntModelMulti(trainInputs, trainOutputs, w0, w1, w2,
                         firstInputLabel, secondInputLabel, outputLabel):
    """Plot the training samples together with the plane ``w0 + w1*x1 + w2*x2``.

    The plane is evaluated on a 10 x 10 grid that spans the observed range of
    each of the two input features and is drawn as a semi-transparent blue
    surface over the red training points. The label arguments are used as the
    x, y and z axis labels.
    """
    figure = plt.figure(figsize=(10, 8))
    axes3d = figure.add_subplot(111, projection='3d')

    x1_values = [point[0] for point in trainInputs]
    x2_values = [point[1] for point in trainInputs]
    axes3d.scatter(x1_values, x2_values, trainOutputs, c='r', marker='o', label='Training Data')

    # Grid covering the bounding box of the training features.
    x1_axis = np.linspace(min(x1_values), max(x1_values), 10)
    x2_axis = np.linspace(min(x2_values), max(x2_values), 10)
    grid_x1, grid_x2 = np.meshgrid(x1_axis, x2_axis)

    plane = w0 + w1 * grid_x1 + w2 * grid_x2
    axes3d.plot_surface(grid_x1, grid_x2, plane, alpha=0.5, color='blue')

    axes3d.set_xlabel(firstInputLabel)
    axes3d.set_ylabel(secondInputLabel)
    axes3d.set_zlabel(outputLabel)
    axes3d.set_title('3D Plot of Training Data and Learnt Model')
    axes3d.legend()
    plt.show()


In [22]:
def computeValidationsMulti(regressor, validationInputs):
    """Convert the regressor's predictions for ``validationInputs`` into 0/1 labels.

    A prediction of 0.5 or more becomes 1; anything else becomes 0.
    """
    labels = []
    for score in regressor.predict(validationInputs):
        labels.append(1 if score >= 0.5 else 0)
    return labels


In [23]:

def plotCompareRealAndComputedDataMulti(validationInputs, validationOutputs, computedValidationOutputs,
                                     firstInputLabel, secondInputLabel, outputLabel):
    """Overlay real and computed validation outputs in a single 3D scatter plot.

    Both series share the same x/y coordinates (positions 0 and 1 of each
    validation input). Real outputs are drawn as green triangles and computed
    outputs as yellow circles. The label arguments are used as the x, y and z
    axis labels.
    """
    figure = plt.figure(figsize=(10, 8))
    axes3d = figure.add_subplot(111, projection='3d')

    xs = [point[0] for point in validationInputs]
    ys = [point[1] for point in validationInputs]

    series = (
        (validationOutputs, dict(c='g', marker='^', label='Real Validation Data')),
        (computedValidationOutputs, dict(c='y', marker='o', label='Computed Validation Data')),
    )
    for zs, style in series:
        axes3d.scatter(xs, ys, zs, **style)

    axes3d.set_xlabel(firstInputLabel)
    axes3d.set_ylabel(secondInputLabel)
    axes3d.set_zlabel(outputLabel)
    axes3d.set_title('Computed vs Real Validation Data')
    axes3d.legend()
    plt.show()

In [24]:
def predictLiniarRegressionMulti(fileName, firstInputVariabName, secondInputVariabName, outputVariabName,
                                 firstInputLabel, secondInputLabel, outputLabel):
    """Run the full two-feature workflow: load, explore, split, fit, and validate.

    Steps, in order:
      1. Load the data with ``loadDataMulti``.
      2. Show a histogram for each feature and for the output, titled with the
         corresponding ``*VariabName``.
      3. Show a 3D scatter of the whole data set.
      4. Split into training and validation subsets.
      5. Fit the regressor on the training subset and plot the learnt plane.
      6. Compute 0/1 predictions for the validation subset and plot them
         against the real validation outputs.

    Every 3D plot uses ``firstInputLabel``, ``secondInputLabel`` and
    ``outputLabel`` as its axis labels.

    Args:
        fileName: CSV file name, forwarded to ``loadDataMulti``.
        firstInputVariabName: Name of the first feature (histogram title).
        secondInputVariabName: Name of the second feature (histogram title).
        outputVariabName: Name of the output variable (histogram title).
        firstInputLabel: X-axis label for the 3D plots.
        secondInputLabel: Y-axis label for the 3D plots.
        outputLabel: Z-axis label for the 3D plots.

    Returns:
        The fitted regressor returned by ``getModelRegressorAndParametersMulti``.
    """
    inputs, outputs = loadDataMulti(fileName, firstInputVariabName, secondInputVariabName, outputVariabName)

    first_feature = [data_point[0] for data_point in inputs]
    second_feature = [data_point[1] for data_point in inputs]

    plotDataHistogram(first_feature, firstInputVariabName)
    plotDataHistogram(second_feature, secondInputVariabName)
    plotDataHistogram(outputs, outputVariabName)

    # Axis labels come from the *Label arguments in every 3D plot, so all
    # figures produced by one run are labelled consistently.
    plotCheckLinearityMulti(inputs, outputs, firstInputLabel, secondInputLabel, outputLabel)

    trainInputs, trainOutputs, validationInputs, validationOutputs = splitDataTrainingValidation(inputs, outputs)

    plotTrainingValidationDataMulti(trainInputs, trainOutputs, validationInputs, validationOutputs,
                                    firstInputLabel, secondInputLabel, outputLabel)

    regressor, w0, w1, w2 = getModelRegressorAndParametersMulti(trainInputs, trainOutputs)

    plotLearntModelMulti(trainInputs, trainOutputs, w0, w1, w2,
                         firstInputLabel, secondInputLabel, outputLabel)

    computedValidationOutputs = computeValidationsMulti(regressor, validationInputs)
    plotCompareRealAndComputedDataMulti(validationInputs, validationOutputs, computedValidationOutputs,
                                        firstInputLabel, secondInputLabel, outputLabel)
    return regressor
