In [1]:
# Example Simple Linear Regression
from math import sqrt
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error of ``predicted`` against ``actual``.

    Values are paired by position. The number of pairs is ``len(actual)``;
    any extra trailing entries in ``predicted`` are ignored.
    """
    squared_total = 0.0
    for position, observed in enumerate(actual):
        residual = predicted[position] - observed
        squared_total += residual ** 2
    # Average the squared residuals, then take the square root.
    return sqrt(squared_total / float(len(actual)))

In [86]:
# Evaluate regression algorithm on training dataset
def evaluate_algorithm1(dataset, algorithm):
    """Fit and score ``algorithm`` on the same rows and return the RMSE.

    ``algorithm`` is called as ``algorithm(dataset, test_set)``, where
    ``test_set`` is a copy of every row with its last value replaced by
    ``None``. The predictions are printed, then compared with the last
    value of each original row using ``rmse_metric``.
    """
    def _hide_target(row):
        # Copy the row so the caller's data keeps its real target value.
        masked = list(row)
        masked[-1] = None
        return masked

    test_set = [_hide_target(row) for row in dataset]
    predicted = algorithm(dataset, test_set)
    print(predicted)
    actual = [row[-1] for row in dataset]
    return rmse_metric(actual, predicted)


In [3]:
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values`` as a float."""
    total = sum(values)
    count = float(len(values))
    return total / count
                           


In [4]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of paired deviations from the means.

    The sum is not divided by the number of points. Pairs are formed by
    position for every index of ``x``.
    """
    total = 0.0
    for position, x_value in enumerate(x):
        total += (x_value - mean_x) * (y[position] - mean_y)
    return total


In [5]:
# Calculate the variance of a list of numbers
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Despite the name, the sum is not divided by the number of values.
    """
    squared_deviations = [(value - mean) ** 2 for value in values]
    return sum(squared_deviations)


In [41]:
# Calculate coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of a line through ``dataset``.

    Column 0 of each row is the input and column 1 the target; both are
    converted with ``float`` first. Returns ``[b0, b1]`` with intercept
    ``b0`` and slope ``b1``.
    """
    raw_inputs = [row[0] for row in dataset]
    inputs = [float(value) for value in raw_inputs]
    raw_targets = [row[1] for row in dataset]
    targets = [float(value) for value in raw_targets]
    input_mean = mean(inputs)
    target_mean = mean(targets)
    # Slope: summed co-deviation over summed squared deviation of the inputs.
    slope = covariance(inputs, input_mean, targets, target_mean) / variance(inputs, input_mean)
    intercept = target_mean - slope * input_mean
    return [intercept, slope]


In [42]:
# Fit the line on every row of `dataset`; the result is [b0, b1] (intercept, slope).
coefficients(dataset)

[19.99448575911481, 3.4138235600663664]

In [43]:
# Inspect the type of the second value in the first row of `dataset`.
type(dataset[0][1])

str

In [76]:
# Split a dataset into a train and test set
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a train list and a test list.

    Every value of every row is converted with ``float`` into a new list,
    so the caller's rows are not modified. Rows are then moved at random
    into the train list until it holds at least ``split * len(dataset)``
    rows; the rows left over form the test list.

    Args:
        dataset: sized collection of rows whose values ``float`` accepts.
        split: fraction of rows to place in the train list (e.g. 0.6).

    Returns:
        A ``(train, test)`` tuple of lists of float rows.

    Raises:
        ValueError: if a value cannot be converted to float, or if
            ``split`` asks for more rows than exist (``randrange`` is
            then called with an empty range).
    """
    # Imported here so the function does not rely on an import cell that
    # appears later in the notebook.
    from random import randrange

    train_size = split * len(dataset)
    # Convert once up front; the popped rows are already float lists, so
    # the train list needs no re-conversion inside the loop.
    remaining = [list(map(float, row)) for row in dataset]
    train = list()
    while len(train) < train_size:
        index = randrange(len(remaining))
        train.append(remaining.pop(index))
    return train, remaining


In [78]:
# Random split: A receives rows until it holds at least 60% of them; B keeps the rest.
A,B =train_test_split(dataset, 0.6)

In [79]:
# Display the training partition returned by the split.
A

[[3.0, 39.9],
 [11.0, 23.5],
 [11.0, 57.2],
 [53.0, 244.6],
 [9.0, 87.4],
 [12.0, 58.1],
 [2.0, 6.6],
 [20.0, 98.1],
 [61.0, 217.6],
 [29.0, 103.9],
 [37.0, 152.8],
 [7.0, 77.5],
 [22.0, 161.5],
 [23.0, 113.0],
 [8.0, 55.6],
 [11.0, 21.3],
 [13.0, 93.0],
 [108.0, 392.5],
 [25.0, 69.2],
 [41.0, 73.4],
 [10.0, 65.3],
 [17.0, 142.1],
 [29.0, 133.3],
 [6.0, 50.9],
 [60.0, 202.4],
 [40.0, 119.4],
 [13.0, 31.9],
 [4.0, 12.6],
 [3.0, 4.4],
 [14.0, 95.5],
 [27.0, 92.6],
 [26.0, 187.5],
 [8.0, 76.1],
 [24.0, 134.9],
 [9.0, 52.1],
 [6.0, 14.8],
 [19.0, 46.2],
 [5.0, 40.3]]

In [80]:
# Display the rows the split left out of the training partition.
B

[[13.0, 15.7],
 [124.0, 422.2],
 [57.0, 170.9],
 [23.0, 56.9],
 [14.0, 77.5],
 [45.0, 214.0],
 [5.0, 20.9],
 [48.0, 248.1],
 [23.0, 39.6],
 [7.0, 48.8],
 [9.0, 48.7],
 [3.0, 13.2],
 [4.0, 11.8],
 [7.0, 27.9],
 [4.0, 38.1],
 [0.0, 0.0],
 [6.0, 14.6],
 [16.0, 59.6],
 [13.0, 89.9],
 [41.0, 181.3],
 [55.0, 162.8],
 [15.0, 32.1],
 [30.0, 194.5],
 [24.0, 137.9],
 [31.0, 209.8]]

In [81]:
# Look at the first left-out row.
B[0]

[13.0, 15.7]

In [83]:
# Simple linear regression algorithm
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a target for each row of ``test``.

    Only the first value of each test row is used as the input. Returns
    the predictions as a list in the same order as ``test``.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]


In [84]:
# Fit on A, then predict a target for each row of B (only row[0] of each B row is read).
simple_linear_regression(A,B)

[67.03771221677779,
 435.33268170361924,
 213.02851093228253,
 100.21743919757432,
 70.35568491485745,
 173.21283855532667,
 40.49393063214057,
 183.16675664956563,
 100.21743919757432,
 47.129876028299876,
 53.765821424459176,
 33.85798523598126,
 37.17595793406092,
 47.129876028299876,
 37.17595793406092,
 23.904067141742303,
 43.81190333022022,
 76.99163031101675,
 67.03771221677779,
 159.94094776300807,
 206.39256553612321,
 73.67365761293709,
 123.4432480841319,
 103.53541189565397,
 126.76122078221154]

In [87]:
# Test simple linear regression
# Small hand-made dataset of [x, y] pairs.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
# evaluate_algorithm1 fits and scores on these same rows, so this is a training-set error.
rmse = evaluate_algorithm1(dataset, simple_linear_regression)
# Report the error with three decimal places.
print('RMSE: %.3f' % (rmse))

[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
RMSE: 0.693


In [None]:
##### Example: simple linear regression on the insurance CSV dataset

In [107]:
# Example of Simple Linear Regression on the Swedish Insurance Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt


In [108]:
def load_csv(filename):
    """Read a CSV file and return its non-empty rows.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings as produced by
        ``csv.reader``. Blank lines, which the reader yields as empty
        rows, are skipped. No numeric conversion is done.

    Raises:
        OSError: if the file cannot be opened.
    """
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; otherwise newlines inside quoted fields can be altered.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

In [109]:
#Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of ``dataset`` to float, in place.

    String values are stripped of surrounding whitespace before the
    conversion. Values that are already numeric are passed straight to
    ``float``, so running the conversion a second time on the same data
    is harmless.

    Args:
        dataset: list of mutable rows; each row is modified in place.
        column: index of the column to convert.

    Returns:
        None.

    Raises:
        ValueError: if a value cannot be parsed as a float.
    """
    for row in dataset:
        value = row[column]
        # Only strings have surrounding whitespace to remove.
        if isinstance(value, str):
            value = value.strip()
        row[column] = float(value)

In [125]:
# Split a dataset into a train and test set
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a train list and a test list.

    Every value of every row is converted with ``float`` into a new list,
    so the caller's rows are not modified. Rows are then moved at random
    into the train list until it holds at least ``split * len(dataset)``
    rows; the rows left over form the test list.

    Args:
        dataset: sized collection of rows whose values ``float`` accepts.
        split: fraction of rows to place in the train list (e.g. 0.6).

    Returns:
        A ``(train, test)`` tuple of lists of float rows.

    Raises:
        ValueError: if a value cannot be converted to float, or if
            ``split`` asks for more rows than exist (``randrange`` is
            then called with an empty range).
    """
    train_size = split * len(dataset)
    # Convert once up front; the popped rows are already float lists, so
    # the train list needs no re-conversion inside the loop.
    remaining = [list(map(float, row)) for row in dataset]
    train = list()
    while len(train) < train_size:
        index = randrange(len(remaining))
        train.append(remaining.pop(index))
    return train, remaining

In [126]:
 #Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Compute the root mean squared error between two sequences.

    Entries are matched by index over ``range(len(actual))``; entries of
    ``predicted`` beyond that length are not read.
    """
    count = len(actual)
    accumulated = 0.0
    for index in range(count):
        difference = predicted[index] - actual[index]
        accumulated += difference ** 2
    mean_squared = accumulated / float(count)
    return sqrt(mean_squared)


In [127]:
# Random split: a receives rows until it holds at least 60% of them; b keeps the rest.
a,b=train_test_split(dataset, 0.6)

In [128]:
# Display the rows the split left out of the training partition.
b

[[124.0, 422.2],
 [57.0, 170.9],
 [14.0, 77.5],
 [10.0, 65.3],
 [23.0, 39.6],
 [2.0, 6.6],
 [3.0, 4.4],
 [23.0, 113.0],
 [6.0, 14.8],
 [9.0, 48.7],
 [9.0, 52.1],
 [3.0, 13.2],
 [29.0, 103.9],
 [7.0, 77.5],
 [5.0, 40.3],
 [16.0, 59.6],
 [13.0, 89.9],
 [60.0, 202.4],
 [41.0, 181.3],
 [11.0, 21.3],
 [27.0, 92.6],
 [15.0, 32.1],
 [9.0, 87.4],
 [31.0, 209.8],
 [53.0, 244.6]]

In [129]:
# Evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split, *args):
    """Score ``algorithm`` on a random train/test split and return the RMSE.

    The data is divided with ``train_test_split(dataset, split)``. The
    algorithm is called as ``algorithm(train, test_set, *args)``, where
    ``test_set`` holds copies of the test rows with their last value
    replaced by ``None``. Its predictions are compared with the real last
    values of the test rows using ``rmse_metric``.
    """
    def _hide_target(row):
        # Copy the row so the real target stays available for scoring.
        masked = list(row)
        masked[-1] = None
        return masked

    train, test = train_test_split(dataset, split)
    test_set = [_hide_target(row) for row in test]
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    return rmse_metric(actual, predicted)


In [130]:
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the average of ``values`` as a float."""
    summed = sum(values)
    how_many = float(len(values))
    return summed / how_many


In [131]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Sum the products of the paired deviations of ``x`` and ``y``.

    Each deviation is taken from the supplied mean. The sum is not divided
    by the number of points, and one pair is used per index of ``x``.
    """
    running = 0.0
    for index, x_item in enumerate(x):
        x_deviation = x_item - mean_x
        y_deviation = y[index] - mean_y
        running += x_deviation * y_deviation
    return running

In [132]:
# Calculate the variance of a list of numbers
def variance(values, mean):
    """Sum the squared differences between each value and ``mean``.

    Despite the name, the sum is not divided by the number of values.
    """
    squares = [(item - mean) ** 2 for item in values]
    return sum(squares)


In [133]:
# Calculate coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of a line through ``dataset``.

    Column 0 of each row is the input and column 1 the target; both are
    converted with ``float`` first. Returns ``[b0, b1]`` with intercept
    ``b0`` and slope ``b1``.
    """
    first_column = [row[0] for row in dataset]
    xs = [float(entry) for entry in first_column]
    second_column = [row[1] for row in dataset]
    ys = [float(entry) for entry in second_column]
    xs_mean = mean(xs)
    ys_mean = mean(ys)
    # Slope: summed co-deviation over summed squared deviation of the inputs.
    numerator = covariance(xs, xs_mean, ys, ys_mean)
    denominator = variance(xs, xs_mean)
    b1 = numerator / denominator
    b0 = ys_mean - b1 * xs_mean
    return [b0, b1]



In [134]:
# Simple linear regression algorithm
def simple_linear_regression(train, test):
    """Predict a target for each test row from a line fitted on ``train``.

    Only the first value of each test row is read. The returned list has
    one prediction per test row, in order.
    """
    b0, b1 = coefficients(train)

    def _predict(row):
        return b0 + b1 * row[0]

    return [_predict(row) for row in test]

In [135]:
# Simple linear regression on insurance dataset
# Fix the random seed so the randrange-based train/test split is repeatable.
seed(1)
# load and prepare data
filename = 'insurance_1.csv'
# load_csv returns every field as a string; no numeric conversion happens here.
dataset = load_csv(filename)


In [136]:
# Display the loaded rows; the fields are still the strings read from the CSV.
dataset

[['108', '392.5'],
 ['19', '46.2'],
 ['13', '15.7'],
 ['124', '422.2'],
 ['40', '119.4'],
 ['57', '170.9'],
 ['23', '56.9'],
 ['14', '77.5'],
 ['45', '214'],
 ['10', '65.3'],
 ['5', '20.9'],
 ['48', '248.1'],
 ['11', '23.5'],
 ['23', '39.6'],
 ['7', '48.8'],
 ['2', '6.6'],
 ['24', '134.9'],
 ['6', '50.9'],
 ['3', '4.4'],
 ['23', '113'],
 ['6', '14.8'],
 ['9', '48.7'],
 ['9', '52.1'],
 ['3', '13.2'],
 ['29', '103.9'],
 ['7', '77.5'],
 ['4', '11.8'],
 ['20', '98.1'],
 ['7', '27.9'],
 ['4', '38.1'],
 ['0', '0'],
 ['25', '69.2'],
 ['6', '14.6'],
 ['5', '40.3'],
 ['22', '161.5'],
 ['11', '57.2'],
 ['61', '217.6'],
 ['12', '58.1'],
 ['4', '12.6'],
 ['16', '59.6'],
 ['13', '89.9'],
 ['60', '202.4'],
 ['41', '181.3'],
 ['37', '152.8'],
 ['55', '162.8'],
 ['41', '73.4'],
 ['11', '21.3'],
 ['27', '92.6'],
 ['8', '76.1'],
 ['3', '39.9'],
 ['17', '142.1'],
 ['13', '93'],
 ['13', '31.9'],
 ['15', '32.1'],
 ['8', '55.6'],
 ['29', '133.3'],
 ['30', '194.5'],
 ['24', '137.9'],
 ['9', '87.4'],
 ['31', 

In [137]:
# evaluate algorithm
# Fraction of rows that train_test_split places in the training partition.
split = 0.6
# The split is random, so the score depends on the seed set earlier.
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
# Report the error with three decimal places.
print('RMSE: %.3f' % (rmse))

RMSE: 33.630
