### LinearRegression - Predicting Output Of Insurance Company

In [2]:
#Required Imports
import pandas as pd

In [3]:
data = pd.read_csv('Insurance.csv')
data.head()

Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


In [4]:
#Lets Implement Few Function That Will Be Handy For Calculatios
#y = b0 + b1 * x

def mean(values):
    return sum(values)/float(len(values))

#The Variance Is The Sum Of Squared For Each value From The Mean Value.
def variance(values, mean):
    return sum([(val-mean)**2 for val in values])

dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]
mean_x, mean_y = mean(x), mean(y)
var_x, var_y = variance(x, mean_x), variance(y, mean_y)
print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

x stats: mean=3.000 variance=10.000
y stats: mean=2.800 variance=8.800


In [6]:
# The covariance of two groups of numbers describes how those numbers change together. 
#Covariance is a generalization of correlation. 
#Correlation describes the relationship between two groups of numbers, 
#whereas covariance can describe the relationship between two or more groups of numbers.

def covariance(X, Y, mean_x, mean_y):
    covar = 0.0
    for i in range(len(X)):
        covar += (X[i] - mean_x)*(Y[i] - mean_y)
    return covar


dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
X = [row[0] for row in dataset]
Y = [row[1] for row in dataset]
mean_x, mean_y = mean(x), mean(y)
covar = covariance(X, Y, mean_x, mean_y)
print('Covariance: %.3f' % (covar))

Covariance: 8.000


In [8]:
#B1 = sum((x(i) - mean(x)) * (y(i) - mean(y))) / sum( (x(i) - mean(x))^2 )
#B1 = Covariance(x, y) / variance(x)

def coefficients(dataset):
    X = [row[0] for row in dataset]
    Y = [row[1] for row in dataset]
    
    mean_x, mean_y = mean(x), mean(y)
    b1 = covariance(X, Y, mean_x, mean_y)/variance(X, mean_x)
    b0 = mean_y - b1*mean_x
    return [b0, b1]

dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
b0, b1 = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

Coefficients: B0=0.400, B1=0.800


In [18]:
#Below is a function named simple_linear_regression() 
#that implements the prediction equation to make predictions on a test dataset. 
#It also ties together the estimation of the coefficients on training data from the steps above.

def simple_linear_regression(train, test):
    predictions = list()
    b0, b1 = coefficients(train)
    for row in test:
        yhat = b0 + b1 * row[0]
        predictions.append(yhat)
    return predictions

In [29]:
#As part of this example, we will also add in a function to manage the evaluation of the predictions 
#called evaluate_algorithm() and another function to estimate the 
#Root Mean Squared Error of the predictions called rmse_metric().
from math import sqrt

def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)
 
# Evaluate regression algorithm on training dataset
def evaluate_algorithm(dataset, algorithm):
    test_set = list()
    for row in dataset:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(dataset, test_set)
    print(predicted)
    actual = [row[-1] for row in dataset]
    rmse = rmse_metric(actual, predicted)
    return rmse

In [30]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
rmse = evaluate_algorithm(dataset, simple_linear_regression)
print('RMSE: %.3f' % (rmse))

[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
RMSE: 0.693


In [34]:
#Lets Apply This Algorithm To Insurance Dataset.
from random import seed
from random import randrange
from csv import reader
from math import sqrt


#Load Csv File
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as f:
        csv_reader = reader(f)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

print(load_csv('Insurance.csv'))

[['108', '392.5'], ['19', '46.2'], ['13', '15.7'], ['124', '422.2'], ['40', '119.4'], ['57', '170.9'], ['23', '56.9'], ['14', '77.5'], ['45', '214'], ['10', '65.3'], ['5', '20.9'], ['48', '248.1'], ['11', '23.5'], ['23', '39.6'], ['7', '48.8'], ['2', '6.6'], ['24', '134.9'], ['6', '50.9'], ['3', '4.4'], ['23', '113'], ['6', '14.8'], ['9', '48.7'], ['9', '52.1'], ['3', '13.2'], ['29', '103.9'], ['7', '77.5'], ['4', '11.8'], ['20', '98.1'], ['7', '27.9'], ['4', '38.1'], ['0', '0'], ['25', '69.2'], ['6', '14.6'], ['5', '40.3'], ['22', '161.5'], ['11', '57.2'], ['61', '217.6'], ['12', '58.1'], ['4', '12.6'], ['16', '59.6'], ['13', '89.9'], ['60', '202.4'], ['41', '181.3'], ['37', '152.8'], ['55', '162.8'], ['41', '73.4'], ['11', '21.3'], ['27', '92.6'], ['8', '76.1'], ['3', '39.9'], ['17', '142.1'], ['13', '93'], ['13', '31.9'], ['15', '32.1'], ['8', '55.6'], ['29', '133.3'], ['30', '194.5'], ['24', '137.9'], ['9', '87.4'], ['31', '209.8'], ['14', '95.5'], ['53', '244.6'], ['26', '187.5']]

In [39]:
#Function To Convert String To Float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

#Splitting The Dataset Into Training And Testing Sets
def train_test_split(dataset, split):
    train = list()
    train_size = split * float(len(dataset))
    dataset_copy = list(dataset)
    while len(train) < train_size :
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
        
    return train, dataset_copy


def evaluate_algorithm(dataset, algorithm, split):
    train, test = train_test_split(dataset, split)
    test_set = list()
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(train, test_set)    
    actual = [row[-1] for row in test]
    rmse = rmse_metric(actual, predicted)
    return rmse


seed(1)
# load and prepare data
filename = 'Insurance.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# evaluate algorithm
split = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % (rmse))

RMSE: 33.275
