# Univariate/Simple Linear Regression 
A simple linear regression implementation using single predictor.

Dataset Used: Sweden Auto Insurance Dataset

    X = number of claims
    Y = total payment for all the claims in thousands of Swedish Kronor for geographical zones in Sweden
        

In [1]:
# import dependencies here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
df = pd.read_excel("sweden_autoinsurance.xls")

*** No CODEPAGE record, no encoding_override: will use 'ascii'


<IPython.core.display.Javascript object>

In [3]:
df.head()

Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


<IPython.core.display.Javascript object>

### Steps:

    1. Calculate mean
    2. Calculate variance
    3. Calculate covariance
    4. Calculate coefficients
    5. Predict

In [4]:
# function to split dataset into training and test data
def split_data(dataset):
    rows = dataset.shape[0]  # returns number of rows
    train_data = dataset[0 : int(0.7 * rows)]
    test_data = dataset[int(0.7 * rows) :]
    return train_data, test_data

<IPython.core.display.Javascript object>

In [5]:
# function to calculate mean
def mean(values):
    return sum(values) / (len(values))

<IPython.core.display.Javascript object>

In [6]:
# function to calculate the variance
def variance(values, mean):
    var = 0.0
    for value in values:
        var += (value - mean) ** 2
    return var / len(values)

<IPython.core.display.Javascript object>

In [7]:
# function to calculate covariance
def covariance(x, x_mean, y, y_mean):
    cov = 0.0
    for i in range(len(x)):
        cov += (x[i] - x_mean) * (y[i] - y_mean)
    return cov / len(x)

<IPython.core.display.Javascript object>

In [8]:
# estimating the coefficient
def coef(cov, var, y_mean, x_mean):
    b1 = cov / var
    b0 = y_mean - b1 * (x_mean)
    return b0, b1

<IPython.core.display.Javascript object>

In [9]:
# predict (y hat = b0 + b1(x))
def predictions(b0, b1, test_data):
    y_predictions = []
    for row in test_data["X"]:
        predicted_y = b0 + b1 * (row)
        y_predictions.append(predicted_y)
    return y_predictions

<IPython.core.display.Javascript object>

In [10]:
# univariate linear regression implementation


def univ_linear_regression(train, test):

    # estimating coefficients (b0 and b1) using train data
    mean_x = mean(train["X"])
    print("Mean of X Values: ", mean_x)

    mean_y = mean(train["Y"])
    print("Mean of Y Values: ", mean_y)

    var = variance(train["X"], mean_x)
    print("Variance of X values: ", var)

    cov = covariance(train["X"], mean_x, train["Y"], mean_y)
    print("Covariance of X and Y values: ", cov)

    b0, b1 = coef(cov, var, mean_x, mean_y)
    print("Coefficients: B0 and B1: ", b0, b1)

    # making predictions on the test data
    y_predictions = predictions(b0, b1, test)
    print("\nPredicted Y Values: ", y_predictions)

    return y_predictions

<IPython.core.display.Javascript object>

In [11]:
train, test = split_data(df)
print("Training Data\n", train)
print("Test Data\n", test)
test = test.reset_index()
test_y_predictions = univ_linear_regression(train, test)

Training Data
       X      Y
0   108  392.5
1    19   46.2
2    13   15.7
3   124  422.2
4    40  119.4
5    57  170.9
6    23   56.9
7    14   77.5
8    45  214.0
9    10   65.3
10    5   20.9
11   48  248.1
12   11   23.5
13   23   39.6
14    7   48.8
15    2    6.6
16   24  134.9
17    6   50.9
18    3    4.4
19   23  113.0
20    6   14.8
21    9   48.7
22    9   52.1
23    3   13.2
24   29  103.9
25    7   77.5
26    4   11.8
27   20   98.1
28    7   27.9
29    4   38.1
30    0    0.0
31   25   69.2
32    6   14.6
33    5   40.3
34   22  161.5
35   11   57.2
36   61  217.6
37   12   58.1
38    4   12.6
39   16   59.6
40   13   89.9
41   60  202.4
42   41  181.3
43   37  152.8
Test Data
      X      Y
44  55  162.8
45  41   73.4
46  11   21.3
47  27   92.6
48   8   76.1
49   3   39.9
50  17  142.1
51  13   93.0
52  13   31.9
53  15   32.1
54   8   55.6
55  29  133.3
56  30  194.5
57  24  137.9
58   9   87.4
59  31  209.8
60  14   95.5
61  53  244.6
62  26  187.5
Mean of X Values:  

<IPython.core.display.Javascript object>

In [12]:
error = 0.0

for i in range(len(test["Y"])):
    print(test["Y"][i])
    print(test_y_predictions[i])
    error += (test_y_predictions[i] - test["Y"][i]) ** 2

print("\nError: ", error)
mse = error / len(test["Y"])
print("MSE: ", mse)
print("RMSE: ", np.sqrt(mse))

162.8
-106.99919414891849
73.4
-155.4340709066905
21.3
-259.22309253048763
92.6
-203.8689476644625
76.1
-269.6019946928673
39.9
-286.90016496350023
142.1
-238.4652882057282
93.0
-252.3038244222345
31.9
-252.3038244222345
32.1
-245.38455631398136
55.6
-269.6019946928673
133.3
-196.94967955620933
194.5
-193.49004550208278
137.9
-214.24784982684218
87.4
-266.1423606387408
209.8
-190.0304114479562
95.5
-248.84419036810792
244.6
-113.91846225717163
187.5
-207.32858171858908

Error:  2117106.207646454
MSE:  111426.64250770812
RMSE:  333.8062948892788


<IPython.core.display.Javascript object>