# Multiple Linear Regression


In [3]:
# import all the required libraries and put matplotlib in inline mode to plot on the notebook
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

In [4]:
def compute_polynomial_model(x, coef, intercept):
    """Compute the polynomial given the input x, the intercept and the coefficients"""
    min_x = min(x)
    max_x = max(x)
    xp = np.arange(min_x, max_x, (max_x-min_x)/100.0)

    x = xp
    yp = intercept

    for w in coef:
        yp = yp + w * x
        x = x * xp
    return xp,yp

In [5]:
def generate_variables(original_column, degree):
    """Generates the list of variables for a polynomial approximator"""
    v = [original_column]
    for d in range(2,degree+1):
        v.append(original_column+str(d))
    return v

In [6]:
def add_degrees(dataset,original_column, degree):
    """Given a column name and the degree of a polynomial, it adds 
    all the extra columns needed for a polynomial approximator"""
    
    new_dataset = dataset
    current_column = original_column
    for d in range(2,degree+1):
        column_name = original_column+str(d)

        new_dataset[column_name] = new_dataset[current_column]*dataset[original_column]

        current_column = column_name

    return new_dataset



In [7]:
def plot_approximator(x,y,xp,yp,title=""):
    """Plots the original data (x,y) and a set of point (xp,yp) showing the model approximation"""
    font = {'family' : 'sans',
        'size'   : 14}
    plt.rc('font', **font)

    plt.scatter(x, y,  color='blue')
    plt.plot(xp, yp, color='red', linewidth=3)
    plt.xlabel("LSTAT")
    plt.ylabel("MEDV")
    
    if (title!=""):
        plt.title(title)

    plt.xlim([0,40])
    plt.ylim([0,60])
    plt.show()

## Read the data already imputed

In [8]:
dataset = pd.read_csv('./data/dataset_imputed.csv')
dataset.columns
dataset.describe()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Rain,Snow,Thunderstorm,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
count,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,...,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0,511683.0
mean,1374.191908,0.028936,0.830006,0.382839,7976.056789,5.729829,259.852115,4064.780288,12301.934999,14053.262375,...,0.606581,0.080767,0.064632,0.560437,0.012678,0.283908,0.142977,0.566972,0.004223,0.428805
std,216.428751,0.167627,0.375628,0.48608,11507.493361,3.354778,186.267465,2734.751409,9073.078515,2758.942613,...,0.488509,0.272477,0.245875,0.496334,0.11188,0.450893,0.35005,0.495495,0.06485,0.494906
min,1000.0,0.0,0.0,0.0,47.0,0.0,0.0,0.0,344.0,9893.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1188.0,0.0,1.0,0.0,1057.0,3.0,166.0,2575.0,7215.0,11849.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1374.0,0.0,1.0,0.0,3307.0,6.0,252.0,4026.0,9643.0,15017.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,1561.0,0.0,1.0,1.0,9673.0,9.0,346.0,5547.0,15566.0,15931.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
max,1748.0,1.0,1.0,1.0,85070.0,10.0,2206.0,26641.0,32221.0,23931.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# compute the data inputs
dataset_train_x = dataset[generate_variables('StoreID',1)].values
print(dataset_train_x.shape)
print(1)


x = dataset_train_x.reshape(506, 1)

# compute the data output
dataset_train_y = dataset.Region.values
y = dataset_train_y.reshape(506, 1)

# apply simple linear regression to fit the data
regr = linear_model.LinearRegression()
regr.fit(x, y)

# model output for the input data
yp = regr.predict(x)

# compute the model output as a line
xm,ym = compute_polynomial_model(x[:,0],regr.coef_, regr.intercept_)

# compute rss cost
rss = sum((yp-y)*(yp-y))
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y, yp))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y, yp))

# the cost as R^2
r2 = regr.score(x,y)

title = "degree = 1 : RSS = "+str(round(rss[0],1)) + " R2="+str(round(r2,2))

# plot the result
plot_approximator(x[:,0],y,xm,ym,title)



(511683, 1)
1


ValueError: cannot reshape array of size 511683 into shape (506,1)