In [None]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Figures inline and set visualization style
%matplotlib inline
sns.set()

In [29]:
# Load the dataset
Bostondataset = pd.read_excel("Boston housing dataset.xlsx")
Bostondataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [30]:
# Store target variable of training data (the Y) in a new variable
MEDV = Bostondataset.MEDV
MEDV.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

In [31]:
data = Bostondataset.drop(['MEDV'], axis=1)
#data.info()
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [32]:
#pd.DataFrame.hist(data, figsize = [15,15]);

In [33]:
# Choose several features only
data_5_vars = data[['AGE', 'CRIM', 'INDUS','LSTAT', 'NOX']]
data_5_vars.head()

Unnamed: 0,AGE,CRIM,INDUS,LSTAT,NOX
0,65.2,0.00632,2.31,4.98,0.538
1,78.9,0.02731,7.07,9.14,0.469
2,61.1,0.02729,7.07,4.03,0.469
3,45.8,0.03237,2.18,2.94,0.458
4,54.2,0.06905,2.18,5.33,0.458


In [34]:
# Assign feature matrix as X, and target MEDV value as Y
X = data_5_vars.values
y = MEDV.values


In [35]:
#X.shape
#y.shape

In [36]:
#split the data into 80% training set and 20% testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [37]:
#X_train.shape
#X_test.shape
#y_train.shape
#y_test.shape

In [38]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)


In [39]:
# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

# The mean absolute error
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred))

# The MAPE
print("Mean absolute percentage error: %.2f" % (np.mean(np.abs((y_test - y_pred) / y_test))*100))

# Explained R_square: 1 is perfect prediction
print('R square: %.2f' % r2_score(y_test, y_pred))


Coefficients: 
 [ 0.04164408 -0.06739454 -0.18345129 -0.96472378  0.94971864]
Mean squared error: 52.37
Mean absolute error: 5.21
Mean absolute percentage error: 26.26
R square: 0.54


In [40]:
# Display the prediction
prediction_output = np.hstack((X_test, y_test.reshape(len(X_test),1)))
prediction_output.shape

(102, 6)

In [41]:
prediction_output = np.hstack((prediction_output, y_pred.reshape(len(X_test),1)))
prediction_output.shape

(102, 7)

In [42]:
#prediction_output.dtype

In [43]:
prediction_output_table = pd.DataFrame(prediction_output, columns=['AGE', 'CRIM', 'INDUS','LSTAT', 'NOX','Real MEDV','Predicted MEDV'])
prediction_output_table.head()

Unnamed: 0,AGE,CRIM,INDUS,LSTAT,NOX,Real MEDV,Predicted MEDV
0,83.4,5.20177,18.1,11.48,0.77,22.7,22.713425
1,36.6,0.04666,1.52,8.61,0.404,30.3,26.574691
2,95.3,0.25387,6.91,30.81,0.448,14.4,6.641351
3,92.6,6.71772,18.1,17.44,0.713,13.4,17.190496
4,56.7,13.0751,18.1,14.76,0.58,20.1,17.726168


In [44]:
# Accuracy in absolute percentage
MAPE = 1 - ((np.fabs(y_test - y_pred)/y_test).mean())
print("Accuracy = ", (MAPE * 100), "%")

Accuracy =  73.74165090362158 %


The final model is:
MEDV = 0.04164408*X1 - 0.06739454*X2 - 0.18345129*X3 - 0.96472378*X4 + 0.94971864*X5

    