In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# mean_squared_error is imported to measure the prediction error
from sklearn.metrics import mean_squared_error

In [2]:
# Load the housing data from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='BostonHousing.csv')

In [3]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,496,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,391,19.2,393.29,17.60
169,497,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,391,19.2,396.90,21.14
170,499,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.90,12.92
171,501,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33


In [6]:
# Shape of the frame as a (rows, columns) tuple.
df.shape

(173, 14)

In [7]:
# Number of dimensions of the frame.
df.ndim

2

In [10]:
# List every column label in the frame.
df.columns

Index(['ID', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad',
       'tax', 'ptratio', 'black', 'lstat'],
      dtype='object')

In [15]:
# Feature matrix: every column of df except 'tax', which is the target
# assigned to `y` further down (df.columns lists no median-value column).
# NOTE(review): 'ID' looks like a row identifier rather than a real
# predictor -- confirm it should be kept as a feature.
x = df.loc[:, [
    'ID', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm',
    'age', 'dis', 'rad', 'ptratio', 'black', 'lstat',
]]

In [16]:
# Show the feature matrix as the cell output.
x

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,15.2,396.90,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,496,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,19.2,393.29,17.60
169,497,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,19.2,396.90,21.14
170,499,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,19.2,396.90,12.92
171,501,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,19.2,396.90,14.33


In [17]:
# Target vector: the 'tax' column.
y = df.loc[:, 'tax']

In [18]:
# Show the target vector as the cell output.
y

0      242
1      222
2      311
3      311
4      311
      ... 
168    391
169    391
170    391
171    391
172    273
Name: tax, Length: 173, dtype: int64

In [19]:
# Hold out 25% of the rows for testing; the remaining 75% are used for training.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [20]:
# Create an ordinary least-squares linear regression model ...
model = LinearRegression()
# ... and fit it on the training split only.
model.fit(X=x_train, y=y_train)

In [21]:
# Use the fitted model to predict the target for the held-out test rows.
y_pred = model.predict(X=x_test)

In [22]:
# Show the predictions for the test split as the cell output.
y_pred

array([657.65902967, 403.16661403, 278.7348074 , 294.22754926,
       325.84458827, 677.80327916, 403.40386212, 278.50881778,
       324.26213611, 331.47610971, 612.96179468, 299.56565203,
       359.9287678 , 330.94310325, 206.53323486, 303.72306822,
       293.08244207, 283.05586507, 335.18343128, 684.52767219,
       278.6525941 , 428.36208916, 676.90306619, 211.45429991,
       243.04609269, 293.21955925, 646.79409794, 277.87667043,
       278.39098703, 665.20648267, 207.6445582 , 296.142564  ,
       318.17916406, 678.71342668, 300.26661173, 677.78798893,
       664.61131363, 665.76502096, 370.98250725, 648.84814785,
       317.62178679, 368.80022682, 316.16060517, 349.12356109])

In [23]:
# Goodness of fit on the training split. For LinearRegression, score()
# returns R^2 (coefficient of determination), not a classification accuracy.
model.score(X=x_train, y=y_train)

0.9089186488224404

In [24]:
# R^2 on the held-out test split, for comparison with the training R^2.
model.score(X=x_test, y=y_test)

0.8919567310129252

In [25]:
# Measure the prediction error on the test split with mean squared error (MSE).
mean_squared_error(y_true=y_test, y_pred=y_pred)

3209.3415382431103

In [28]:
# Root mean squared error: the square root of the MSE on the test split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

56.651050636710266