In [None]:
# Numerical and tabular data handling.
import numpy as np
import pandas as pd

# Plotting; the magic below asks IPython to render matplotlib figures inline.
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# single-letter colour codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the environment pins an older release, otherwise this import
# raises ImportError.
from sklearn.datasets import load_boston

## Introduction and understanding the data

In [None]:
# Offline alternative: read the raw data file from the working directory.
# Kept commented out because it only works when 'housing.data' exists locally.
#df = pd.read_csv('housing.data', delim_whitespace=True, header=None)
#col_names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 
#             'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
#df.columns = col_names

# Load the Boston housing data bundled with scikit-learn exactly once and reuse
# the returned Bunch for the feature matrix, the target and the column names
# (previously the dataset was loaded a second time just to read feature_names).
boston = load_boston()
data, target = boston.data, boston.target

# One column per feature, plus the regression target appended as 'MEDV'.
df = pd.DataFrame(data, columns=boston.feature_names)
df['MEDV'] = target
df.head()

    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per 10K USD
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    13. LSTAT    % lower status of the population
    14. MEDV     median value of owner-occupied homes, in thousands of USD

In [None]:
# Summary statistics for every column of df (features plus MEDV), shown as
# the cell's rich output.
df.describe()

In [None]:
# One histogram per column of df, drawn on a single large figure; the yellow
# bar outlines make individual bins easier to tell apart.
df.hist(
    figsize=(16, 12),
    linewidth=1.2,
    edgecolor='yellow',
)
plt.show()

In [None]:
# Scatter-plot matrix over every column of df; the small per-panel height
# keeps the full grid on screen.
sns.pairplot(data=df, height=1.5)
plt.show()

In [None]:
# First subset of columns to inspect pairwise at a larger panel size.
col_study = ['ZN', 'INDUS', 'NOX', 'RM']
sns.pairplot(data=df.loc[:, col_study], height=3)
plt.show()

In [None]:
# Second subset of columns, this time including the target MEDV.
col_study2 = ['PTRATIO', 'B', 'LSTAT', 'MEDV']
sns.pairplot(data=df.loc[:, col_study2], height=3)
plt.show()

## Correlation and visualization of selected features

In [None]:
# Pairwise correlation matrix of all columns of df, displayed with pandas'
# current float formatting.
df.corr()

In [None]:
# From here on, display floats with a thousands separator and three decimals
# (this is a global pandas display option), then show the correlation matrix
# again under the new format.
pd.set_option('display.float_format', '{:,.3f}'.format)
df.corr()

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on the axes of a
# freshly created large figure.
plt.figure(figsize=(16, 10))
sns.heatmap(data=df.corr(), annot=True, ax=plt.gca())
plt.show()

In [None]:
# Correlation heatmap restricted to TAX, PTRATIO, LSTAT and the target MEDV.
plt.figure(figsize=(16, 10))
sns.heatmap(
    data=df.loc[:, ['TAX', 'PTRATIO', 'LSTAT', 'MEDV']].corr(),
    annot=True,
    ax=plt.gca(),
)
plt.show()

In [None]:
# Correlation heatmap restricted to ZN, INDUS, NOX and AGE.
plt.figure(figsize=(16, 10))
sns.heatmap(
    data=df.loc[:, ['ZN', 'INDUS', 'NOX', 'AGE']].corr(),
    annot=True,
    ax=plt.gca(),
)
plt.show()

In [None]:
# Column vectors of shape (n_samples, 1): RM as the single feature and MEDV
# as the target. Adding a trailing axis yields the same 2-D view as
# reshape(-1, 1) does on a 1-D array.
X = df['RM'].values[:, np.newaxis]
y = df['MEDV'].values[:, np.newaxis]
y.shape

In [None]:
# Scatter plot of rooms vs. median value with a fitted regression line.
plt.figure(figsize=(12, 10))
# x and y must be passed by keyword: positional use was deprecated in seaborn
# 0.11 and raises TypeError from 0.12 on. X and y are (n, 1) column vectors,
# so flatten them to the 1-D vectors regplot expects.
sns.regplot(x=X.ravel(), y=y.ravel())
plt.xlabel('avg num of rooms')
plt.ylabel('Median house value in K')
plt.show()

In [None]:
# Joint view of RM against MEDV: regression scatter in the centre with the
# marginal distribution of each variable on the sides.
sns.jointplot(
    data=df,
    x='RM',
    y='MEDV',
    kind='reg',
    height=10,
)
plt.show()

In [None]:
# Overlay model predictions on the RM vs. MEDV scatter plot.
plt.figure(figsize=(12,10))
# Integer room counts 3 through 10, used as the x positions of the fitted line.
x_fit = np.arange(3,11)
# Reshape to a single-column 2-D array before passing it to predict().
X_fit = x_fit.reshape(-1,1)
# NOTE(review): `model` is not defined in any cell visible above -- presumably
# a regressor fitted on X and y elsewhere in the notebook. On a fresh kernel
# run top-to-bottom this raises NameError unless an earlier cell creates it;
# confirm.
y_fit = model.predict(X_fit)
plt.scatter(df['RM'], df['MEDV'])
plt.plot(x_fit, y_fit)