# Dataset Exploration: Boston House Pricing
* source: http://www.neural.cz/dataset-exploration-boston-house-pricing.html

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## We will load the Boston dataset directly instead of getting it via sklearn

In [None]:
# Load the data from a local file; note that it is read as tab-separated
# (sep='\t') even though the file name ends in .csv.
df = pd.read_csv('data/Boston.csv', sep='\t')

In [None]:
# rows are data points, columns are features (attributes)
instance_count = df.shape[0]
attr_count = df.shape[1]

In [None]:
# number of rows (data points) in df
instance_count

In [None]:
# number of columns (attributes) in df
attr_count

In [None]:
# preview the first few rows to sanity-check the parsed columns
df.head()

## Legend
* __`CRIM`__ = per capita crime rate by town
* __`ZN`__ = proportion of residential land zoned for lots over 25,000 sq. ft.
* __`INDUS`__ = proportion of non-retail business acres per town
* __`CHAS`__ = Charles River dummy variable (1 if the tract bounds the river, 0 otherwise)
* __`NOX`__ = nitrogen oxides concentration
* __`RM`__ = avg. rooms per dwelling
* __`AGE`__ = proportion of owner-occupied units built prior to 1940
* __`DIS`__ = weighted mean of distances to five Boston employment centers
* __`RAD`__ = index of accessibility to radial highways
* __`TAX`__ = full-value property-tax rate per $10,000
* __`PTRATIO`__ = pupil-teacher ratio by town
* __`LSTAT`__ = lower status of the population (percent)
* __`MEDV`__ = median value of owner-occupied homes, in thousands of dollars

## We can use __`.describe()`__ to get descriptive statistics about our data

In [None]:
# per-column summary statistics for the numeric columns
df.describe()

## Let's look for correlations between features

In [None]:
# DataFrame.corr() supports three correlation coefficients:
# Pearson, Spearman rank correlation, and Kendall Tau rank correlation.
# We'll use Pearson; the result is a square column-by-column matrix.
pearson = df.corr(method='pearson')
pearson

## Let's look at each feature's correlation with the target

In [None]:
# Last row of the matrix = correlations with the last column (the target);
# the final entry is dropped because it is the target's self-correlation.
corr_with_target = pearson.iloc[-1, :-1]
corr_with_target

In [None]:
# order from most positive to most negative correlation (returns a new Series)
predictivity = corr_with_target.sort_values(ascending=False)
predictivity

## Strong negative correlations are important too, so let's rank by absolute value

In [None]:
# Rank the features by |correlation|, strongest first, keeping signed values.
# argsort() yields integer *positions*, so select with .iloc: passing integer
# keys to plain [] on a label-indexed Series relies on a positional fallback
# that is deprecated in pandas >= 2.1.
corr_with_target.iloc[
    abs(corr_with_target).argsort().to_numpy()[::-1]]

In [None]:
# Pick out the strongly correlated attribute pairs (target excluded).
attrs = pearson.iloc[:-1, :-1]  # all except target
# keep only correlations whose magnitude exceeds this threshold
threshold = 0.5
# Drop the auto-correlations by position (the diagonal) rather than with a
# float `!= 1.0` test, which would also discard a genuinely perfectly
# correlated pair of two different attributes.
off_diagonal = ~np.eye(len(attrs), dtype=bool)
# {('LSTAT', 'TAX'): 0.543993, ('INDUS', 'RAD'): 0.595129, ...
important_corrs = attrs.where((attrs.abs() > threshold) & off_diagonal) \
    .unstack().dropna().to_dict()

# Every pair shows up twice, as (a, b) and (b, a). Keying on the sorted pair
# collapses the two entries in a deterministic order, without depending on
# both floats being bit-identical (as hashing them in a set would).
unique_pairs = {tuple(sorted(pair)): corr
                for pair, corr in important_corrs.items()}
#     attribute pair  correlation
# 0     (AGE, INDUS)     0.644779
# 1     (INDUS, RAD)     0.595129
# ...
unique_important_corrs = pd.DataFrame(
    list(unique_pairs.items()), columns=['attribute pair', 'correlation'])
# sorted by absolute value, strongest first
unique_important_corrs = unique_important_corrs.iloc[
    abs(unique_important_corrs['correlation']).argsort()[::-1]]

unique_important_corrs

## Let's Visualize!

In [None]:
%matplotlib inline
import seaborn as sns # heatmap replaces corrplot from original post
sns.set(rc={'figure.figsize':(11, 8)})
# Using all correlations
sns.heatmap(pearson, annot=True); 

In [None]:
# same annotated heatmap, now drawn with the 'coolwarm' colormap
sns.heatmap(pearson, cmap='coolwarm', annot=True); 

In [None]:
# Boolean mask that is True on the identity diagonal and everything above it;
# those cells are handed to heatmap() as the ones to hide.
# Drop `mask=mask` from the call below to see the "whole" heatmap.
mask = np.triu(np.ones(pearson.shape, dtype=bool))

# square=True makes the cells square; annotations are left off this time
sns.heatmap(data=pearson, mask=mask, square=True, cmap='coolwarm');

In [None]:
# grid of pairwise plots across the columns of df
sns.pairplot(df);

## You take it from here...
* Create a linear regression to predict __`MEDV`__ using your choice of features
* Evaluate your model using MAE (Mean Absolute Error)

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error