# Dataset Exploration: Boston House Pricing
## Bohumír Zámečník
http://www.neural.cz/dataset-exploration-boston-house-pricing.html


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# We will load the Boston dataset directly from a local CSV file instead of
# getting it through sklearn.
# NOTE: the path is relative, so it is resolved against the working directory.
df = pd.read_csv('data/Boston.csv')

In [None]:
# Rows are data points (instances); columns are features (attributes).
instance_count = len(df.index)
attr_count = len(df.columns)

In [None]:
# number of rows (data points)
instance_count

In [None]:
# number of columns (attributes)
attr_count

In [None]:
# preview the first few rows
df.head()

In [None]:
# CRIM = per capita crime rate by town
# ZN = proportion of residential land zoned for lots over 25,000 sq. ft.
# INDUS = proportion of non-retail business acres per town
# CHAS = Charles River dummy variable
# NOX = nitrogen oxides concentration
# RM = avg. rooms per dwelling
# AGE = proportion of owner-occupied units built prior to 1940
# DIS = weighted mean of distances to five Boston employment centers
# RAD = index of accessibility to radial highways
# TAX = full-value property-tax rate per $10,000
# PTRATIO = pupil-teacher ratio by town
# LSTAT = lower status of the population (percent)
# MEDV = median value of owner-occupied homes in $1000s

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

In [None]:
# pandas offers three correlation coefficients via the corr() function:
# Pearson, Spearman rank correlation, and Kendall Tau rank correlation.
# We'll use Pearson, which measures linear association between two columns.

# square matrix: both its rows and its columns are the columns of df
pearson = df.corr(method='pearson')
pearson

In [None]:
# Correlation of every attribute with the target/answer: take the last row of
# the matrix (the target is taken to be the last column) and leave out its
# final entry, which is the target's correlation with itself.
corr_with_target = pearson.iloc[-1, :-1]
corr_with_target

In [None]:
# order attributes from most positive to most negative correlation with the
# target; sort_values returns a new Series and leaves corr_with_target as is
predictivity = corr_with_target.sort_values(ascending=False)

In [None]:
# signed correlations with the target, in descending order
predictivity

In [None]:
# Strong negative correlations are important too, so order the attributes by
# the magnitude of their correlation (strongest first) while keeping the sign.
# Positions come from a plain NumPy argsort and are applied with .iloc:
# indexing a label-indexed Series with integer keys through [] relies on a
# positional fallback that pandas has deprecated.
corr_with_target.iloc[np.argsort(corr_with_target.abs().to_numpy())[::-1]]

In [None]:
# It might be interesting to select some strong correlations between
# attribute pairs.
attrs = pearson.iloc[:-1, :-1]  # all except target
threshold = 0.5

# Keep only important correlations and drop the auto-correlations. The
# diagonal is excluded by position rather than by comparing values with 1.0:
# a float diagonal entry is not guaranteed to be exactly 1.0, and a perfect
# correlation between two *different* attributes must not be discarded.
off_diagonal = ~np.eye(len(attrs), dtype=bool)
is_important = (attrs.abs() > threshold).to_numpy() & off_diagonal

# Maps (attribute, attribute) -> correlation; the matrix is symmetric, so each
# pair shows up in both orders:
# {('LSTAT', 'TAX'): 0.543993, ('INDUS', 'RAD'): 0.595129, ...
important_corrs = attrs.where(is_important).unstack().dropna().to_dict()

# Collapse (A, B) and (B, A) into a single alphabetically ordered pair.
unique_pairs = {
    tuple(sorted(pair)): corr for pair, corr in important_corrs.items()
}

#     attribute pair  correlation
# 0     (AGE, INDUS)     0.644779
# 1     (INDUS, RAD)     0.595129
# ...
unique_important_corrs = pd.DataFrame(
    list(unique_pairs.items()), columns=['attribute pair', 'correlation'])
# sorted by absolute value, strongest first
unique_important_corrs = unique_important_corrs.iloc[
    np.argsort(unique_important_corrs['correlation'].abs().to_numpy())[::-1]]

In [None]:
# one row per attribute pair, ordered by the magnitude of the correlation
unique_important_corrs

## Let's Visualize

In [None]:
# IPython magic: render matplotlib figures inside the notebook
%matplotlib inline
import seaborn as sns  #heatmap replaces corrplot
# default figure size for the plots that follow
sns.set(rc={'figure.figsize':(11, 8)})
# Using all correlations, with each cell annotated by its value.
# The trailing semicolon suppresses the cell's text output.
sns.heatmap(pearson, annot=True); 

In [None]:
# display annotations and change the colors: the 'coolwarm' colormap is used
# here instead of the default one
sns.heatmap(pearson, cmap='coolwarm', annot=True); 

In [None]:
# Generate a mask for the upper triangle, diagonal included: the correlation
# matrix is symmetric, so the lower triangle alone carries all the information.
# Remove use of the mask below to see the "whole" heatmap.
# The builtin bool is used as dtype: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(pearson, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# square=True makes the cells square; masked cells are not drawn and the
# annotations are left off
sns.heatmap(pearson, cmap='coolwarm', mask=mask, square=True);

In [None]:
# distribution of the MEDV column, using plt.hist's default number of bins
attr = df['MEDV']
plt.hist(attr);

In [None]:
# same histogram with a finer resolution of 50 bins
plt.hist(attr, bins=50);

In [None]:
# histogram of MEDV drawn by seaborn
# NOTE(review): distplot is deprecated since seaborn 0.11 in favour of
# histplot / displot; switch once the minimum seaborn version allows it.
sns.distplot(attr);

### For integer-valued data (e.g., categories) automatic quantization into a pre-defined number of bins might not be the best option.
### We'd like to quantize according to the original distinct values. For that we can just compute this kind of histogram ourselves and use a bar plot.
* Example for __RAD__, an integer-valued (categorical) attribute: the index of accessibility to radial highways:

In [None]:
# Hand-made histogram for a categorical column: count the occurrences of each
# distinct RAD value and draw one bar per value.
cat_attr = df['RAD']
h = cat_attr.value_counts()
values = h.index  # the distinct RAD values
counts = h        # how many rows carry each value
plt.bar(values, counts);

In [None]:
# MEDV plotted against DIS, one point per row
plt.scatter(df['DIS'], df['MEDV']);

In [None]:
x, y = df['DIS'], df['MEDV']
# semi-transparent markers make overlapping points show up as darker areas
plt.scatter(x, y, alpha=0.5)

# or via jointplot (with histograms aside).
# x and y are passed by keyword: seaborn 0.12+ rejects positional data
# arguments, while the keyword form is accepted by every version.
sns.jointplot(x=x, y=y, kind='scatter', joint_kws={'alpha':0.5});

In [None]:
# hexagonal binning shows point density instead of individual markers.
# x and y are passed by keyword: seaborn 0.12+ rejects positional data
# arguments.
sns.jointplot(x=df['DIS'], y=df['MEDV'], kind='hex');

In [None]:
# kernel density estimate of the joint distribution; a standalone alternative:
#sns.kdeplot(x=df['DIS'], y=df['MEDV'], shade=True)
# or, with the marginal distributions aside (x and y by keyword, as seaborn
# 0.12+ rejects positional data arguments):
sns.jointplot(x=df['DIS'], y=df['MEDV'], kind='kde');

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import mean_absolute_error
# NOTE(review): `model` and `test_X` are not defined in any cell visible here;
# presumably a fitted estimator and a held-out feature split created in
# another cell. Confirm before running this cell.
y_hat = model.predict(test_X)