# Chapter 10 - Exercise Solutions - Applied

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
from statsmodels.datasets import get_rdataset
from ISLP import load_data

# Widen NumPy's print width (the default linewidth is 75 characters) so that
# rows of the matrices displayed below are not wrapped.
np.set_printoptions(linewidth=300)

## 7

In this chapter, we mentioned the use of correlation-based distance and Euclidean distance as dissimilarity measures for hierarchical clustering. It turns out that these two measures are almost equivalent: if each observation has been centered to have mean zero and standard deviation one, and if we let $r_{ij}$ denote the correlation between the $i$th and $j$th observations, then the quantity $1 - r_{ij}$ is proportional to the squared Euclidean distance between the $i$th and $j$th observations.

On the `USArrests` data, show that this proportionality holds.

*Hint:* The Euclidean distance can be calculated using the `pairwise_distances()` function from the `sklearn.metrics` module, and correlations can be calculated using the `np.corrcoef()` function.

In [12]:
# Load the USArrests R data set through statsmodels and keep only its DataFrame.
usarrests_dataset = get_rdataset('USArrests')
USArrests = usarrests_dataset.data
USArrests.head()

Unnamed: 0_level_0,Murder,Assault,UrbanPop,Rape
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,13.2,236,58,21.2
Alaska,10.0,263,48,44.5
Arizona,8.1,294,80,31.0
Arkansas,8.8,190,50,19.5
California,9.0,276,91,40.6


In [116]:
# StandardScaler standardizes column-wise, but the exercise needs each
# observation (row) to have mean 0 and standard deviation 1. Transpose so the
# observations become columns, scale, then transpose back.
scaler = StandardScaler(with_mean=True, with_std=True)
observations_as_columns = USArrests.T
USArrests_scaled = scaler.fit_transform(observations_as_columns).T
USArrests_scaled.shape

(50, 4)

In [91]:
# Pairwise correlations between observations: with rowvar=True (the default),
# np.corrcoef treats every row of its input as one variable.
corr_matrix = np.corrcoef(USArrests_scaled, rowvar=True)
corr_matrix

array([[1.        , 0.99092502, 0.99856984, ..., 0.96759749, 0.58753266, 0.98911539],
       [0.99092502, 1.        , 0.98969621, ..., 0.93489972, 0.51058785, 0.96749454],
       [0.99856984, 0.98969621, 1.        , ..., 0.97553899, 0.61849688, 0.99361584],
       ...,
       [0.96759749, 0.93489972, 0.97553899, ..., 1.        , 0.7724072 , 0.99407361],
       [0.58753266, 0.51058785, 0.61849688, ..., 0.7724072 , 1.        , 0.7001675 ],
       [0.98911539, 0.96749454, 0.99361584, ..., 0.99407361, 0.7001675 , 1.        ]])

In [101]:
# Correlation-based dissimilarity 1 - r_ij, square-rooted so that it lives on
# the same scale as the (unsquared) Euclidean distance computed in a later cell.
one_minus_corr = 1 - corr_matrix
eucdist_appx = np.sqrt(one_minus_corr)
eucdist_appx

array([[1.49011612e-08, 9.52626680e-02, 3.78174278e-02, ..., 1.80006984e-01, 6.42236202e-01, 1.04329324e-01],
       [9.52626680e-02, 1.49011612e-08, 1.01507591e-01, ..., 2.55147560e-01, 6.99579980e-01, 1.80292712e-01],
       [3.78174278e-02, 1.01507591e-01, 1.49011612e-08, ..., 1.56400161e-01, 6.17659386e-01, 7.99009508e-02],
       ...,
       [1.80006984e-01, 2.55147560e-01, 1.56400161e-01, ..., 1.49011612e-08, 4.77066870e-01, 7.69830457e-02],
       [6.42236202e-01, 6.99579980e-01, 6.17659386e-01, ..., 4.77066870e-01, 0.00000000e+00, 5.47569632e-01],
       [1.04329324e-01, 1.80292712e-01, 7.99009508e-02, ..., 7.69830457e-02, 5.47569632e-01, 1.49011612e-08]])

In [102]:
# Euclidean distance between every pair of rows of the standardized data.
eucdist = pairwise_distances(X=USArrests_scaled, metric='euclidean')
eucdist

array([[0.        , 0.26944351, 0.10696384, ..., 0.50913664, 1.81651829, 0.29508789],
       [0.26944351, 0.        , 0.28710682, ..., 0.72166628, 1.97871099, 0.5099448 ],
       [0.10696384, 0.28710682, 0.        , ..., 0.44236646, 1.74700456, 0.22599402],
       ...,
       [0.50913664, 0.72166628, 0.44236646, ..., 0.        , 1.34934887, 0.21774093],
       [1.81651829, 1.97871099, 1.74700456, ..., 1.34934887, 0.        , 1.5487608 ],
       [0.29508789, 0.5099448 , 0.22599402, ..., 0.21774093, 1.5487608 , 0.        ]])

In [94]:
# Element-wise ratio of the Euclidean distance to sqrt(1 - r_ij); a constant
# value off the diagonal means the two measures are proportional.
# A plain `eucdist / eucdist_appx` evaluates 0/0 wherever the denominator is
# exactly zero and emits a RuntimeWarning, so divide only where the denominator
# is non-zero and leave the remaining (undefined) entries as NaN.
ratio = np.full_like(eucdist, np.nan)
np.divide(eucdist, eucdist_appx, out=ratio, where=eucdist_appx != 0)
ratio

  eucdist/eucdist_appx


array([[0.        , 2.82842712, 2.82842712, ..., 2.82842712, 2.82842712, 2.82842712],
       [2.82842712, 0.        , 2.82842712, ..., 2.82842712, 2.82842712, 2.82842712],
       [2.82842712, 2.82842712, 0.        , ..., 2.82842712, 2.82842712, 2.82842712],
       ...,
       [2.82842712, 2.82842712, 2.82842712, ..., 0.        , 2.82842712, 2.82842712],
       [2.82842712, 2.82842712, 2.82842712, ..., 2.82842712,        nan, 2.82842712],
       [2.82842712, 2.82842712, 2.82842712, ..., 2.82842712, 2.82842712, 0.        ]])

In [115]:
n_obs, n_features = USArrests.shape
I = np.identity(n_obs)
ones = np.ones((n_obs, n_obs))

# For observations standardized to mean 0 and (population) standard deviation 1
# over p features, ||x_i - x_j||^2 = 2p(1 - r_ij). The expected ratio
# eucdist / eucdist_appx is therefore sqrt(2p), derived here from the data
# rather than copied from printed output.
scale = np.sqrt(2 * n_features)

# Adding I to numerator and denominator avoids 0/0 on the diagonal; subtracting
# I, rescaling, and adding I back maps every entry to 1 when the
# proportionality holds, so `offset` should be an (n x n) matrix of ones.
offset = ((eucdist + I) / (eucdist_appx + I) - I) / scale + I
np.allclose(offset, ones)

True

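For this dataset, the proportionality holds. With each observation standardized over its $p = 4$ features, the squared Euclidean distance equals $2p\,(1 - r_{ij}) = 8\,(1 - r_{ij})$; equivalently, the Euclidean distance is $\sqrt{1 - r_{ij}}$ multiplied by the constant factor $\sqrt{8} \approx 2.82842712$.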