# How to compute pairwise distances when there are missing values

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import nan_euclidean_distances
from scipy.spatial.distance import squareform, pdist

## The easiest way: when the data has no missing values (NA), use SciPy's `pdist` function

In [3]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" always draws the
# same 3 x 5 sample matrix.
# NOTE: the outputs saved in this notebook come from an earlier, unseeded run,
# so the displayed numbers will change once the notebook is re-executed.
np.random.seed(0)
a = np.random.randn(3,5)

In [4]:
# Show the 3 x 5 sample matrix; each row is treated as one observation below
a

array([[ 2.2834488 , -0.74877306,  2.3029904 , -0.33272168,  0.62174965],
       [-0.89000248,  0.4347403 ,  1.97113721, -1.15321046, -1.84173417],
       [ 0.32945355, -0.20609533,  0.83525747, -1.11246698, -0.5230511 ]])

In [5]:
# pdist returns the *condensed* distance matrix: a flat vector holding only the
# upper-triangle entries, i.e. the pairs (0, 1), (0, 2), (1, 2) for three rows
pdist(a, metric="euclidean")

array([4.28060484, 2.86101521, 2.22003405])

# squareform expands the condensed vector into the full symmetric
# n x n distance matrix (zeros on the main diagonal)
squareform(pdist(a), force="tomatrix")

# What if we have NA values?

In [12]:
# Inject a missing value at row 1, column 3 (this modifies `a` in place).
# For more background on NA values, see the "trick 2" notebook in the same folder.
a[1,3] = np.nan

In [13]:
# Confirm the change: the entry at row 1, column 3 is now nan
a

array([[ 2.2834488 , -0.74877306,  2.3029904 , -0.33272168,  0.62174965],
       [-0.89000248,  0.4347403 ,  1.97113721,         nan, -1.84173417],
       [ 0.32945355, -0.20609533,  0.83525747, -1.11246698, -0.5230511 ]])

In [14]:
# np.nan is a plain Python float, but once stored in a float64 array it is read back as np.float64

In [19]:
# Indexing a single element returns a NumPy scalar, not a built-in float
type(a[1,3])

numpy.float64

## In theory, sklearn's `pairwise_distances` should be able to handle this, since it has a `force_all_finite` argument.

In [20]:
# With the default metric ("euclidean") the input is validated and NaN is rejected.
# The failure is the point of this cell, so catch the expected ValueError and
# print it instead of letting it abort "Run All".
try:
    pairwise_distances(X=a)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

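As you can see, it doesn't work. By default (`force_all_finite=True`), `pairwise_distances` raises an error whenever the input contains `np.inf`, `np.nan` or `pd.NA`, and the default `"euclidean"` metric cannot handle missing values; for scikit-learn's built-in metrics the `force_all_finite` argument is ignored anyway. A NaN-aware metric such as `nan_euclidean` is needed instead.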

# What is the workaround?


In [21]:
# Step 1: compute NaN-aware Euclidean distances.
# nan_euclidean_distances skips coordinates that are missing in either row and
# rescales the remaining squared differences by
# (total number of coordinates) / (number of coordinates present).
# Leaving Y out compares the rows of `a` with each other.
test = nan_euclidean_distances(a)

In [22]:
# Only the pairs involving row 1 (the row with the missing value) should differ
# from the earlier pdist result; the (0, 2) pair is unaffected.
test

array([[0.        , 4.69712359, 2.86101521],
       [4.69712359, 0.        , 2.48165548],
       [2.86101521, 2.48165548, 0.        ]])

In [23]:
# Make sure the matrix is exactly symmetric by rebuilding it from its lower
# triangle: lower triangle (diagonal included) + mirrored strictly-lower triangle
test_sym = np.tril(test) + np.tril(test, -1).transpose()

In [24]:
# Inspect the symmetrised matrix
test_sym


array([[0.        , 4.69712359, 2.86101521],
       [4.69712359, 0.        , 2.48165548],
       [2.86101521, 2.48165548, 0.        ]])

In [25]:
# Force the main diagonal (the self-distances) to exactly 0, in place
test_sym[np.diag_indices_from(test_sym)] = 0

In [26]:
# Inspect the matrix once more: symmetric with a zero diagonal, as squareform expects
test_sym

array([[0.        , 4.69712359, 2.86101521],
       [4.69712359, 0.        , 2.48165548],
       [2.86101521, 2.48165548, 0.        ]])

In [27]:
# Collapse the square matrix into the condensed distance vector
# (the same flat format that pdist returns)
squareform(test_sym, force="tovector")

array([4.69712359, 2.86101521, 2.48165548])