In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

# code is modified from https://www.machinelearningplus.com/statistics/mahalanobis-distance/

# Load the diamonds dataset from a path relative to the notebook's working directory.
# NOTE(review): the rendered preview includes an 'Unnamed: 0' column, which looks like
# a row index that was saved into the CSV -- consider read_csv(..., index_col=0) if it
# is not needed downstream.
df = pd.read_csv('./data/diamonds.csv')
# Preview the first rows; as the last expression of the cell this is what gets displayed.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# ----- direct calculation ---- #
def pairwise_mahalanobis(x, y, inv_cov_mat):
    """Compute the Mahalanobis distance between two observations.

    Evaluates ``sqrt((x - y)^T . inv_cov_mat . (x - y))`` directly with NumPy.

    Parameters
    ----------
    x, y : array-like
        1-D observation vectors of equal length (lists, tuples or arrays).
    inv_cov_mat : array-like
        Inverse of the covariance matrix, of shape ``(len(x), len(x))``.

    Returns
    -------
    numpy.float64
        The distance. The result is ``nan`` if the quadratic form evaluates
        to a negative number, because the square root is taken as-is.
    """
    # Coerce to float arrays before subtracting: plain lists/tuples do not
    # support ``-``, and unsigned-integer arrays would wrap around instead of
    # producing negative differences.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    diff = x - y
    # Quadratic form diff^T . VI . diff, evaluated left to right.
    quad_form = np.dot(np.dot(diff, inv_cov_mat), diff)
    return np.sqrt(quad_form)

# --- using scipy implementation --- #

def scipy_mahalanobis(x, y, inv_cov_mat):
    """Return the Mahalanobis distance between ``x`` and ``y`` using SciPy.

    Thin wrapper around ``scipy.spatial.distance.mahalanobis``; the arguments
    are forwarded unchanged, with ``inv_cov_mat`` passed as the inverse
    covariance matrix.
    """
    # Imported at call time, so SciPy is only needed when this helper is used.
    from scipy.spatial import distance as scipy_distance

    return scipy_distance.mahalanobis(x, y, inv_cov_mat)

## check the result

# The three numeric features used for the distance.
feature_cols = ['carat', 'depth', 'price']

# Estimate the covariance on the full dataset; np.cov expects one variable
# per row, hence the transpose. The inverse is what both functions consume.
data = df[feature_cols]
cov_mat = np.cov(data.values.T)
inv_cov_mat = inv(cov_mat)

# Pick two observations from the first 500 rows.
# NOTE: ex_1 is positional row 0 and ex_2 is positional row 2 (the third row).
df_x = df[feature_cols].head(500)
ex_1, ex_2 = (df_x.iloc[row].values for row in (0, 2))

# Both implementations should agree on the same pair of points.
direct_dist_1_and_2 = pairwise_mahalanobis(ex_1, ex_2, inv_cov_mat)
scipy_dist_1_and_2 = scipy_mahalanobis(ex_1, ex_2, inv_cov_mat)

print('Direct: ', direct_dist_1_and_2)
print('Scipy: ', scipy_dist_1_and_2)


Direct:  3.2265490475366025
Scipy:  3.2265490475366025
