# Mahalanobis Distance

## Mahalanobis distance is the distance between two points in a multivariate space, measured relative to the covariance of the variables. It's used in statistical analyses to find outliers that involve several variables.


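## Formula: $D_M(\mathbf{p}, \mathbf{q}) = \sqrt{(\mathbf{p} - \mathbf{q})^{T} S^{-1} (\mathbf{p} - \mathbf{q})}$, where $S$ is the covariance matrix of the data. With $S = I$ this reduces to the Euclidean distance $d(p, q) = \sqrt{(p_1 - q_1)^2 + (p_2 - q_2)^2}$.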

In [1]:
import numpy as np
import scipy as stats
from scipy.stats import chi2

import warnings
# Silence every warning category for the rest of the session (keeps cell output clean,
# but also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore") 

# yfinance is used to fetch the price data
import yfinance as yf
# NOTE(review): pdr_override() is presumably only relevant when data is fetched through
# pandas_datareader; this file calls yf.download directly -- confirm whether it is still needed.
yf.pdr_override()

In [2]:
# Ticker and date window for the download
symbol = 'TATAMOTORS.NS'
start = '2020-01-01'
end = '2022-01-01'

# Fetch the price history for the ticker over the chosen window
dataset = yf.download(symbol, start=start, end=end)

# Preview the first rows and the available columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 00:00:00+05:30,185.149994,186.699997,183.600006,184.449997,184.449997,25968357
2020-01-02 00:00:00+05:30,185.0,194.699997,184.600006,193.75,193.75,57289863
2020-01-03 00:00:00+05:30,192.899994,195.649994,189.25,191.100006,191.100006,47572728
2020-01-06 00:00:00+05:30,191.0,191.0,185.050003,185.649994,185.649994,28621212
2020-01-07 00:00:00+05:30,187.0,189.399994,182.300003,184.699997,184.699997,34995035


In [3]:
# Show the last rows to check where the downloaded range ends
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-27 00:00:00+05:30,465.700012,472.450012,460.100006,471.149994,471.149994,12557565
2021-12-28 00:00:00+05:30,475.200012,482.799988,472.649994,480.200012,480.200012,22860916
2021-12-29 00:00:00+05:30,478.75,481.600006,474.25,476.0,476.0,10263884
2021-12-30 00:00:00+05:30,472.549988,476.399994,468.600006,470.399994,470.399994,11923961
2021-12-31 00:00:00+05:30,472.700012,483.600006,471.850006,482.399994,482.399994,15541446


In [5]:
# Keep only the four price columns (Open, High, Low, Close) for the distance computation
dataset = dataset.drop(columns=['Adj Close', 'Volume'])
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 00:00:00+05:30,185.149994,186.699997,183.600006,184.449997
2020-01-02 00:00:00+05:30,185.0,194.699997,184.600006,193.75
2020-01-03 00:00:00+05:30,192.899994,195.649994,189.25,191.100006
2020-01-06 00:00:00+05:30,191.0,191.0,185.050003,185.649994
2020-01-07 00:00:00+05:30,187.0,189.399994,182.300003,184.699997


In [6]:
def mahalanobis_distance(x=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``x``.

    For every observation ``x_i`` the value ``(x_i - mu)^T S^{-1} (x_i - mu)``
    is computed, where ``mu`` is the per-column mean of ``data`` and ``S`` is
    the covariance matrix. The result is the *squared* distance (no square
    root is taken).

    Parameters
    ----------
    x : array-like, shape (n, d) or (d,)
        Observations to score. A DataFrame or ndarray is accepted; a single
        1-D observation is treated as one row. Columns are matched to
        ``data`` by position, so both must use the same column order.
    data : array-like, shape (m, d)
        Reference sample used for the column means and, when ``cov`` is not
        supplied, for the covariance estimate.
    cov : array-like, shape (d, d), optional
        Covariance matrix to use instead of the sample covariance of
        ``data``.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Squared Mahalanobis distance of each row of ``x``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular and cannot be inverted.
    """
    points = np.atleast_2d(np.asarray(x, dtype=float))
    reference = np.asarray(data, dtype=float)

    # Explicit axis: np.mean on a DataFrame/ndarray without an axis can collapse
    # to a single scalar instead of one mean per column.
    centered = points - reference.mean(axis=0)

    # `is None` rather than truthiness: `not cov` raises for array-valued cov.
    if cov is None:
        cov = np.cov(reference, rowvar=False)
    inv_covmat = np.linalg.inv(np.atleast_2d(np.asarray(cov, dtype=float)))

    left = np.dot(centered, inv_covmat)
    # Row-wise quadratic form; avoids building the full n x n product only to
    # read its diagonal.
    return np.einsum('ij,ij->i', left, centered)

In [7]:
# Score every row of the price table against the table's own mean and covariance.
# NOTE(review): despite its name, `df` holds a NumPy array (see the printed output), not a DataFrame.
df = mahalanobis_distance(x=dataset, data=dataset)
df

array([  0.83797494,   1.90229023,   0.34237554,   0.74284389,
         0.26653764,   0.89303376,   1.32650028,   0.55311617,
         0.82208339,   0.44748501,   0.75085857,   0.73690268,
         0.67931884,   1.28934121,   0.48632484,   0.93720258,
         0.47314023,   0.54090679,   0.4540582 ,   1.55759724,
         2.1746904 ,   0.59485752,   3.6467514 ,   0.472195  ,
         1.07745758,   9.27359373,   0.64309324,   0.64366785,
         0.7341504 ,   0.85024699,   0.8442805 ,   0.58563326,
         1.9654714 ,   0.80978278,   2.02336283,   2.55976792,
         0.71924711,   0.81637312,   0.71761334,   0.76484788,
         0.98537546,   4.23670551,   2.59344143,   1.26496601,
         4.74637812,   0.8587081 ,   1.07610174,   1.31628369,
         2.56814515,   2.91685675,   6.71709649,   1.94849477,
         2.9400704 ,   2.11919026,   2.70632498,   2.12682735,
         2.83997943,   3.77153342,   1.95966216,   1.85290373,
         2.77143391,   1.86059118,   2.29446682,   2.04

In [8]:
# Replace the Date index with a plain integer index; drop=True discards the dates instead of keeping them as a column
dataset = dataset.reset_index(drop=True)

In [9]:
# Confirm the rows are now labelled by position
dataset.head()

Unnamed: 0,Open,High,Low,Close
0,185.149994,186.699997,183.600006,184.449997
1,185.0,194.699997,184.600006,193.75
2,192.899994,195.649994,189.25,191.100006
3,191.0,191.0,185.050003,185.649994
4,187.0,189.399994,182.300003,184.699997


In [10]:
# Score each row using only the four price columns. Selecting them for `x` as well as
# `data` keeps this cell re-runnable: once 'mahalanobis' (or any later column) has been
# added to `dataset`, passing the whole frame as `x` would feed extra columns into the
# distance computation.
feature_cols = ['Open', 'High', 'Low', 'Close']
dataset['mahalanobis'] = mahalanobis_distance(x=dataset[feature_cols], data=dataset[feature_cols])
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis
0,185.149994,186.699997,183.600006,184.449997,0.837975
1,185.0,194.699997,184.600006,193.75,1.90229
2,192.899994,195.649994,189.25,191.100006,0.342376
3,191.0,191.0,185.050003,185.649994,0.742844
4,187.0,189.399994,182.300003,184.699997,0.266538


In [11]:
# Convert each squared distance into an upper-tail chi-square p-value.
# The 4 degrees of freedom match the four columns used for the distance
# (Open, High, Low, Close).
# chi2.sf is the survival function (1 - cdf) evaluated directly, so the very small
# p-values of extreme outliers are not lost to rounding in the subtraction.
dataset['p'] = chi2.sf(dataset['mahalanobis'], 4)
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis,p
0,185.149994,186.699997,183.600006,184.449997,0.837975,0.933286
1,185.0,194.699997,184.600006,193.75,1.90229,0.753724
2,192.899994,195.649994,189.25,191.100006,0.342376,0.986917
3,191.0,191.0,185.050003,185.649994,0.742844,0.945942
4,187.0,189.399994,182.300003,184.699997,0.266538,0.991871
