# Relationship between common similarity metrics 
[Reference](https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/)

In [41]:
# Import some stuff
import numpy as np
import pandas as pd
import scipy.spatial.distance as spd
from pymer4.simulate import easy_multivariate_normal
from pymer4.models import Lm
import matplotlib.pyplot as plt
% matplotlib inline

In [93]:
# Prep some data: simulate a two-column sample (arguments: 50, 2, corrs=.2)
# and split its columns into the vectors a and b used by every cell below.
# A fixed seed makes the draw, and every number derived from it, repeatable
# under Restart & Run All.
# NOTE(review): the outputs saved in this notebook came from an unseeded run,
# so they will change once on the next re-run.
SEED = 42
X = easy_multivariate_normal(50, 2, corrs=.2, seed=SEED)
a, b = X[:,0], X[:,1]

## Inner product

In [94]:
# Inner product: sum of the element-wise products of a and b
a.dot(b)

-2.8071010155482368

## Covariance  
Average centered inner product

In [95]:
# Centre each vector by removing its own mean
a_centered, b_centered = a - np.mean(a), b - np.mean(b)

# Inner product of the centred vectors, averaged over the number of
# observations (a and b have the same length, so either one supplies it)
a_centered.dot(b_centered) / a.shape[0]

-0.06920945007922867

In [96]:
# Cross-check against NumPy: off-diagonal entry of the covariance matrix,
# with ddof=0 so the divisor is n (matching the hand computation)
np.cov(a, b, ddof=0)[0, 1]

-0.06920945007922867

## Cosine Similarity  
Normalized (L2) inner product

In [97]:
# The Euclidean (L2) norm is the square root of the sum of squared entries.
# Spelled out element-wise for a:
a_norm = np.sqrt(np.power(a, 2).sum())
# Spelled out as a vector product of b with itself:
b_norm = np.sqrt(b.dot(b.T))
# NumPy provides the same quantity directly, e.g. np.linalg.norm(a)

# Cosine similarity: inner product divided by the product of both norms
a.dot(b) / (a_norm * b_norm)

-0.049727707335311774

In [98]:
# Cross-check against scipy, which returns a cosine *distance*;
# subtracting it from 1 converts it back to a similarity
1 - spd.cosine(u=a, v=b)

-0.04972770733531173

## Pearson Correlation  
Normalized (L2) centered inner product

In [99]:
# Can think of this as normalized covariance OR centered cosine similarity
a_centered_norm = np.linalg.norm(a_centered)
b_centered_norm = np.linalg.norm(b_centered)
np.dot(a_centered,b_centered) / (a_centered_norm * b_centered_norm)

-0.06299972596364047

In [100]:
# Cross-check against scipy, which returns a correlation *distance*;
# subtracting it from 1 converts it back to a correlation
1 - spd.correlation(u=a, v=b)

-0.06299972596364034

## OLS (univariate w/o intercept)  
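Partially normalized inner product, where "partially" means the inner product is divided by the squared L2 norm of just one vector (the predictor) instead of the product of both vectors' norms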

In [101]:
# Same numerator as cosine similarity, but the denominator uses a's norm
# twice (a's squared norm) instead of the product of both vectors' norms
a.dot(b) / (a_norm * a_norm)

-0.06164964567554537

In [103]:
# Cross-check with a fitted linear model of B on A; the "0 +" term in the
# formula matches the no-intercept setting of this section
model = Lm(
    'B ~ 0 + A',
    data=pd.DataFrame({'A': a, 'B': b}),
)
model.fit(summarize=False)
# Last row, first column of the coefficient table
# (presumably the estimate for A -- confirm against the pymer4 version in use)
model.coefs.iloc[-1, 0]

-0.06164964567554537

## OLS (univariate w/ intercept)  
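Centered, partially normalized inner product, where "partially" means the centered inner product is divided by the squared L2 norm of just one centered vector (the predictor) instead of the product of both vectors' norms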


In [108]:
# Centring a, b, or both in the numerator gives the same value, because a
# centred vector sums to zero; the denominator is a's centred squared norm
a_centered.dot(b) / (a_centered_norm * a_centered_norm)

-0.07620054357106859

In [105]:
# Cross-check with a fitted linear model of B on A (formula without "0 +",
# matching the with-intercept setting of this section)
model = Lm(
    'B ~ A',
    data=pd.DataFrame({'A': a, 'B': b}),
)
model.fit(summarize=False)
# Last row, first column of the coefficient table
# (presumably the slope estimate for A -- confirm against the pymer4 version in use)
model.coefs.iloc[-1, 0]

-0.07620054357106855