Skip to content

Commit

Permalink
Added cosine_similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
guiem committed Mar 13, 2018
1 parent 958e6ad commit 0f848f4
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 6 deletions.
3 changes: 2 additions & 1 deletion README.rst
Expand Up @@ -15,4 +15,5 @@ Installation
Index of utilities
------------------
- :code:`dyn_mean`: computes the mean based on a previous mean plus a new value. Useful when the mean is built incrementally, since it avoids keeping huge arrays in memory.
- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value.
- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value.
- :code:`cosine_similarity`: computes the cosine similarity between two vectors, or between every row vector of a matrix and a vector.
26 changes: 25 additions & 1 deletion gputils.py
@@ -1,4 +1,7 @@
import math
import pandas as pd
import numpy as np


def dyn_mean(val, prev_mean, n):
"""Dynamic mean: computes the mean based on a previous mean plus a new value. Useful when mean is built
Expand Down Expand Up @@ -30,4 +33,25 @@ def dyn_stdev(val, prev_stdev, prev_mean, n):
if n == 1:
return 0
curr_mean = dyn_mean(val, prev_mean, n)
return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n))
return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n))


def cosine_similarity(u, v):
    """Cosine similarity: computes the standard cosine similarity between two vectors, or between every row
    vector of a matrix and a vector.

    Returns the cosine similarity between u and v, or the similarities of every row vector in u (if it is a
    matrix) with regards to v.

    Note: for simplicity the vectors should be pd.Series, and u may be a pd.DataFrame for the matrix case. You
    can add compatibility with other data entries if needed, fork and pull request! :)

    No special handling is done for zero-norm vectors: the denominator is then zero and the division result is
    whatever the underlying numeric types produce.

    Keyword arguments:
    u -- vector of dimensions 1xn or matrix of dimensions mxn (where m is the number of vectors)
    v -- vector of dimensions 1xn
    """
    dot = u.dot(v.transpose())
    # Always reduce over the last axis: this yields a single norm for a vector and one norm per row for a
    # matrix. Choosing the axis from the column count would collapse an m x 1 matrix into one norm for the
    # whole column instead of one norm per row.
    norm_u = np.linalg.norm(u, axis=-1)
    norm_v = np.linalg.norm(v)
    return dot / (norm_u * norm_v)
15 changes: 15 additions & 0 deletions requirements.txt
@@ -1,2 +1,17 @@
certifi==2018.1.18
chardet==3.0.4
coverage==4.5.1
coveralls==1.2.0
docopt==0.6.2
idna==2.6
numpy==1.14.1
pandas==0.22.0
pkginfo==1.4.1
python-dateutil==2.7.0
pytz==2018.3
requests==2.18.4
requests-toolbelt==0.8.0
six==1.11.0
tqdm==4.19.5
twine==1.9.1
urllib3==1.22
6 changes: 3 additions & 3 deletions setup.py
Expand Up @@ -12,7 +12,7 @@

setup(
name='gputils',
version='1.0.5',
version='1.0.6',
description='Variety of utilities that may come handy in diverse projects. ',
long_description=long_description,
url='https://github.com/guiem/gputils',
Expand All @@ -27,11 +27,11 @@
],
#packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
py_modules=["gputils"],
install_requires=[],
install_requires=['numpy', 'pandas'],
python_requires='>=3',
extras_require={
# 'dev': ['check-manifest'],
'test': ['coverage','numpy'],
'test': ['coverage'],
},
project_urls={
'Bug Reports': 'https://github.com/guiem/gputils/issues',
Expand Down
20 changes: 19 additions & 1 deletion tests/tests.py
@@ -1,6 +1,7 @@
import unittest, time
import numpy as np
from gputils import dyn_mean, dyn_stdev
import pandas as pd
from gputils import dyn_mean, dyn_stdev, cosine_similarity


class TestMethods(unittest.TestCase):
Expand Down Expand Up @@ -41,6 +42,23 @@ def test_dyn_stdev(self):
self.assertAlmostEqual(curr_std, test_stdev, 10)
self.assertLessEqual(t1_test - t0_test, t1_trusted - t0_trusted) # ensuring we are faster

def test_cosine_similarity(self):
    """Check cosine_similarity against hand-computed values, for vector/vector and matrix/vector inputs."""
    # Vector against vector.
    u = pd.Series([1, 2, 3])
    v = pd.Series([3, 2, 1])
    similarity = cosine_similarity(u, v)
    manual = 10 / (np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v)))  # 10 == 1*3 + 2*2 + 3*1
    # Both values are floats obtained through different routines, so compare with a tolerance instead of
    # requiring bit-for-bit equality.
    self.assertAlmostEqual(manual, similarity, 10)

    # Matrix (one labelled row per vector) against one of its own rows.
    np.random.seed(69)
    E = pd.DataFrame(np.random.randn(4, 3), index=['house', 'rocket', 'science', 'Trump'])
    word = 'science'
    w = E.loc[word]
    similarity = cosine_similarity(E, w)
    # A vector compared with itself must have similarity 1.
    self.assertAlmostEqual(1.0, similarity['science'], 10)
    u = E.loc['science']
    v = E.loc['Trump']
    manual = np.sum(u * v) / ((np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v))))
    self.assertAlmostEqual(manual, similarity['Trump'], 5)

if __name__ == '__main__':
unittest.main()

0 comments on commit 0f848f4

Please sign in to comment.