From 0f848f4e3eeb6e15be8562dbc7f1c01a6a120486 Mon Sep 17 00:00:00 2001 From: guiem Date: Tue, 13 Mar 2018 10:50:27 +0000 Subject: [PATCH] Added cosine_similarity --- README.rst | 3 ++- gputils.py | 26 +++++++++++++++++++++++++- requirements.txt | 15 +++++++++++++++ setup.py | 6 +++--- tests/tests.py | 20 +++++++++++++++++++- 5 files changed, 64 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 5afb77f..ef552a7 100644 --- a/README.rst +++ b/README.rst @@ -15,4 +15,5 @@ Installation Index of utilities ------------------ - :code:`dyn_mean`: computes the mean based on a previous mean plus a new value. Useful when mean is built incrementally, it saves the usage of huge arrays. -- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value. \ No newline at end of file +- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value. +- :code:`cosine_similarity`: computes the cosine similarity between two vectors, or between each row of a matrix and a vector. \ No newline at end of file diff --git a/gputils.py b/gputils.py index 3300b68..7afab01 100644 --- a/gputils.py +++ b/gputils.py @@ -1,4 +1,7 @@ import math +import pandas as pd +import numpy as np + def dyn_mean(val, prev_mean, n): """Dynamic mean: computes the mean based on a previous mean plus a new value. Useful when mean is built @@ -30,4 +33,25 @@ def dyn_stdev(val, prev_stdev, prev_mean, n): if n == 1: return 0 curr_mean = dyn_mean(val, prev_mean, n) - return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n)) \ No newline at end of file + return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n)) + + +def cosine_similarity(u, v): + """Cosine similarity: computes the standard cosine similarity between two vectors or a matrix of vectors and a + vector. + Returns the cosine similarity between u and v, or a list of similarities of every vector in u (if it is a matrix) + with respect to v. 
+ + Note: for simplicity the vectors should be pd.Series, or u could be a pd.DataFrame for the matrix case; you can + add compatibility with other data entries if needed, fork and pull request! :) + + Keyword arguments: + u -- vector of dimensions 1xn or matrix of dimensions mxn (where m is the number of vectors) + v -- vector of dimensions 1xn + """ + dot = u.dot(v.transpose()) + axis = 0 if len(u.shape) < 2 or u.shape[1] < 2 else 1 + norm_u = np.linalg.norm(u, axis=axis) + norm_v = np.linalg.norm(v) + similarity = dot / (norm_u * norm_v) + return similarity \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 165fab8..78f9cd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,17 @@ +certifi==2018.1.18 +chardet==3.0.4 coverage==4.5.1 +coveralls==1.2.0 +docopt==0.6.2 +idna==2.6 numpy==1.14.1 +pandas==0.22.0 +pkginfo==1.4.1 +python-dateutil==2.7.0 +pytz==2018.3 +requests==2.18.4 +requests-toolbelt==0.8.0 +six==1.11.0 +tqdm==4.19.5 +twine==1.9.1 +urllib3==1.22 diff --git a/setup.py b/setup.py index 4976dd7..986db7f 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='gputils', - version='1.0.5', + version='1.0.6', description='Variety of utilities that may come handy in diverse projects. 
', long_description=long_description, url='https://github.com/guiem/gputils', @@ -27,11 +27,11 @@ ], #packages=find_packages(exclude=['contrib', 'docs', 'tests*']), py_modules=["gputils"], - install_requires=[], + install_requires=['numpy', 'pandas'], python_requires='>=3', extras_require={ # 'dev': ['check-manifest'], - 'test': ['coverage','numpy'], + 'test': ['coverage'], }, project_urls={ 'Bug Reports': 'https://github.com/guiem/gputils/issues', diff --git a/tests/tests.py b/tests/tests.py index ac05a94..7b300e4 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,6 +1,7 @@ import unittest, time import numpy as np -from gputils import dyn_mean, dyn_stdev +import pandas as pd +from gputils import dyn_mean, dyn_stdev, cosine_similarity class TestMethods(unittest.TestCase): @@ -41,6 +42,23 @@ def test_dyn_stdev(self): self.assertAlmostEqual(curr_std, test_stdev, 10) self.assertLessEqual(t1_test - t0_test, t1_trusted - t0_trusted) # ensuring we are faster + def test_cosine_similarity(self): + u = pd.Series([1, 2, 3]) + v = pd.Series([3, 2, 1]) + similarity = cosine_similarity(u, v) + manual = 10 / (np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v))) + self.assertEqual(manual, similarity) + + np.random.seed(69) + E = pd.DataFrame(np.random.randn(4, 3), index=['house', 'rocket', 'science', 'Trump']) + word = 'science' + w = E.loc[word] + similarity = cosine_similarity(E, w) + self.assertAlmostEqual(1.0, similarity['science'], 10) + u = E.loc['science'] + v = E.loc['Trump'] + manual = np.sum(u * v) / ((np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v)))) + self.assertAlmostEqual(manual, similarity['Trump'], 5) if __name__ == '__main__': unittest.main()