Added cosine_similarity

guiem · Mar 13, 2018 · 0f848f4 · 0f848f4
1 parent 958e6ad
commit 0f848f4
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 6 deletions.
diff --git a/README.rst b/README.rst
@@ -15,4 +15,5 @@ Installation
 Index of utilities
 ------------------
 - :code:`dyn_mean`: computes the mean based on a previous mean plus a new value. Useful when mean is built incrementally, it saves the usage of huge arrays.
-- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value.
+- :code:`dyn_stdev`: computes the stdev based on a previous stdev plus a new value.
+- :code:`cosine_similarity`: computes the cosine similarity between two vectors, or between each row of a matrix and a vector.
diff --git a/gputils.py b/gputils.py
@@ -1,4 +1,7 @@
 import math
+import pandas as pd
+import numpy as np
+
 
 def dyn_mean(val, prev_mean, n):
     """Dynamic mean: computes the mean based on a previous mean plus a new value. Useful when mean is built
@@ -30,4 +33,25 @@ def dyn_stdev(val, prev_stdev, prev_mean, n):
     if n == 1:
         return 0
     curr_mean = dyn_mean(val, prev_mean, n)
-    return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n))
+    return math.sqrt(((n-1)*prev_stdev*prev_stdev + (val - prev_mean)*(val - curr_mean)) / float(n))
+
+
+def cosine_similarity(u, v):
+    """Return the cosine similarity between ``u`` and ``v``.
+
+    ``u`` is either a single vector (``pd.Series``) or a matrix holding one vector per row (``pd.DataFrame``);
+    in the matrix case the result holds one similarity per row of ``u``, each taken with respect to ``v``.
+    Inputs must support ``.dot`` and ``.transpose`` (pandas objects, NumPy arrays); plain lists are not accepted.
+    A zero-norm vector yields ``nan`` because the division is not guarded.
+
+    Keyword arguments:
+    u -- vector of dimensions 1xn or matrix of dimensions mxn (where m is the number of vectors)
+    v -- vector of dimensions 1xn
+    """
+    dot = u.dot(v.transpose())
+    # Always take norms along the last axis (per row for a matrix). Special-casing a single column would treat
+    # an mx1 matrix as one vector and collapse all of its rows into a single column norm.
+    norm_u = np.linalg.norm(u, axis=-1)
+    norm_v = np.linalg.norm(v)
+    similarity = dot / (norm_u * norm_v)
+    return similarity
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,17 @@
+certifi==2018.1.18
+chardet==3.0.4
 coverage==4.5.1
+coveralls==1.2.0
+docopt==0.6.2
+idna==2.6
 numpy==1.14.1
+pandas==0.22.0
+pkginfo==1.4.1
+python-dateutil==2.7.0
+pytz==2018.3
+requests==2.18.4
+requests-toolbelt==0.8.0
+six==1.11.0
+tqdm==4.19.5
+twine==1.9.1
+urllib3==1.22
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 
 setup(
     name='gputils',
-    version='1.0.5',
+    version='1.0.6',
     description='Variety of utilities that may come handy in diverse projects. ',
     long_description=long_description,
     url='https://github.com/guiem/gputils',
@@ -27,11 +27,11 @@
     ],
     #packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
     py_modules=["gputils"],
-    install_requires=[],
+    install_requires=['numpy', 'pandas'],
     python_requires='>=3',
     extras_require={
         # 'dev': ['check-manifest'],
-        'test': ['coverage','numpy'],
+        'test': ['coverage'],
     },
     project_urls={
         'Bug Reports': 'https://github.com/guiem/gputils/issues',

diff --git a/tests/tests.py b/tests/tests.py
@@ -1,6 +1,7 @@
 import unittest, time
 import numpy as np
-from gputils import dyn_mean, dyn_stdev
+import pandas as pd
+from gputils import dyn_mean, dyn_stdev, cosine_similarity
 
 
 class TestMethods(unittest.TestCase):
@@ -41,6 +42,23 @@ def test_dyn_stdev(self):
             self.assertAlmostEqual(curr_std, test_stdev, 10)
             self.assertLessEqual(t1_test - t0_test, t1_trusted - t0_trusted) # ensuring we are faster
 
+    def test_cosine_similarity(self):
+        """cosine_similarity matches a hand-computed value for two vectors and for rows of a matrix."""
+        # Vector vs. vector: floats are compared with a tolerance, never with exact equality.
+        u = pd.Series([1, 2, 3])
+        v = pd.Series([3, 2, 1])
+        manual = np.sum(u * v) / (np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v)))
+        self.assertAlmostEqual(manual, cosine_similarity(u, v), 10)
+
+        # Matrix vs. vector: one similarity per row, labelled with the row index of E.
+        np.random.seed(69)
+        E = pd.DataFrame(np.random.randn(4, 3), index=['house', 'rocket', 'science', 'Trump'])
+        similarity = cosine_similarity(E, E.loc['science'])
+        self.assertAlmostEqual(1.0, similarity['science'], 10)  # a vector is parallel to itself
+        u = E.loc['science']
+        v = E.loc['Trump']
+        manual = np.sum(u * v) / (np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v)))
+        self.assertAlmostEqual(manual, similarity['Trump'], 5)
 
 if __name__ == '__main__':
     unittest.main()