Skip to content

Commit

Permalink
added Ozbay metric
Browse files Browse the repository at this point in the history
  • Loading branch information
chrislit committed Dec 9, 2018
1 parent bb63a25 commit 0b063b2
Show file tree
Hide file tree
Showing 4 changed files with 295 additions and 0 deletions.
3 changes: 3 additions & 0 deletions abydos/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
- Sift4 distance (:py:class:`.Sift4` and :py:class:`.Sift4Simplest`)
- Typo distance (:py:class:`.Typo`)
- Synoname (:py:class:`.Synoname`)
- Ozbay metric (:py:class:`.Ozbay`)
Most of the distance and similarity measures have ``sim`` and ``dist`` methods,
which return a measure that is normalized to the range :math:`[0, 1]`. The
Expand Down Expand Up @@ -164,6 +165,7 @@
from ._ncd_zlib import NCDzlib, dist_ncd_zlib, sim_ncd_zlib
from ._needleman_wunsch import NeedlemanWunsch, needleman_wunsch
from ._overlap import Overlap, dist_overlap, sim_overlap
from ._ozbay import Ozbay
from ._prefix import Prefix, dist_prefix, sim_prefix
from ._ratcliff_obershelp import (
RatcliffObershelp,
Expand Down Expand Up @@ -318,6 +320,7 @@
'sim_typo',
'Synoname',
'synoname',
'Ozbay',
]


Expand Down
143 changes: 143 additions & 0 deletions abydos/distance/_ozbay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._ozbay.
Ozbay metric
"""

from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)

from . import Jaccard, LCSstr, Levenshtein
from ._distance import _Distance
from ..tokenizer import QGrams

__all__ = ['Ozbay']


class Ozbay(_Distance):
    """Ozbay metric.

    The Ozbay metric :cite:`Ozbay:2015` is a string distance measure developed
    by Hakan Ozbay, which combines Jaccard distance, Levenshtein distance, and
    longest common substring distance.

    The normalized variant should be considered experimental.

    .. versionadded:: 0.4.0
    """

    # Component measures are built once at class-definition time and shared by
    # all instances.
    _lev = Levenshtein()
    # Jaccard over 1-grams with no start/stop padding.
    # NOTE(review): scaler='set' presumably counts each distinct character
    # once -- confirm against the QGrams tokenizer.
    _jac = Jaccard(tokenizer=QGrams(qval=1, start_stop='', scaler='set'))
    _lcs = LCSstr()

    def dist_abs(self, src, tar):
        """Return the Ozbay metric.

        The Levenshtein distance is weighted according to the Jaccard
        distance of the two strings and then scaled by their longest common
        substring similarity.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Ozbay metric

        Examples
        --------
        >>> cmp = Ozbay()
        >>> round(cmp.dist_abs('cat', 'hat'), 12)
        0.75
        >>> round(cmp.dist_abs('Niall', 'Neil'), 12)
        6.0
        >>> round(cmp.dist_abs('Colin', 'Cuilen'), 12)
        7.714285714286
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3.0

        .. versionadded:: 0.4.0

        """
        lev_dist = self._lev.dist_abs(src, tar)
        jac_metric = self._jac.dist_abs(src, tar)
        lcs_metric = self._lcs.sim(src, tar)

        if jac_metric == 1.0:
            # Jaccard distance is maximal: use the raw edit distance.
            ozbay_metric = lev_dist
        elif jac_metric == 0.0:
            # Jaccard distance is zero: use the edit distance relative to the
            # source length. This is computed only in this branch, and the
            # empty source is guarded, so that an empty ``src`` can never
            # trigger a ZeroDivisionError.
            if lev_dist == 0:
                ozbay_metric = 0.0
            elif src:
                ozbay_metric = lev_dist / len(src)
            else:
                ozbay_metric = float(lev_dist)
        else:
            ozbay_metric = jac_metric * lev_dist

        if lcs_metric > 0.0:
            ozbay_metric /= lcs_metric
        else:
            # No common substring similarity to divide by, so scale by the
            # shorter length instead.
            # NOTE(review): this multiplier is 0 whenever either string is
            # empty, which zeroes the metric on this branch -- confirm that
            # this is intended.
            ozbay_metric *= min(len(src), len(tar))

        return ozbay_metric

    def dist(self, src, tar):
        """Return the normalized Ozbay distance.

        The absolute metric is divided by the product of the string lengths
        over the longest common substring distance. This normalization should
        be considered experimental.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Ozbay distance

        Examples
        --------
        >>> cmp = Ozbay()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.027777777778
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.24
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.214285714286
        >>> cmp.dist('ATCG', 'TAGC')
        0.140625

        .. versionadded:: 0.4.0

        """
        dist = self.dist_abs(src, tar)
        # A zero metric is returned unchanged, which also skips the division
        # below.
        if dist:
            return dist / (len(src) * len(tar) / self._lcs.dist(src, tar))
        return dist


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()
9 changes: 9 additions & 0 deletions docs/abydos.bib
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,15 @@ @Article{Otsuka:1936
Volume = {6}
}

@Misc{Ozbay:2015,
Title = {Ozbay metric},

Author = {Ozbay, Hakan},
Year = {2015},

Url = {https://github.com/hakanozbay/ozbay-metric}
}

@InProceedings{Paice:1990,
Title = {Another stemmer},
Author = {Paice, {Chris D.}},
Expand Down
140 changes: 140 additions & 0 deletions tests/distance/test_distance_ozbay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tests.distance.test_distance_ozbay
This module contains unit tests for abydos.distance.Ozbay
"""

from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)

import unittest

from abydos.distance import Ozbay


class OzbayTestCases(unittest.TestCase):
    """Test Ozbay metric functions.

    abydos.distance.Ozbay
    """

    cmp = Ozbay()

    def _verify(self, measure, cases):
        """Check ``measure(src, tar)`` against each expected value, in order.

        Each case is a ``(src, tar, expected, exact)`` tuple. Exact cases are
        compared with ``assertEqual``; the others with ``assertAlmostEqual``.
        """
        for src, tar, expected, exact in cases:
            check = self.assertEqual if exact else self.assertAlmostEqual
            check(measure(src, tar), expected)

    def test_ozbay_dist_abs(self):
        """Test abydos.distance.Ozbay.dist_abs."""
        base_cases = (
            ('', '', 0.0, True),
            ('piccadilly', 'bandage', 73.63636363636363, False),
            ('abcd', 'efgh', 16, False),
        )
        # Test cases from https://github.com/hakanozbay/ozbay-metric
        reference_cases = (
            ('ban', 'ban', 0.0, True),
            ('ban', 'bane', 0.3333333333, False),
            ('ban', 'band', 0.3333333333, False),
            ('ban', 'bat', 0.75, True),
            ('ban', 'bands', 1.3333333333, False),
            ('ban', 'banana', 2.0, True),
            ('ban', 'bandana', 2.3333333333, False),
            ('ban', 'bandit', 3.0, True),
            ('ban', 'bandage', 4.6666666666, False),
            ('piccadilly', 'piccadilly', 0.0, True),
            ('piccadilly', 'piccadilyl', 0.25, True),
            ('piccadilly', 'piccadlily', 0.3333333333, False),
            ('piccadilly', 'picacdilly', 0.4, True),
            ('piccadilly', 'picadily', 0.4, True),
            ('picadily', 'piccadilly', 0.5, True),
            ('piccadilly', 'picacdlily', 1.3333333333, False),
            ('ipcacdily', 'piccadilly', 1.4814814814814814, False),
            ('piccadilly', 'ipcacdily', 1.333333333, False),
            ('piccadilly', 'pcicadlyil', 2.0, True),
        )

        self._verify(self.cmp.dist_abs, base_cases)
        self._verify(self.cmp.dist_abs, reference_cases)

    def test_ozbay_dist(self):
        """Test abydos.distance.Ozbay.dist."""
        base_cases = (
            ('', '', 0, True),
            ('piccadilly', 'bandage', 0.9467532467532467, False),
            ('abcd', 'efgh', 1.0, False),
        )
        # Test cases from https://github.com/hakanozbay/ozbay-metric
        reference_cases = (
            ('ban', 'ban', 0.0, True),
            ('ban', 'bane', 0.006944444444444444, False),
            ('ban', 'band', 0.006944444444444444, False),
            ('ban', 'bat', 0.02777777777777778, True),
            ('ban', 'bands', 0.03555555555555556, False),
            ('ban', 'banana', 0.05555555555555555, True),
            ('ban', 'bandana', 0.0634920634920635, False),
            ('ban', 'bandit', 0.08333333333333333, True),
            ('ban', 'bandage', 0.126984126984127, False),
            ('piccadilly', 'piccadilly', 0.0, True),
            ('piccadilly', 'piccadilyl', 0.0004999999999999999, True),
            ('piccadilly', 'piccadlily', 0.0013333333333333335, False),
            ('piccadilly', 'picacdilly', 0.002, True),
            ('piccadilly', 'picadily', 0.0025, True),
            ('picadily', 'piccadilly', 0.003125, True),
            ('piccadilly', 'picacdlily', 0.009333333333333334, False),
            ('ipcacdily', 'piccadilly', 0.011522633744855966, False),
            ('piccadilly', 'ipcacdily', 0.01037037037037037, False),
            ('piccadilly', 'pcicadlyil', 0.014, True),
        )

        self._verify(self.cmp.dist, base_cases)
        self._verify(self.cmp.dist, reference_cases)


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 0b063b2

Please sign in to comment.