Skip to content

Commit

Permalink
release: v1.0.6
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Sep 30, 2018
2 parents bcdbb72 + 3b76ea8 commit d4e302f
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 87 deletions.
14 changes: 8 additions & 6 deletions src/cophi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
**cophi** is a Python library for handling, modeling and processing text corpora. You
can easily pipe a collection of text files using the high-level API:
r"""
**cophi** is a Python library for handling, modeling and processing text
corpora. You can easily pipe a collection of text files using the
high-level API:
.. code-block:: python
Expand All @@ -12,7 +13,8 @@
token_pattern=r"\p{L}+\p{P}?\p{L}+")
There are also a plenty of complexity metrics for measuring lexical richness of (literary) texts.
There are also plenty of complexity metrics for measuring the lexical
richness of (literary) texts.
Measures that use sample size and vocabulary size:
* Type-Token Ratio :math:`TTR`
Expand Down Expand Up @@ -40,8 +42,8 @@
Parameters of probabilistic models:
* Orlov’s :math:`Z`
For a more detailed description and the used formulas, have a look at the :module:`complexity` module.
For a more detailed description and the formulas used, have a look at the
:mod:`complexity` module.
"""


from cophi.api import document, corpus
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = (1, 0, 5)
VERSION = (1, 0, 6)

__version__ = ".".join(map(str, VERSION))
81 changes: 50 additions & 31 deletions src/cophi/complexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
cophi.complexity
~~~~~~~~~~~~~~~~
This module implements measures that assess the linguistic and stylistic
complexity of (literary) texts.
This module implements measures that assess the linguistic
and stylistic complexity of (literary) texts.
:math:`N` is the Absolute number of tokens, and :math:`V` the Absolute number of types.
:math:`H` is the Absolute number of types occuring only once (hapax legomena),
:math:`D` is the Absolute number of types occuring twice (dislegomena).
:math:`N` is the absolute number of tokens, and :math:`V` the
absolute number of types. :math:`H` is the absolute number of
types occurring only once (hapax legomena), :math:`D` is the
absolute number of types occurring twice (dislegomena).
This module was taken from `here <https://github.com/tsproisl/Linguistic_and_Stylistic_Complexity>`_.
This module was taken from
`here <https://github.com/tsproisl/Linguistic_and_Stylistic_Complexity>`_.
"""

import math
Expand All @@ -19,7 +21,7 @@
# use num_types + num_tokens (int):

def ttr(num_types, num_tokens):
"""Calculate Type-Token Ratio (TTR).
r"""Calculate Type-Token Ratio (TTR).
Used formula:
.. math::
Expand All @@ -33,7 +35,7 @@ def ttr(num_types, num_tokens):


def guiraud_r(num_types, num_tokens):
"""Calculate Guiraud’s R (1954).
r"""Calculate Guiraud’s R (1954).
Used formula:
.. math::
Expand All @@ -42,12 +44,12 @@ def guiraud_r(num_types, num_tokens):
Parameters:
num_types (int): Absolute number of types.
num_tokens (int): Absolute number of tokens.
"""
"""
return num_types / math.sqrt(num_tokens)


def herdan_c(num_types, num_tokens):
"""Calculate Herdan’s C (1960, 1964).
r"""Calculate Herdan’s C (1960, 1964).
Used formula:
.. math::
Expand All @@ -61,7 +63,7 @@ def herdan_c(num_types, num_tokens):


def dugast_k(num_types, num_tokens):
"""Calculate Dugast’s k (1979).
r"""Calculate Dugast’s k (1979).
Used formula:
.. math::
Expand All @@ -75,7 +77,7 @@ def dugast_k(num_types, num_tokens):


def maas_a2(num_types, num_tokens):
"""Calculate Maas’ a^2 (1972).
r"""Calculate Maas’ a^2 (1972).
Used formula:
.. math::
Expand All @@ -85,11 +87,12 @@ def maas_a2(num_types, num_tokens):
num_types (int): Absolute number of types.
num_tokens (int): Absolute number of tokens.
"""
return (math.log(num_tokens) - math.log(num_types)) / (math.log(num_tokens) ** 2)
return (math.log(num_tokens)
- math.log(num_types)) / (math.log(num_tokens) ** 2)


def dugast_u(num_types, num_tokens):
"""Calculate Dugast’s U (1978, 1979).
r"""Calculate Dugast’s U (1978, 1979).
Used formula:
.. math::
Expand All @@ -99,11 +102,12 @@ def dugast_u(num_types, num_tokens):
num_types (int): Absolute number of types.
num_tokens (int): Absolute number of tokens.
"""
return (math.log(num_tokens) ** 2) / (math.log(num_tokens) - math.log(num_types))
return (math.log(num_tokens) ** 2) / (math.log(num_tokens)
- math.log(num_types))


def tuldava_ln(num_types, num_tokens):
"""Calculate Tuldava’s LN (1977).
r"""Calculate Tuldava’s LN (1977).
Used formula:
.. math::
Expand All @@ -117,7 +121,7 @@ def tuldava_ln(num_types, num_tokens):


def brunet_w(num_types, num_tokens):
"""Calculate Brunet’s W (1978).
r"""Calculate Brunet’s W (1978).
Used formula:
.. math::
Expand All @@ -132,7 +136,7 @@ def brunet_w(num_types, num_tokens):


def cttr(num_types, num_tokens):
"""Calculate Carroll’s Corrected Type-Token Ration (CTTR) (1964).
r"""Calculate Carroll’s Corrected Type-Token Ratio (CTTR) (1964).
Used formula:
.. math::
Expand All @@ -146,7 +150,7 @@ def cttr(num_types, num_tokens):


def summer_s(num_types, num_tokens):
"""Calculate Summer’s S.
r"""Calculate Summer’s S.
Used formula:
.. math::
Expand All @@ -162,7 +166,7 @@ def summer_s(num_types, num_tokens):
# use num_types + part of freq_spectrum:

def sichel_s(num_types, freq_spectrum):
"""Calculate Sichel’s S (1975).
r"""Calculate Sichel’s S (1975).
Used formula:
.. math::
Expand All @@ -176,7 +180,7 @@ def sichel_s(num_types, freq_spectrum):


def michea_m(num_types, freq_spectrum):
"""Calculate Michéa’s M (1969, 1971).
r"""Calculate Michéa’s M (1969, 1971).
Used formula:
.. math::
Expand All @@ -190,7 +194,7 @@ def michea_m(num_types, freq_spectrum):


def honore_h(num_types, num_tokens, freq_spectrum):
"""Calculate Honoré’s H (1979).
r"""Calculate Honoré’s H (1979).
Used formula:
.. math::
Expand All @@ -200,13 +204,14 @@ def honore_h(num_types, num_tokens, freq_spectrum):
num_types (int): Absolute number of types.
freq_spectrum (dict): Counted occurring frequencies.
"""
return 100 * (math.log(num_tokens) / (1 - ((freq_spectrum[1]) / (num_types))))
return 100 * (math.log(num_tokens)
/ (1 - ((freq_spectrum[1]) / (num_types))))


# use num_tokens + freq_spectrum:

def entropy(num_tokens, freq_spectrum):
"""Calculate entropy S.
r"""Calculate entropy S.
Parameters:
num_tokens (int): Absolute number of tokens.
Expand All @@ -219,7 +224,7 @@ def entropy(num_tokens, freq_spectrum):


def yule_k(num_tokens, freq_spectrum):
"""Calculate Yule’s K (1944).
r"""Calculate Yule’s K (1944).
Used formula:
.. math::
Expand Down Expand Up @@ -259,10 +264,10 @@ def herdan_vm(num_types, num_tokens, freq_spectrum):
return math.sqrt((freq_spectrum * a ** 2).sum() - b)



# use probabilistic models:

def orlov_z(num_tokens, num_types, freq_spectrum, max_iterations=100, min_tolerance=1):
def orlov_z(num_tokens, num_types, freq_spectrum,
max_iterations=100, min_tolerance=1):
"""Calculate Orlov’s Z (1983), approximated via Newton’s method.
Parameters:
Expand All @@ -274,22 +279,36 @@ def orlov_z(num_tokens, num_types, freq_spectrum, max_iterations=100, min_tolera
p_star = most_frequent / num_tokens
z = num_tokens / 2
for i in range(max_iterations):
next_z = z - (_get_z(num_tokens, num_types, p_star, z) / _derivative(num_tokens, num_types, p_star, z))
next_z = z - (_get_z(num_tokens,
num_types,
p_star, z) / _derivative(num_tokens,
num_types,
p_star,
z))
abs_diff = abs(z - next_z)
z = next_z
if abs_diff <= min_tolerance:
break
return z


def _get_z(num_tokens, num_types, p_star, z):
    """Private function for :func:`orlov_z`.

    Evaluates the expression that :func:`orlov_z` divides by
    :func:`_derivative` in each Newton step::

        (z / ln(p_star * z)) * (N / (N - z)) * ln(N / z) - V

    Parameters:
        num_tokens (int): Absolute number of tokens (``N``).
        num_types (int): Absolute number of types (``V``).
        p_star (float): Factor multiplied with ``z`` inside the logarithm.
        z (float): Current estimate of Z.

    Returns:
        float: Value of the expression at ``z``.

    Raises:
        ValueError: If ``p_star * z`` or ``num_tokens / z`` is not positive
            (``math.log`` domain error).
        ZeroDivisionError: If ``z`` is zero, ``z == num_tokens`` or
            ``p_star * z == 1``.
    """
    log_pz = math.log(p_star * z)
    return ((z / log_pz)
            * (num_tokens / (num_tokens - z))
            * math.log(num_tokens / z) - num_types)


def _derivative(num_tokens, num_types, p_star, z):
    """Private function for :func:`orlov_z`.

    Evaluates the denominator of the Newton step in :func:`orlov_z`, i.e.
    the derivative with respect to ``z`` of the expression computed by
    :func:`_get_z`::

        N * ((z - N) * ln(p_star * z)
             + ln(N / z) * (N * ln(p_star * z) - N + z))
        / ((N - z) ** 2 * ln(p_star * z) ** 2)

    Parameters:
        num_tokens (int): Absolute number of tokens (``N``).
        num_types (int): Absolute number of types. Not used in the
            computation; kept so the signature matches :func:`_get_z`.
        p_star (float): Factor multiplied with ``z`` inside the logarithm.
        z (float): Current estimate of Z.

    Returns:
        float: Value of the derivative at ``z``.

    Raises:
        ValueError: If ``p_star * z`` or ``num_tokens / z`` is not positive
            (``math.log`` domain error).
        ZeroDivisionError: If ``z`` is zero, ``z == num_tokens`` or
            ``p_star * z == 1``.
    """
    log_pz = math.log(p_star * z)
    log_nz = math.log(num_tokens / z)
    bracket = ((z - num_tokens) * log_pz
               + log_nz * (num_tokens * log_pz - num_tokens + z))
    numerator = num_tokens * bracket
    denominator = ((num_tokens - z) ** 2) * (log_pz ** 2)
    return numerator / denominator


# other:
Expand Down Expand Up @@ -324,4 +343,4 @@ def wrapper(measure):
"simpson_d": simpson_d,
"herdan_vm": herdan_vm,
"orlov_z": orlov_z}
return measures[measure]
return measures[measure]

0 comments on commit d4e302f

Please sign in to comment.