add type hints #49

Open · wants to merge 11 commits into base: master
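This PR adds type annotations across four modules (`lexico_semantic_norms`, `semantic_measures`, `utils`, plus small doc tweaks). Hints are only useful if something checks them; below is a minimal verification sketch using mypy's Python API (the file path assumes a local checkout and is illustrative, not part of this PR):

```python
# Hypothetical check, not part of this PR: run mypy programmatically
# over one of the newly annotated modules (requires `pip install mypy`).
from mypy import api

stdout, stderr, exit_status = api.run(["src/TRUNAJOD/utils.py"])
print(stdout)
print("clean" if exit_status == 0 else "type errors found")
```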
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
 <p align="center">
-<img style="width: 50%; height: 50%" src="https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png">
+<img style="width: 30%; height: 30%" src="https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png">
 </p>

# TRUNAJOD: A text complexity library for text analysis built on spaCy
2 changes: 1 addition & 1 deletion docs/api_reference/ttr.rst
@@ -6,4 +6,4 @@ Type Token Ratios
.. automodule:: TRUNAJOD.ttr
:members:

-.. bibliography:: ttr.bib
\ No newline at end of file
+.. bibliography:: ttr.bib
6 changes: 6 additions & 0 deletions docs/api_reference/ttr_ref.bib
@@ -0,0 +1,6 @@
+@misc{herdan1961problemes,
+  title={Probl{\`e}mes et m{\'e}thodes de la statistique linguistique},
+  author={Herdan, Gustav},
+  year={1961},
+  publisher={JSTOR}
+}
Binary file modified imgs/trunajod_logo.png
22 changes: 14 additions & 8 deletions src/TRUNAJOD/lexico_semantic_norms.py
@@ -13,6 +13,7 @@
We provide two downloadable models of these variables, which come from
:cite:`duchon2013espal` and :cite:`guasch2016spanish`.
"""
+from spacy.tokens import Doc
from TRUNAJOD.lexicosemantic_norms_espal import LEXICOSEMANTIC_ESPAL
from TRUNAJOD.lexicosemantic_norms_espal import LSNorm
from TRUNAJOD.utils import lemmatize
@@ -28,7 +29,12 @@ class LexicoSemanticNorm(object):
:cite:`guasch2016spanish`.
"""

-    def __init__(self, doc, lexico_semantic_norm_dict, lemmatizer=None):
+    def __init__(
+        self,
+        doc: Doc,
+        lexico_semantic_norm_dict: dict,
+        lemmatizer: dict = None,
+    ):
"""Initialize lexico semantic norm object.

Calculate average over number of tokens given a text.
@@ -102,47 +108,47 @@ def __init__(self, doc, lexico_semantic_norm_dict, lemmatizer=None):
self.__context_avilability /= count
self.__familiarity /= count

-    def get_arousal(self):
+    def get_arousal(self) -> float:
"""Get arousal.

:return: Average arousal.
:rtype: float
"""
return self.__arousal

-    def get_concreteness(self):
+    def get_concreteness(self) -> float:
"""Get concreteness.

:return: Average concreteness.
:rtype: float
"""
return self.__concreteness

-    def get_context_availability(self):
+    def get_context_availability(self) -> float:
"""Get context_availability.

:return: Average context_availability.
:rtype: float
"""
return self.__context_avilability

-    def get_familiarity(self):
+    def get_familiarity(self) -> float:
"""Get familiarity.

:return: Average familiarity.
:rtype: float
"""
return self.__familiarity

-    def get_imageability(self):
+    def get_imageability(self) -> float:
"""Get imageability.

:return: Average imageability.
:rtype: float
"""
return self.__imageability

-    def get_valence(self):
+    def get_valence(self) -> float:
"""Get valence.

:return: Average valence.
@@ -151,7 +157,7 @@ def get_valence(self):
return self.__valence


-def get_conc_imag_familiarity(doc):
+def get_conc_imag_familiarity(doc: Doc) -> list:
"""Get lexico-semantic variables.

Computes three lexico-semantic variables: Concreteness, Imageability and
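A note on return annotations here: a bracketed list of types such as `[float, float, float]` is a list literal, not a type, so static checkers reject it; the signature above therefore uses the builtin `list`. If the intent is to guarantee exactly three floats, the conventional spelling is `Tuple[float, float, float]`. A standalone sketch (toy function, not code from this PR):

```python
from typing import Tuple


def three_scores() -> Tuple[float, float, float]:
    """Toy stand-in for a function returning exactly three floats."""
    return (0.0, 0.0, 0.0)


# The precise annotation lets a checker verify three-way unpacking.
concreteness, imageability, familiarity = three_scores()
```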
7 changes: 4 additions & 3 deletions src/TRUNAJOD/semantic_measures.py
@@ -6,9 +6,10 @@
semantic measurements require word vectors (word embeddings) obtained from
CORPUS semantics.
"""
+from spacy.tokens import Doc


-def avg_w2v_semantic_similarity(docs, N):
+def avg_w2v_semantic_similarity(docs: Doc, N: int) -> float:
"""Compute average semantic similarity between adjacent sentences.

This is using word2vec :cite:`mikolov2013word2vec` model based on SPACY
@@ -43,7 +44,7 @@ def avg_w2v_semantic_similarity(docs, N):
return avg_sim / float(N - 1)


-def get_synsets(lemma, synset_dict):
+def get_synsets(lemma: str, synset_dict: dict) -> set:
"""Return synonym set given a word lemma.

The function requires that the synset_dict is passed into it. In our case
@@ -61,7 +62,7 @@ def get_synsets(lemma, synset_dict):
return synset_dict.get(lemma, {lemma})


-def overlap(lemma_list_group, synset_dict):
+def overlap(lemma_list_group: list, synset_dict: dict) -> float:
"""Compute average overlap in a text.

Computes semantic synset overlap (synonyms), based on a lemma list group
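For reviewers sanity-checking the `get_synsets` signature, a toy sketch of its behavior (the synonym dictionary below is invented for illustration; in practice TRUNAJOD supplies one built from its Spanish synonym data):

```python
from TRUNAJOD.semantic_measures import get_synsets

# Toy synset dictionary: lemma -> set of synonyms (including the lemma).
synset_dict = {
    "perro": {"perro", "can", "chucho"},
    "correr": {"correr", "trotar"},
}

print(get_synsets("perro", synset_dict))  # {'perro', 'can', 'chucho'} (order may vary)
print(get_synsets("gato", synset_dict))   # unknown lemma falls back to {'gato'}
```

The fallback matches the visible implementation, `synset_dict.get(lemma, {lemma})`, which is also why the return annotation is `set` rather than `str`.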
34 changes: 20 additions & 14 deletions src/TRUNAJOD/utils.py
@@ -2,6 +2,8 @@
"""Utility functions for TRUNAJOD library."""
from enum import Enum

+from spacy.tokens import Doc


class SupportedModels(str, Enum):
"""Enum for supported Doc models."""
@@ -10,7 +12,7 @@ class SupportedModels(str, Enum):
STANZA = "stanza"


-def flatten(list_of_lists):
+def flatten(list_of_lists: list) -> list:
"""Flatten a list of list.

This is a utility function that takes a list of lists and
@@ -25,7 +27,9 @@ def flatten(list_of_lists):
return [item for sublist in list_of_lists for item in sublist]


-def get_sentences_lemmas(docs, lemma_dict, stopwords=[]):  # pragma: no cover
+def get_sentences_lemmas(
+    docs: Doc, lemma_dict: dict, stopwords=[]
+) -> tuple:  # pragma: no cover
"""Get lemmas from sentences.

Get different types of lemma measurements, such as noun lemmas, verb
@@ -88,7 +92,7 @@ def get_sentences_lemmas(docs, lemma_dict, stopwords=[]): # pragma: no cover
)


-def get_stopwords(filename):
+def get_stopwords(filename: str) -> set:
"""Read stopword list from file.

Assumes that the list is defined as a newline separated words. It is
@@ -109,7 +113,9 @@
return stopwords


-def get_token_lemmas(doc, lemma_dict, stopwords=[]):  # pragma: no cover
+def get_token_lemmas(
+    doc: Doc, lemma_dict: dict, stopwords=[]
+) -> tuple:  # pragma: no cover
"""Return lemmas from a sentence.

From a sentence, extracts the following lemmas:
@@ -170,7 +176,7 @@ def get_token_lemmas(doc, lemma_dict, stopwords=[]): # pragma: no cover
)


-def is_adjective(pos_tag):
+def is_adjective(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is ``ADJ``, False otherwise.

:param pos_tag: Part of Speech tag
@@ -181,7 +187,7 @@ def is_adjective(pos_tag):
return pos_tag == "ADJ"


-def is_adverb(pos_tag):
+def is_adverb(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is ``ADV``, False otherwise.

:param pos_tag: Part of Speech tag
@@ -192,7 +198,7 @@ def is_adverb(pos_tag):
return pos_tag == "ADV"


-def is_noun(pos_tag):
+def is_noun(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is ``NOUN`` or ``PROPN``, False otherwise.

:param pos_tag: Part of Speech tag
@@ -203,7 +209,7 @@ def is_noun(pos_tag):
return pos_tag == "PROPN" or pos_tag == "NOUN"


-def is_pronoun(pos_tag):
+def is_pronoun(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is ``PRON``, False otherwise.

:param pos_tag: Part of Speech tag
@@ -214,7 +220,7 @@ def is_pronoun(pos_tag):
return pos_tag == "PRON"


-def is_stopword(word, stopwords):
+def is_stopword(word: str, stopwords: set) -> bool:
"""Return ``True`` if ``word`` is in ``stopwords``, False otherwise.

:param word: Word to be checked
@@ -227,7 +233,7 @@ def is_stopword(word, stopwords):
return word in stopwords


-def is_verb(pos_tag):
+def is_verb(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is ``VERB``, False otherwise.

:param pos_tag: Part of Speech tag
@@ -238,7 +244,7 @@ def is_verb(pos_tag):
return pos_tag == "VERB"


-def is_word(pos_tag):
+def is_word(pos_tag: str) -> bool:
"""Return ``True`` if ``pos_tag`` is not punctuation, False otherwise.

This method checks that the ``pos_tag`` does not belong to the following
@@ -252,7 +258,7 @@ def is_word(pos_tag):
return pos_tag != "PUNCT" and pos_tag != "SYM" and pos_tag != "SPACE"


-def lemmatize(lemma_dict, word):
+def lemmatize(lemma_dict: dict, word: str) -> str:
"""Lemmatize a word.

Lemmatizes a word using a lemmatizer which is represented as a dict that
@@ -272,7 +278,7 @@ def lemmatize(lemma_dict, word):
return lemma_dict.get(word, word)


-def process_text(text, sent_tokenize):
+def process_text(text: str, sent_tokenize) -> list:
"""Process text by tokenizing sentences given a tokenizer.

:param text: Text to be processed
Expand All @@ -285,7 +291,7 @@ def process_text(text, sent_tokenize):
return sent_tokenize(text)


-def read_text(filename):
+def read_text(filename: str) -> str:
"""Read a ``utf-8`` encoded text file and returns the text as ``string``.

This is just a utility function, that is not recommended to use if the text
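Finally, a short sketch exercising several of the annotated helpers in `utils.py`; the toy lemma dictionary and stopword set are invented for illustration:

```python
from TRUNAJOD.utils import flatten, is_noun, is_stopword, lemmatize

lemma_dict = {"perros": "perro"}  # toy lemmatizer: surface form -> lemma
stopwords = {"el", "la", "de"}    # toy stopword set

print(flatten([[1, 2], [3]]))           # [1, 2, 3]
print(lemmatize(lemma_dict, "perros"))  # 'perro'
print(lemmatize(lemma_dict, "gato"))    # unknown words pass through: 'gato'
print(is_noun("PROPN"))                 # True
print(is_stopword("de", stopwords))     # True
```

With the annotations in place, passing e.g. a list where `lemmatize` expects a `dict` becomes a static error instead of a silent runtime surprise.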