-
Notifications
You must be signed in to change notification settings - Fork 326
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Middle High German lemmatizer (#947)
Merging because the outstanding requests appear to have been fulfilled. We'll PR patch if something was missed. * Initiated lemmatization backoff for Middle High German * Added corpus * Added corpus and new lemmatizer * Added lemmatizer test * Removed useless import * Removed useless imports and comments * Removed useless import * Added corpus import in setUp * Updated docs for Middle High German lemmatizer * Removed training part from sentences which is not used * Removed useless statements * Fixed docs Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
- Loading branch information
1 parent
cd920c5
commit e7ba658
Showing
6 changed files
with
95 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
# Corpora available for Middle High German. Each entry is a dict with the
# keys 'name', 'origin' (git URL), 'location' and 'type'.
MIDDLE_HIGH_GERMAN_CORPORA = [
    {
        'name': 'middle_high_german_models_cltk',
        'origin': 'https://github.com/cltk/middle_high_german_models_cltk.git',
        'location': 'remote',
        'type': 'model'
    },
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Module for lemmatizing Middle High German | ||
""" | ||
|
||
__author__ = ['Clément Besnier <clemsciences@aol.com>', ] | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
import os | ||
from typing import List | ||
|
||
from cltk.lemmatize.backoff import IdentityLemmatizer, DictLemmatizer | ||
|
||
from cltk.utils.file_operations import open_pickle | ||
|
||
|
||
class BackoffMHGLemmatizer:
    """Suggested backoff chain for Middle High German lemmatization.

    The chain built here is a ``DictLemmatizer`` fed with the pickled
    token-to-lemma model, with an ``IdentityLemmatizer`` passed as its
    backoff.
    """

    # NOTE(review): ``get_cltk_data_dir`` is not imported in the visible part
    # of this module -- confirm it is in scope, otherwise this line raises
    # NameError as soon as the class is defined.
    models_path = os.path.normpath(
        get_cltk_data_dir() +
        '/middle_high_german/model/middle_high_german_models_cltk/lemmata/backoff')

    def __init__(self, seed: int = 3, verbose: bool = False):
        """Load the token-to-lemma model and build the backoff chain.

        :param seed: stored as ``self.seed``; not otherwise used in this class
        :param verbose: forwarded to every lemmatizer in the chain
        :raises FileNotFoundError: if ``token_to_lemma.pickle`` cannot be
            found under ``models_path``
        """
        self.models_path = BackoffMHGLemmatizer.models_path

        missing_models_message = "BackoffMHGLemmatizer requires the ```middle_high_german_models_cltk``` " \
                                 "to be in cltk_data. Please load this corpus."
        self.seed = seed
        self.verbose = verbose

        self.token_to_lemmata = []
        self.lemma_to_tokens = []

        model_file = os.path.join(self.models_path, "token_to_lemma.pickle")
        try:
            self.token_to_lemmata = open_pickle(model_file)
        except FileNotFoundError as err:
            # Re-raise the same exception type with a user-oriented message,
            # keeping the original error attached as the explicit cause.
            raise type(err)(missing_models_message) from err

        self._define_lemmatizer()

    def _define_lemmatizer(self):
        """Instantiate the lemmatizers and select the head of the chain."""
        self.backoff0 = None
        self.backoff1 = IdentityLemmatizer(verbose=self.verbose)
        self.backoff2 = DictLemmatizer(lemmas=self.token_to_lemmata,
                                       source='ReferenzKorpus Mittelhochdeutsch Lemmata',
                                       backoff=self.backoff1,
                                       verbose=self.verbose)
        # The dictionary lemmatizer is the entry point of the chain.
        self.lemmatizer = self.backoff2

    def lemmatize(self, tokens: List[str]):
        """Lemmatize ``tokens`` with the configured chain.

        :param tokens: list of token strings
        :return: whatever ``self.lemmatizer.lemmatize`` returns for ``tokens``
        """
        return self.lemmatizer.lemmatize(tokens)

    def __repr__(self):
        return '<BackoffMHGLemmatizer v0.1>'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters