Skip to content

Commit

Permalink
Stop fix (#949)
Browse files Browse the repository at this point in the history
Based on the discussion, I'm going to conclude that this PR is complete, but additional PRs may be filed, and discussions continued elsewhere.

* PEP 8, some condition tests and regexp fix

* Improved readability a bit

* Cleaned up the stop modules a bit

* Updated Stoplist construction docs for all languages.

* Fixed errors

* Fixed docs

* Added CorpusStoplist for Middle High German

* Fixed Old Norse STOPS_LIST

* If numpy and scikit-learn are installed, then we can initialize some attributes for stop list building.

Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
  • Loading branch information
clemsciences and todd-cook committed Jun 7, 2020
1 parent 77ffba7 commit 326f299
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 334 deletions.
21 changes: 10 additions & 11 deletions cltk/stop/classical_chinese.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,37 @@
Code for building and working with stoplists for Classical Chinese
"""

# Module metadata. The author list is assigned exactly once.
__author__ = ['Patrick J. Burns <patrick@diyclassics.org>']  # Update author list
__license__ = 'MIT License. See LICENSE.'

from cltk.stop.stop import BaseCorpusStoplist


class CorpusStoplist(BaseCorpusStoplist):
    """Corpus-based stoplist builder configured for Classical Chinese.

    Text is vectorized per character (``analyzer='char'``) rather than per
    whitespace-delimited token.
    """

    def __init__(self, language='classical_chinese'):
        """Set the punctuation set and, when possible, the vectorizers.

        :param language: language name forwarded to ``BaseCorpusStoplist``.

        ``self.vectorizer`` and ``self.tfidf_vectorizer`` are only created
        when both ``self.numpy_installed`` and ``self.sklearn_installed`` are
        truthy; otherwise they are left unset and no exception is raised here.
        """
        BaseCorpusStoplist.__init__(self, language)
        self.punctuation = '。,;?:!、《》'
        # NOTE(review): the two *_installed flags are presumably set by
        # BaseCorpusStoplist.__init__ -- confirm against cltk.stop.stop.
        if self.numpy_installed and self.sklearn_installed:
            # Imported lazily so the module still loads without scikit-learn.
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(analyzer='char', input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(analyzer='char', input='content')

    def _remove_punctuation(self, texts, punctuation):
        """Return a new list with every punctuation character deleted.

        :param texts: iterable of strings to clean.
        :param punctuation: iterable of single characters to strip out.
        :return: list of strings, one per input text, in the same order.
        """
        # Characters are deleted outright (not replaced by a space) because
        # the 'char' analyzer would otherwise count the replacement character.
        translator = str.maketrans({key: "" for key in punctuation})
        return [text.translate(translator) for text in texts]


if __name__ == "__main__":
    # Small demo: build a 10-item stoplist from two short passages.
    test_1 = ("方廣錩〔題解〕《天竺國菩提達摩禪師論》,又名《達摩禪師論》,中國僧人假託禪宗初祖菩提達摩所撰典籍,"
              "著者不詳,一卷。在敦煌遺書中,")

    test_2 = ("至今已經發現兩種題名為《達摩禪師論》的文獻。其一為日本橋本凝胤所藏,"
              "首殘尾存,尾題作「達摩禪師論」,係唐高宗開耀元年")

    test_corpus = [test_1, test_2]

    S = CorpusStoplist()
    # NOTE(review): build_stoplist is not defined in this view; presumably it
    # is inherited from BaseCorpusStoplist -- confirm its parameters there.
    print(S.build_stoplist(test_corpus, size=10, basis='zou', inc_values=True))
13 changes: 5 additions & 8 deletions cltk/stop/latin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,19 @@
from string import punctuation
from cltk.stop.stop import BaseCorpusStoplist


class CorpusStoplist(BaseCorpusStoplist):
    """Corpus-based stoplist builder configured for Latin."""

    def __init__(self, language='latin'):
        """Set the punctuation set and, when possible, the vectorizers.

        :param language: language name forwarded to ``BaseCorpusStoplist``.

        ``self.vectorizer`` and ``self.tfidf_vectorizer`` are only created
        when both ``self.numpy_installed`` and ``self.sklearn_installed`` are
        truthy; otherwise they are left unset and no exception is raised here.
        """
        BaseCorpusStoplist.__init__(self, language)
        # ``punctuation`` is the ASCII punctuation string from the stdlib
        # ``string`` module, imported at the top of the file.
        self.punctuation = punctuation
        # NOTE(review): the two *_installed flags are presumably set by
        # BaseCorpusStoplist.__init__ -- confirm against cltk.stop.stop.
        if self.numpy_installed and self.sklearn_installed:
            # Imported lazily so the module still loads without scikit-learn.
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')

# Reference lists

# Reference lists
"""This stopword list is taken from the Perseus Hopper source at
``/sgml/reading/build/stoplists``. Source at ``http://sourceforge.net/projects/perseus-hopper/``.
Expand Down Expand Up @@ -58,5 +56,4 @@ def __init__(self, language='latin'):
test_corpus = [test_1, test_2]

S = CorpusStoplist()
print(S.build_stoplist(test_corpus, size=10,
basis='zou', inc_values=True))
print(S.build_stoplist(test_corpus, size=10, basis='zou', inc_values=True))

0 comments on commit 326f299

Please sign in to comment.