Stop fix (#949)

Based on the discussion, I'm going to conclude that this PR is complete, but additional PRs may be filed and discussions continued elsewhere. * PEP 8, some condition tests and regexp fix * Improved readability a bit * Cleaned up the stop modules a bit * Updated Stoplist construction docs for all languages. * Fixed errors * Fixed docs * Added CorpusStoplist for Middle High German * Fixed Old Norse STOPS_LIST * If numpy and scikit-learn are installed, then we can initialize some attributes for stop list building. Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
cltk · Jun 7, 2020 · 326f299 · 326f299
1 parent 77ffba7
commit 326f299
Show file tree

Hide file tree

Showing 7 changed files with 312 additions and 334 deletions.
diff --git a/cltk/stop/classical_chinese.py b/cltk/stop/classical_chinese.py
@@ -2,38 +2,37 @@
 Code for building and working with stoplists for Classical Chinese
 """
 
-__author__ = ['Patrick J. Burns <patrick@diyclassics.org>'] # Update author list
+__author__ = ['Patrick J. Burns <patrick@diyclassics.org>']  # Update author list
 __license__ = 'MIT License. See LICENSE.'
 
 from cltk.stop.stop import BaseCorpusStoplist
 
+
 class CorpusStoplist(BaseCorpusStoplist):
 
     def __init__(self, language='classical_chinese'):
         BaseCorpusStoplist.__init__(self, language)
         self.punctuation = '。，；？：！、《》'
-        if not self.numpy_installed or not self.sklearn_installed:
-            print('\n\nThe Corpus-based Stoplist method requires numpy and scikit-learn for calculations. Try installing with `pip install numpy sklearn scipy`.\n\n')
-            raise ImportError
-        else:
+        if self.numpy_installed and self.sklearn_installed:
             from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
             self.vectorizer = CountVectorizer(analyzer='char', input='content') # Set df?
             self.tfidf_vectorizer = TfidfVectorizer(analyzer='char', input='content')
 
     def _remove_punctuation(self, texts, punctuation):
         # Change replacement pattern for 'char' analyzer parameter
-        translator = str.maketrans({key: "" for key in punctuation}) 
-        texts = [text.translate(translator) for text in texts] 
+        translator = str.maketrans({key: "" for key in punctuation})
+        texts = [text.translate(translator) for text in texts]
         return texts
 
 
 if __name__ == "__main__":
-    test_1 = """方廣錩〔題解〕《天竺國菩提達摩禪師論》，又名《達摩禪師論》，中國僧人假託禪宗初祖菩提達摩所撰典籍，著者不詳，一卷。在敦煌遺書中，"""
+    test_1 = "方廣錩〔題解〕《天竺國菩提達摩禪師論》，又名《達摩禪師論》，中國僧人假託禪宗初祖菩提達摩所撰典籍，" \
+             "著者不詳，一卷。在敦煌遺書中，"
 
-    test_2 = """至今已經發現兩種題名為《達摩禪師論》的文獻。其一為日本橋本凝胤所藏，首殘尾存，尾題作「達摩禪師論」，係唐高宗開耀元年"""
+    test_2 = "至今已經發現兩種題名為《達摩禪師論》的文獻。其一為日本橋本凝胤所藏，" \
+             "首殘尾存，尾題作「達摩禪師論」，係唐高宗開耀元年"
 
     test_corpus = [test_1, test_2]
 
     S = CorpusStoplist()
-    print(S.build_stoplist(test_corpus, size=10,
-                    basis='zou', inc_values=True))
+    print(S.build_stoplist(test_corpus, size=10, basis='zou', inc_values=True))
diff --git a/cltk/stop/latin.py b/cltk/stop/latin.py
@@ -16,21 +16,19 @@
 from string import punctuation
 from cltk.stop.stop import BaseCorpusStoplist
 
+
 class CorpusStoplist(BaseCorpusStoplist):
 
     def __init__(self, language='latin'):
         BaseCorpusStoplist.__init__(self, language)
         self.punctuation = punctuation
-        if not self.numpy_installed or not self.sklearn_installed:
-            print('\n\nThe Corpus-based Stoplist method requires numpy and scikit-learn for calculations. Try installing with `pip install numpy sklearn scipy`.\n\n')
-            raise ImportError
-        else:
+        if self.numpy_installed and self.sklearn_installed:
             from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-            self.vectorizer = CountVectorizer(input='content') # Set df?
+            self.vectorizer = CountVectorizer(input='content')  # Set df?
             self.tfidf_vectorizer = TfidfVectorizer(input='content')
 
-# Reference lists
 
+# Reference lists
 """This stopword list is taken from the Perseus Hopper source at
 ``/sgml/reading/build/stoplists``. Source at ``http://sourceforge.net/projects/perseus-hopper/``.
 
@@ -58,5 +56,4 @@ def __init__(self, language='latin'):
     test_corpus = [test_1, test_2]
 
     S = CorpusStoplist()
-    print(S.build_stoplist(test_corpus, size=10,
-                    basis='zou', inc_values=True))
+    print(S.build_stoplist(test_corpus, size=10, basis='zou', inc_values=True))