Permalink
Browse files

Old norse stopwords (#604)

* Old Norse stopwords with grammars

* Doc for using Old Norse stopwords and the related test.

* ch url formatting
  • Loading branch information...
clemsciences authored and kylepjohnson committed Nov 9, 2017
1 parent f4f7e3b commit 4dd831f7d59b0d2fd93e395e8a1b6032603bf0e2
Showing with 41 additions and 0 deletions.
  1. 0 cltk/stop/old_norse/__init__.py
  2. +17 −0 cltk/tests/test_stop.py
  3. +24 −0 docs/old_norse.rst
No changes.
View
@@ -8,6 +8,7 @@
from cltk.stop.latin.stops import STOPS_LIST as LATIN_STOPS
from cltk.stop.french.stops import STOPS_LIST as FRENCH_STOPS
from cltk.stop.arabic.stopword_filter import stopwords_filter as arabic_stop_filter
from cltk.stop.old_norse.stops import STOPS_LIST as OLD_NORSE_STOPS
from nltk.tokenize.punkt import PunktLanguageVars
import os
import unittest
@@ -58,6 +59,7 @@ def test_latin_stopwords(self):
target_list = ['usque', 'tandem', 'abutere', ',', 'catilina', ',',
'patientia', 'nostra', '?']
self.assertEqual(no_stops, target_list)
def test_arabic_stopwords(self):
"""Test filtering arabic stopwords."""
sentence = 'سُئِل بعض الكُتَّاب عن الخَط، متى يَسْتحِقُ أن يُوصَف بِالجَودةِ؟'
@@ -86,5 +88,20 @@ def test_string_stop_list(self):
stoplist = StringStoplist('latin').build_stoplist(text)
self.assertEqual(stoplist, target_list)
def test_old_norse_stopwords(self):
    """Test filtering Old Norse stopwords.

    The sentence is taken from Eiríks saga rauða
    (http://www.heimskringla.no/wiki/Eir%C3%ADks_saga_rau%C3%B0a).
    """
    sentence = 'Þat var einn morgin, er þeir Karlsefni sá fyrir ofan rjóðrit flekk nökkurn, sem glitraði við þeim'
    # Lower-case the sentence before tokenizing so that every token is
    # compared against the stop list in lower case.
    tokens = PunktLanguageVars().word_tokenize(sentence.lower())
    # Keep only the tokens (punctuation included) that are not stopwords.
    no_stops = [w for w in tokens if w not in OLD_NORSE_STOPS]
    target_list = ['var', 'einn', 'morgin', ',', 'karlsefni', 'rjóðrit', 'flekk', 'nökkurn', ',', 'glitraði']
    self.assertEqual(no_stops, target_list)
# Allow this test module to be run directly as a script.
if __name__ == '__main__':
    unittest.main()
View
@@ -16,3 +16,27 @@ Use ``CorpusImporter()`` or browse the `CLTK GitHub organization <https://github
In [3]: corpus_importer.list_corpora
Out[3]: ['old_norse_text_perseus']
Stopword Filtering
==================
To use the CLTK's built-in stopwords list, we use an example from `Eiríks saga rauða
<http://www.heimskringla.no/wiki/Eir%C3%ADks_saga_rau%C3%B0a>`_:
.. code-block:: python
In [1]: from nltk.tokenize.punkt import PunktLanguageVars
In [2]: from cltk.stop.old_norse.stops import STOPS_LIST
In [3]: sentence = 'Þat var einn morgin, er þeir Karlsefni sá fyrir ofan rjóðrit flekk nökkurn, sem glitraði við þeim'
In [4]: p = PunktLanguageVars()
In [5]: tokens = p.word_tokenize(sentence.lower())
In [6]: [w for w in tokens if not w in STOPS_LIST]
Out[6]: ['var', 'einn', 'morgin', ',', 'karlsefni', 'rjóðrit', 'flekk', 'nökkurn', ',', 'glitraði']

0 comments on commit 4dd831f

Please sign in to comment.