Skip to content

Commit

Permalink
Add ascii string conversion preprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
gunthercox committed Jan 11, 2017
1 parent f552aff commit 7aed870
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 36 deletions.
22 changes: 21 additions & 1 deletion chatterbot/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""
Statement pre-processors.
"""
Expand All @@ -23,7 +24,7 @@ def clean_whitespace(chatbot, statement):
def unescape_html(chatbot, statement):
"""
Convert escaped html characters into unescaped html characters.
For example: &lt;b&gt; becomes <b>
For example: "&lt;b&gt;" becomes "<b>".
"""
import sys

Expand All @@ -37,3 +38,22 @@ def unescape_html(chatbot, statement):
statement.text = html.unescape(statement.text)

return statement


def convert_to_ascii(chatbot, statement):
    """
    Converts unicode characters to ASCII character equivalents.
    For example: "på fédéral" becomes "pa federal".

    Characters that have no ASCII decomposition (for example "ß")
    are dropped from the text.

    :param chatbot: The chat bot instance; not used by this preprocessor.
    :param statement: An object whose ``text`` attribute is converted
        in place.
    :returns: The same statement, with ``text`` replaced by its ASCII form.
    """
    import unicodedata
    import sys

    text = statement.text

    if isinstance(text, bytes):
        # Byte strings (the native str type on Python 2) are treated as
        # UTF-8 so that non-ASCII input is decoded instead of failing on
        # an implicit ASCII decode.
        text = text.decode('utf-8')
    elif sys.version_info[0] < 3:
        text = unicode(text)  # noqa: F821 -- builtin on Python 2 only

    # NFKD splits accented characters into a base character followed by
    # combining marks; encoding with 'ignore' then discards every
    # non-ASCII code point, leaving only the base characters.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    statement.text = str(text)
    return statement
16 changes: 0 additions & 16 deletions chatterbot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,6 @@
"""


def clean(text):
    """
    A function for cleaning a string of text.
    Returns valid ASCII characters.

    Accented characters are reduced to their base character and any
    character without an ASCII decomposition is removed.

    :param text: The text to clean. Byte strings are decoded as UTF-8.
    :returns: A native ``str`` containing only ASCII characters.
    """
    import unicodedata
    import sys

    if isinstance(text, bytes):
        # Byte strings (the native str type on Python 2) are treated as
        # UTF-8 so that non-ASCII input is decoded instead of failing on
        # an implicit ASCII decode.
        text = text.decode('utf-8')
    elif sys.version_info[0] < 3:
        text = unicode(text)  # noqa: F821 -- builtin on Python 2 only

    # Normalize unicode characters: NFKD separates base characters from
    # their combining marks, and the 'ignore' handler drops whatever is
    # not representable in ASCII.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')

    return str(ascii_text)


def import_module(dotted_path):
"""
Imports the specified module based on the
Expand Down
2 changes: 2 additions & 0 deletions docs/preprocessors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ ChatterBot comes with several preprocessors built in.

.. autofunction:: chatterbot.preprocessors.unescape_html

.. autofunction:: chatterbot.preprocessors.convert_to_ascii


Creating new preprocessors
==========================
Expand Down
8 changes: 0 additions & 8 deletions docs/utils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,6 @@ Module Imports

.. autofunction:: chatterbot.utils.import_module

String cleaning
---------------

This utility module contains functions that are useful
for cleaning and normalizing strings of text.

.. autofunction:: chatterbot.utils.clean

Terminal input
--------------

Expand Down
13 changes: 13 additions & 0 deletions tests/test_preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,16 @@ def test_html_unescape(self):
cleaned = preprocessors.unescape_html(self.chatbot, statement)

self.assertEqual(cleaned.text, normal_text)


class ConvertToASCIIPreprocessorTestCase(ChatBotTestCase):
    """
    Checks the behavior of ChatterBot's ASCII conversion preprocessor.
    """

    def test_convert_to_ascii(self):
        # Accented characters collapse to their base letter, while
        # characters with no ASCII decomposition are expected to vanish.
        expected_text = 'Kluft skrams infor pa federal electoral groe'
        original = Statement(u'Klüft skräms inför på fédéral électoral große')

        result = preprocessors.convert_to_ascii(self.chatbot, original)

        self.assertEqual(result.text, expected_text)
11 changes: 0 additions & 11 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,3 @@ def test_remove_stop_words(self):
self.assertEqual(len(words), 2)
self.assertIn('test', list(words))
self.assertIn('string', list(words))


class CleanTests(TestCase):
    """
    Checks that utils.clean reduces text to plain ASCII characters.
    """

    def test_non_ascii_chars_replaced(self):
        # Accented characters collapse to their base letter, while
        # characters with no ASCII decomposition are expected to vanish.
        expected = "Kluft skrams infor pa federal electoral groe"
        source = u"Klüft skräms inför på fédéral électoral große"

        self.assertEqual(utils.clean(source), expected)

0 comments on commit 7aed870

Please sign in to comment.