Skip to content

Commit

Permalink
Convert clean_whitespace utility into a preprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
gunthercox committed Jan 11, 2017
1 parent 81dcf90 commit 5ed7144
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 52 deletions.
14 changes: 14 additions & 0 deletions chatterbot/chatterbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ def __init__(self, name, **kwargs):
self.input.set_chatbot(self)
self.output.set_chatbot(self)

preprocessors = kwargs.get(
'preprocessors', [
'chatterbot.preprocessors.clean_whitespace'
])

self.preprocessors = []

for preprocessor in preprocessors:
self.preprocessors.append(utils.import_module(preprocessor))

# Use specified trainer or fall back to the default
trainer = kwargs.get('trainer', 'chatterbot.trainers.Trainer')
TrainerClass = utils.import_module(trainer)
Expand Down Expand Up @@ -97,6 +107,10 @@ def get_response(self, input_item, session_id=None):

input_statement = self.input.process_input_statement(input_item)

# Preprocess the input statement
for preprocessor in self.preprocessors:
input_statement = preprocessor(self, input_statement)

statement, response, confidence = self.generate_response(input_statement, session_id)

# Learn that the user's input was a valid response to the chat bot's previous output
Expand Down
7 changes: 5 additions & 2 deletions chatterbot/input/gitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,12 @@ def remove_mentions(self, text):
Return a string that has no leading mentions.
"""
import re
from chatterbot.utils import clean_whitespace
text_without_mentions = re.sub(r'@\S+', '', text)
return clean_whitespace(text_without_mentions)

# Remove consecutive spaces
text_without_mentions = re.sub(' +', ' ', text_without_mentions.strip())

return text_without_mentions

def process_input(self, statement):
new_message = False
Expand Down
20 changes: 20 additions & 0 deletions chatterbot/preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""
Statement pre-processors.
"""

def clean_whitespace(chatbot, statement):
    """
    Normalize the whitespace in a statement's text.

    Newlines, carriage returns and tabs are turned into spaces, whitespace
    at either end of the text is stripped, and every run of spaces is
    collapsed into a single space. Other control characters inside the
    text (for example vertical tabs or form feeds) are left untouched.

    :param chatbot: The chat bot running the preprocessor (not used here).
    :param statement: An object exposing a ``text`` string attribute.
        It is modified in place.
    :returns: The same statement object, carrying the cleaned text.
    """
    import re

    cleaned = statement.text

    # Turn line breaks and tabs into plain spaces
    for character in ('\n', '\r', '\t'):
        cleaned = cleaned.replace(character, ' ')

    # Drop leading and trailing whitespace, then squeeze repeated spaces
    cleaned = re.sub(' +', ' ', cleaned.strip())

    statement.text = cleaned
    return statement
24 changes: 0 additions & 24 deletions chatterbot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,6 @@
"""


def clean_whitespace(text):
    """
    Return the given text with its whitespace normalized.

    Newlines, carriage returns and tabs are turned into spaces, whitespace
    at either end of the text is stripped, and every run of spaces is
    collapsed into a single space.

    :param text: The string to clean.
    :returns: The cleaned string.
    """
    import re

    # Turn line breaks and tabs into plain spaces
    for character in ('\n', '\r', '\t'):
        text = text.replace(character, ' ')

    # Drop leading and trailing whitespace before squeezing repeated spaces
    trimmed = text.strip()

    return re.sub(' +', ' ', trimmed)


def clean(text):
"""
A function for cleaning a string of text.
Expand All @@ -29,11 +11,6 @@ def clean(text):
import unicodedata
import sys

text = clean_whitespace(text)

# Remove links from message
# text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

# Replace HTML escape characters
if sys.version_info[0] < 3:
from HTMLParser import HTMLParser
Expand All @@ -44,7 +21,6 @@ def clean(text):
text = html.unescape(text)

# Normalize unicode characters
# 'raw_input' is just 'input' in python3
if sys.version_info[0] < 3:
text = unicode(text)
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
Expand Down
43 changes: 43 additions & 0 deletions tests/test_preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from .base_case import ChatBotTestCase
from chatterbot.conversation import Statement
from chatterbot import preprocessors


class PreprocessorIntegrationTestCase(ChatBotTestCase):
    """
    Check that preprocessors can be plugged into the chat bot and that
    a response is still produced when they are in use.
    """

    def test_clean_whitespace(self):
        # Run the bot with the whitespace cleaner as its only preprocessor
        self.chatbot.preprocessors = [preprocessors.clean_whitespace]

        reply = self.chatbot.get_response('Hello, how are you?')
        reply_text = reply.text

        self.assertEqual(reply_text, 'Hello, how are you?')


class CleanWhitespacePreprocessorTestCase(ChatBotTestCase):
    """
    Make sure that ChatterBot's whitespace removing preprocessor works as expected.
    """

    def test_clean_whitespace(self):
        # Tabs, newlines and carriage returns are converted to spaces;
        # the \v, \a and \f characters and the backslash must survive.
        statement = Statement('\tThe quick \nbrown fox \rjumps over \vthe \alazy \fdog\\.')
        cleaned = preprocessors.clean_whitespace(self.chatbot, statement)
        normal_text = 'The quick brown fox jumps over \vthe \alazy \fdog\\.'

        self.assertEqual(cleaned.text, normal_text)

    def test_leading_or_trailing_whitespace_removed(self):
        statement = Statement(' The quick brown fox jumps over the lazy dog. ')
        cleaned = preprocessors.clean_whitespace(self.chatbot, statement)
        normal_text = 'The quick brown fox jumps over the lazy dog.'

        self.assertEqual(cleaned.text, normal_text)

    def test_consecutive_spaces_removed(self):
        # The input must contain real runs of spaces; otherwise the
        # assertion would hold even if collapsing were broken.
        statement = Statement('The       quick brown     fox      jumps over the lazy dog.')
        cleaned = preprocessors.clean_whitespace(self.chatbot, statement)
        normal_text = 'The quick brown fox jumps over the lazy dog.'

        self.assertEqual(cleaned.text, normal_text)
26 changes: 0 additions & 26 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,32 +27,6 @@ def test_remove_stop_words(self):
self.assertIn('string', list(words))


class CleanWhitespaceTests(TestCase):
    """
    Tests for the ``clean_whitespace`` utility function.
    """

    def test_clean_whitespace(self):
        # Tabs, newlines and carriage returns are converted to spaces;
        # the \v, \a and \f characters and the backslash must survive.
        text = '\tThe quick \nbrown fox \rjumps over \vthe \alazy \fdog\\.'
        clean_text = utils.clean_whitespace(text)
        normal_text = 'The quick brown fox jumps over \vthe \alazy \fdog\\.'

        self.assertEqual(clean_text, normal_text)

    def test_leading_or_trailing_whitespace_removed(self):

        text = ' The quick brown fox jumps over the lazy dog. '
        clean_text = utils.clean_whitespace(text)
        normal_text = 'The quick brown fox jumps over the lazy dog.'

        self.assertEqual(clean_text, normal_text)

    def test_consecutive_spaces_removed(self):

        # The input must contain real runs of spaces; otherwise the
        # assertion would hold even if collapsing were broken.
        text = 'The       quick brown     fox      jumps over the lazy dog.'
        clean_text = utils.clean_whitespace(text)
        normal_text = 'The quick brown fox jumps over the lazy dog.'

        self.assertEqual(clean_text, normal_text)


class CleanTests(TestCase):

def test_html_characters_restored(self):
Expand Down

0 comments on commit 5ed7144

Please sign in to comment.