Skip to content

Commit

Permalink
Create very simple stemming algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
gunthercox committed Oct 17, 2018
1 parent b6bb1a2 commit ebaceed
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 0 deletions.
42 changes: 42 additions & 0 deletions chatterbot/stemming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import string
import nltk


class SimpleStemmer(object):
    """
    A very simple stemming algorithm that removes stopwords and punctuation.
    It then removes the beginning and ending characters of each word.
    This should work for any language.

    For a word of length ``n``, ``n // 4`` characters are trimmed from each
    end. The stemmed words are joined with single spaces.

    NOTE(review): two quirks of the current algorithm are kept on purpose,
    because the expected values in tests/test_stemming.py are written
    against them:

    * words shorter than four characters that are not stopwords are reduced
      to an empty string (the trim width is 0, so the slice is ``[0:0]``);
    * the final character of the joined result is always dropped.
    """

    def __init__(self, language='english'):
        """
        Build the punctuation table and load the stopword list.

        :param language: Name of the NLTK stopwords list to load. The NLTK
            stopwords corpus must be available for this lookup to succeed.
        """
        # Translation table mapping every ASCII punctuation character to
        # None, so that str.translate() deletes it.
        self.punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))

        # Get list of stopwords from the NLTK corpus
        self.stopwords = nltk.corpus.stopwords.words(language)

        # Treat the empty string as a stopword so the empty tokens produced
        # by consecutive spaces are skipped while stemming.
        self.stopwords.append('')

    def stem(self, text):
        """
        Return the stemmed form of ``text``.

        Punctuation is removed, the text is lowercased and split on single
        spaces, stopwords are dropped, and every remaining word loses
        ``len(word) // 4`` characters from each end.

        :param text: The string to stem.
        :returns: The stemmed words joined by single spaces, minus the last
            character of that joined string (see the class docstring).
        """
        # Remove punctuation, then make the text lowercase
        cleaned = text.translate(self.punctuation_table).lower()

        # Build the lookup set once per call instead of scanning the
        # stopword list again for every word.
        stopwords = frozenset(self.stopwords)

        stems = []

        for word in cleaned.split(' '):

            # Remove stopwords
            if word in stopwords:
                continue

            # Chop off the ends of the word. When trim is 0 the slice is
            # word[0:0], so short words contribute an empty string.
            trim = len(word) // 4
            stems.append(word[trim:-trim])

        # The trailing [:-1] strips the last character of the joined text.
        return ' '.join(stems)[:-1]
28 changes: 28 additions & 0 deletions tests/test_stemming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from unittest import TestCase
from chatterbot import stemming


class StemmerTests(TestCase):
    """
    Checks the output of SimpleStemmer.stem() for short English sentences.
    """

    def setUp(self):
        # Every test gets its own stemmer built with the default language
        self.stemmer = stemming.SimpleStemmer()

    def test_stemming(self):
        # Punctuation and stopwords are removed, remaining words are trimmed
        result = self.stemmer.stem(
            'Hello, how are you doing on this awesome day?'
        )

        self.assertEqual(result, 'ell wesom')

    def test_string_becomes_lowercase(self):
        # Upper-case input must come back in lower case
        result = self.stemmer.stem('THIS IS HOW IT BEGINS!')

        self.assertEqual(result, 'egi')

    def test_stemming_medium_sized_words(self):
        # Words of four to seven characters lose one character per side
        result = self.stemmer.stem('Hello, my name is Gunther.')

        self.assertEqual(result, 'ell am unth')

    def test_stemming_long_words(self):
        # Words of eight or more characters lose two characters per side
        result = self.stemmer.stem(
            'I play several orchestra instruments for pleasuer.'
        )

        self.assertEqual(result, 'la evera chest strumen eas')

0 comments on commit ebaceed

Please sign in to comment.