In [37]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


In [18]:
paragraph = '''Machine learning is evolving fast. Models learn from data. Engineers clean the text carefully. They remove stopwords first. They normalize the words next. Some words are playing. Others were played earlier. People study models daily. They studied them yesterday. Systems are running now. Some systems ran before. Accuracy improves slowly. Better features help models. Good preprocessing matters a lot.'''

In [19]:
stemmer = PorterStemmer()

In [20]:
# stopwords defined in nltk
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [21]:
# tokenize words, apply stemming/lemmatisation to those words and remove them using stopwords
sentences = sent_tokenize(paragraph)
sentences

['Machine learning is evolving fast.',
 'Models learn from data.',
 'Engineers clean the text carefully.',
 'They remove stopwords first.',
 'They normalize the words next.',
 'Some words are playing.',
 'Others were played earlier.',
 'People study models daily.',
 'They studied them yesterday.',
 'Systems are running now.',
 'Some systems ran before.',
 'Accuracy improves slowly.',
 'Better features help models.',
 'Good preprocessing matters a lot.']

In [28]:
# tokenize each word, apply stemming and join it back again to a sentence
processed = []
for sentence in sentences: 
    words = word_tokenize(sentence)
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    processed.append(" ".join(words))

processed

['machin learn evolv fast .',
 'model learn data .',
 'engin clean text care .',
 'they remov stopword first .',
 'they normal word next .',
 'some word play .',
 'other play earlier .',
 'peopl studi model daili .',
 'they studi yesterday .',
 'system run .',
 'some system ran .',
 'accuraci improv slowli .',
 'better featur help model .',
 'good preprocess matter lot .']

In [36]:
# do the same as above using Snowball stemmer
snowball_stemmer = SnowballStemmer('english')

sentences = sent_tokenize(paragraph)

snowballs = []
for sentence in sentences: 
    words = word_tokenize(sentence)
    words = [snowball_stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    snowballs.append(' '.join(words))

sentences, snowballs

(['Machine learning is evolving fast.',
  'Models learn from data.',
  'Engineers clean the text carefully.',
  'They remove stopwords first.',
  'They normalize the words next.',
  'Some words are playing.',
  'Others were played earlier.',
  'People study models daily.',
  'They studied them yesterday.',
  'Systems are running now.',
  'Some systems ran before.',
  'Accuracy improves slowly.',
  'Better features help models.',
  'Good preprocessing matters a lot.'],
 ['machin learn evolv fast .',
  'model learn data .',
  'engin clean text care .',
  'they remov stopword first .',
  'they normal word next .',
  'some word play .',
  'other play earlier .',
  'peopl studi model daili .',
  'they studi yesterday .',
  'system run .',
  'some system ran .',
  'accuraci improv slowli .',
  'better featur help model .',
  'good preprocess matter lot .'])

In [39]:
# do the same using lemmatization

lemmatizer = WordNetLemmatizer()

sentences = sent_tokenize(paragraph)

processed = []

for sentence in sentences: 
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    processed.append(" ".join(words))

processed

['Machine learning evolving fast .',
 'Models learn data .',
 'Engineers clean text carefully .',
 'They remove stopwords first .',
 'They normalize word next .',
 'Some word playing .',
 'Others played earlier .',
 'People study model daily .',
 'They studied yesterday .',
 'Systems running .',
 'Some system ran .',
 'Accuracy improves slowly .',
 'Better feature help model .',
 'Good preprocessing matter lot .']