### Converting words to their base form using stemming

In [1]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [2]:
#Words that are run through the stemmers and lemmatizers in the cells below
input_words = [
    'writing', 'calves', 'be', 'branded',
    'horse', 'randomize', 'possibly', 'provision',
    'hospital', 'kept', 'scratchy', 'code',
]

In [3]:
#Instantiate one stemmer per algorithm so their results can be compared side by side;
#the Snowball stemmer is the only one that takes a language argument
snowball = SnowballStemmer(language = 'english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

In [4]:
#Table layout: one right-aligned 16-character column for the input word plus one per stemmer
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)

#Print the blank line, header and rule with separate calls: passing them as several arguments
#to a single print() inserts a separator space that pushes the header one column to the right
#of the rows below. The rule length is taken from the header so it always matches the table.
header = formatted_text.format('INPUT WORD', *stemmer_names)
print()
print(header)
print('=' * len(header))

#Stem each word with every stemmer and print one table row per word
for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))


       INPUT WORD          PORTER       LANCASTER        SNOWBALL 
         writing           write            writ           write
          calves            calv            calv            calv
              be              be              be              be
         branded           brand           brand           brand
           horse            hors            hors            hors
       randomize          random          random          random
        possibly         possibl            poss         possibl
       provision          provis          provid          provis
        hospital          hospit          hospit          hospit
            kept            kept            kept            kept
        scratchy        scratchi        scratchy        scratchi
            code            code             cod            code


### Converting words to their base forms using lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer

In [6]:
#Create the WordNet lemmatizer object used to lemmatize each input word as a noun and as a verb
lemmatizer = WordNetLemmatizer()

In [7]:
#Table layout: one right-aligned 24-character column for the input word plus one per lemmatizer mode
lemmatizer_names = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']
formatted_text = '{:>24}' * (len(lemmatizer_names) + 1)

#Print the blank line, header and rule with separate calls: passing them as several arguments
#to a single print() inserts a separator space that pushes the header one column to the right
#of the rows below. The rule length is taken from the header so it always matches the table.
header = formatted_text.format('INPUT WORD', *lemmatizer_names)
print()
print(header)
print('=' * len(header))

#Lemmatize each word twice, once treated as a noun ('n') and once as a verb ('v')
for word in input_words:
    output = [word, lemmatizer.lemmatize(word, pos = 'n'), lemmatizer.lemmatize(word, pos = 'v')]
    print(formatted_text.format(*output))


               INPUT WORD         NOUN LEMMATIZER         VERB LEMMATIZER 
                 writing                 writing                   write
                  calves                    calf                   calve
                      be                      be                      be
                 branded                 branded                   brand
                   horse                   horse                   horse
               randomize               randomize               randomize
                possibly                possibly                possibly
               provision               provision               provision
                hospital                hospital                hospital
                    kept                    kept                    keep
                scratchy                scratchy                scratchy
                    code                    code                    code


### Dividing text data into chunks

In [8]:
import numpy as np
from nltk.corpus import brown

In [10]:
def chunker(input_data, N):
    """Split space-separated text into chunks of N words each.

    Args:
        input_data: Text whose words are separated by single spaces.
        N: Number of words to put in each chunk.

    Returns:
        A list of strings. Each string holds N words joined by single
        spaces, except the last one, which holds the leftover words when
        the number of words is not a multiple of N.
    """
    words = input_data.split(' ')
    output = []
    cur_chunk = []

    for word in words:
        cur_chunk.append(word)

        #A chunk is complete once it has collected N words
        if len(cur_chunk) == N:
            output.append(' '.join(cur_chunk))
            cur_chunk = []

    #Add the leftover words only if there are any; appending unconditionally would
    #produce a trailing empty chunk whenever the word count is an exact multiple of N
    if cur_chunk:
        output.append(' '.join(cur_chunk))

    return output

In [11]:
#Number of words to place in each chunk
chunk_size = 700

#Join the first 12000 words of the Brown corpus into a single space-separated string
input_data = ' '.join(brown.words()[:12000])

In [12]:
#Divide the corpus text into chunks of chunk_size words
chunks = chunker(input_data = input_data, N = chunk_size)

In [14]:
#Report how many chunks were produced, followed by a blank line. An f-string is used because
#passing the label and the count as separate print() arguments adds a separator space on top
#of the label's own trailing space, giving a double space before the count and a stray one after it.
print(f'Number of text chunks = {len(chunks)}\n')

#Show the first 50 characters of every chunk, numbering the chunks from 1
for i, chunk in enumerate(chunks, start = 1):
    print(f'Chunk {i} ==> {chunk[:50]}')

Number of text chunks =  18 

Chunk 1 ==> The Fulton County Grand Jury said Friday an invest
Chunk 2 ==> '' . ( 2 ) Fulton legislators `` work with city of
Chunk 3 ==> . Construction bonds Meanwhile , it was learned th
Chunk 4 ==> , anonymous midnight phone calls and veiled threat
Chunk 5 ==> Harris , Bexar , Tarrant and El Paso would be $451
Chunk 6 ==> set it for public hearing on Feb. 22 . The proposa
Chunk 7 ==> College . He has served as a border patrolman and 
Chunk 8 ==> of his staff were doing on the address involved co
Chunk 9 ==> plan alone would boost the base to $5,000 a year a
Chunk 10 ==> nursing homes In the area of `` community health s
Chunk 11 ==> of its Angola policy prove harsh , there has been 
Chunk 12 ==> system which will prevent Laos from being used as 
Chunk 13 ==> reform in recipient nations . In Laos , the admini
Chunk 14 ==> . He is not interested in being named a full-time 
Chunk 15 ==> said , `` to obtain the views of the general publi
Chunk 16 ==> '' . M