# 1. Import NLTK library

In [None]:
import nltk
from nltk.stem  import  PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

## Make sure that all NLTK packages are downloaded

In [None]:
# Fetch the NLTK data packages that the tokenizers and lemmatizer need.
# NOTE(review): called without arguments this presumably opens NLTK's
# interactive downloader instead of fetching a fixed set — confirm.
nltk.download()

# 2. Choose a text to play with

In [None]:
# Sample passage that the tokenization, stemming and lemmatization cells
# operate on.
text = '''I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why did I drop out? It started before I was born. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption.'''

# Echo the passage so the raw input is visible before processing.
print(text)

# 3. Tokenization
Tokenization is the process of splitting text into tokens. Tokens are small portions of text: they can be words, more complex phrases, or even whole sentences.

## 3.1 Tokenizing sentences

In [None]:
# Naive sentence splitting: cut the text at every '.' character. The
# separator itself is dropped, and other sentence terminators such as '?'
# do not produce a split.
text_splitted_dot = text.split('.')

# Print the pieces numbered from 1.
for number, piece in enumerate(text_splitted_dot, start=1):
    print('{}.'.format(number), piece)

In [None]:
# Sentence splitting with NLTK's sentence tokenizer, for comparison with
# the plain '.' split.
sentences = sent_tokenize(text)
print(type(sentences))

# Print the sentences numbered from 1.
for number, sentence in enumerate(sentences, start=1):
    print('{}.'.format(number), sentence)
    

## 3.2 Tokenizing words

In [None]:
# Naive word splitting: cut the text at every single space character, so
# punctuation stays attached to the neighbouring word.
text_splitted_space = text.split(' ')

# Show the first 20 pieces, numbered from 1. Slicing (instead of indexing
# 0..19) keeps this from raising IndexError on a text with fewer pieces.
for number, piece in enumerate(text_splitted_space[:20], start=1):
    print('{}.'.format(number), piece)

In [None]:
# Word splitting with NLTK's word tokenizer.
words_tokenized = word_tokenize(text)

# Header row for a two-column comparison table; every cell is right-aligned
# to a minimum width of 16 characters.
names = ['text_splitted_space'+' | ', 'words_tokenized']
formatted_text = '{:>16}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*38)

# Compare the first 20 tokens of each approach side by side. Zipping the
# sliced lists stops at the shorter one instead of raising IndexError when
# either list has fewer than 20 entries.
for row in zip(text_splitted_space[:20], words_tokenized[:20]):
    print(formatted_text.format(*row))
    

In [None]:
# Word splitting with NLTK's WordPunctTokenizer.
word_punkt_tokenizer = WordPunctTokenizer()

words_punkt_tokenized = word_punkt_tokenizer.tokenize(text)

# Header row for a three-column comparison table; every cell is
# right-aligned to a minimum width of 18 characters.
names = ['text_splitted_space'+' | ', 'words_tokenized'+' | ', 'words_punkt_tokenized']
formatted_text = '{:>18}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*61)

# Compare the first 21 tokens of the three approaches side by side. Zipping
# the sliced lists stops at the shortest one instead of raising IndexError
# when any list has fewer than 21 entries.
rows = zip(
    text_splitted_space[:21],
    words_tokenized[:21],
    words_punkt_tokenized[:21],
)
for row in rows:
    print(formatted_text.format(*row))

## 3.3 Text chunker

In [None]:
# Split the input text into chunks, where each chunk contains N words
def chunker(input_data, N):
    """Split ``input_data`` into chunks of ``N`` space-separated words.

    Args:
        input_data: Text to split. Words are delimited by single space
            characters (``str.split(' ')``), so newlines and tabs are not
            treated as separators.
        N: Number of words per chunk. If ``N`` is less than 1 no chunk ever
            fills up, so the whole text comes back as a single chunk.

    Returns:
        A list of strings, each holding ``N`` words joined by one space. The
        final chunk holds whatever words remain and may be shorter.
    """
    input_words = input_data.split(' ')
    output = []

    current_chunk = []
    for word in input_words:
        current_chunk.append(word)
        if len(current_chunk) == N:
            output.append(' '.join(current_chunk))
            current_chunk = []

    # Flush leftover words only. When the word count is an exact multiple of
    # N the buffer is empty here, and appending it would add a spurious ''.
    if current_chunk:
        output.append(' '.join(current_chunk))

    return output

# Number of words per chunk
chunk_size = 6

# Chunk the sample text and report how many chunks came out.
chunks = chunker(text, chunk_size)
print('\nNumber of text chunks =', len(chunks), '\n')

# List the chunks, numbered from 1.
for number, chunk in enumerate(chunks, start=1):
    print('Chunk', number, '==>', chunk)

# 4. Stemming

In [None]:
# Stem every token of the whole text with the Porter stemmer.
porter_stemmer = PorterStemmer()
words_stemmed = [porter_stemmer.stem(word) for word in words_tokenized]

# Stem sentence by sentence: tokenize each sentence into words, then stem
# each word, giving one list of stems per sentence.
stemmed_sentences = [
    [porter_stemmer.stem(word) for word in nltk.word_tokenize(sentence)]
    for sentence in sentences
]

# Print the stemmed sentences numbered from 1.
for number, stems in enumerate(stemmed_sentences, start=1):
    print('{}.'.format(number), stems)
    

In [None]:
# Two more stemmers to compare against the Porter stemmer created earlier.
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Column titles; every cell is right-aligned to a minimum width of 16.
names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(names) + 1)
header = formatted_text.format('INPUT WORD', *names)
print('\n', header, '\n', '='*70)

# One row per token (first 29 tokens): the token itself followed by its
# stem from each stemmer, in the same order as the column titles.
stemmers = (porter_stemmer, lancaster, snowball)
for word in words_tokenized[:29]:
    stems = [stemmer.stem(word) for stemmer in stemmers]
    print(formatted_text.format(word, *stems))

# 5. Lemmatization

In [None]:
# Lemmatize every token of the whole text with the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()

words_lemmatized = [lemmatizer.lemmatize(word) for word in words_tokenized]

# Lemmatize sentence by sentence: tokenize each sentence into words, then
# lemmatize each word, giving one list of lemmas per sentence.
lemmatized_sentences = [
    [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(sentence)]
    for sentence in sentences
]

# Print the lemmatized sentences numbered from 1.
for number, lemmas in enumerate(lemmatized_sentences, start=1):
    print('{}.'.format(number), lemmas)

In [None]:
# Column titles; every cell is right-aligned to a minimum width of 20.
names = ['snowball stemmer', 'lemmatizer noun', 'lemmatizer verb']
formatted_text = '{:>20}' * (len(names) + 1)
header = formatted_text.format('INPUT WORD', *names)
print('\n', header, '\n', '='*88)

# One row per token: the token, its Snowball stem, its lemma from the
# default lemmatize() call (shown under the "noun" column), and its lemma
# when treated as a verb (pos='v').
for word in words_tokenized:
    stem = snowball.stem(word)
    lemma_default = lemmatizer.lemmatize(word)
    lemma_verb = lemmatizer.lemmatize(word, pos='v')
    print(formatted_text.format(word, stem, lemma_default, lemma_verb))