# 1. Import NLTK library

In [1]:
import nltk
from nltk.stem  import  PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

## Make sure that all NLTK packages are downloaded

In [3]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# 2. Choose a text to play with

In [34]:
text = '''I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 
months or so before I really quit. So why did I drop out? It started before I was born. My biological mother was 
a young, unwed college graduate student, and she decided to put me up for adoption.'''

print(text)

I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 
months or so before I really quit. So why did I drop out? It started before I was born. My biological mother was 
a young, unwed college graduate student, and she decided to put me up for adoption.


# 3. Tokenization
Tokenization is about creating tokens. Tokens are small portions of text. They can be words, more complex phrases, or even the whole sentences.

## 3.1 Tokenizing sentences

In [35]:
text_splitted_dot = text.split('.')

for index in range(len(text_splitted_dot)):
    print(str(index+1)+'.',text_splitted_dot[index])

1. I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 
months or so before I really quit
2.  So why did I drop out? It started before I was born
3.  My biological mother was 
a young, unwed college graduate student, and she decided to put me up for adoption
4. 


In [36]:
sentences = sent_tokenize(text)

for index in range(len(sentences)):
    print(str(index+1)+'.',sentences[index])
    

1. I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 
months or so before I really quit.
2. So why did I drop out?
3. It started before I was born.
4. My biological mother was 
a young, unwed college graduate student, and she decided to put me up for adoption.


## 3.2 Tokenizing words

In [39]:
text_splitted_space = text.split()

for index in range(20):
    print(str(index+1)+'.',text_splitted_space[index])

1. I
2. dropped
3. out
4. of
5. Reed
6. College
7. after
8. the
9. first
10. 6
11. months,
12. but
13. then
14. stayed
15. around
16. as
17. a
18. drop-in
19. for
20. another


In [46]:
words_tokenized = word_tokenize(text)
    
names = ['text_splitted_space'+' | ', 'words_tokenized']
formatted_text = '{:>16}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*38)

for index in range(20):
    output = [text_splitted_space[index], words_tokenized[index]]
    print(formatted_text.format(*output))
    


 text_splitted_space |  words_tokenized 
               I               I
         dropped         dropped
             out             out
              of              of
            Reed            Reed
         College         College
           after           after
             the             the
           first           first
               6               6
         months,          months
             but               ,
            then             but
          stayed            then
          around          stayed
              as          around
               a              as
         drop-in               a
             for         drop-in
         another             for


In [59]:
word_punkt_tokenizer = WordPunctTokenizer()

words_punkt_tokenized = word_punkt_tokenizer.tokenize(text)

names = ['text_splitted_space'+' | ', 'words_tokenized'+' | ', 'words_punkt_tokenized']
formatted_text = '{:>18}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*61)

for index in range(21):
    output = [text_splitted_space[index], words_tokenized[index],words_punkt_tokenized[index]]
    print(formatted_text.format(*output))


 text_splitted_space | words_tokenized | words_punkt_tokenized 
                 I                 I                 I
           dropped           dropped           dropped
               out               out               out
                of                of                of
              Reed              Reed              Reed
           College           College           College
             after             after             after
               the               the               the
             first             first             first
                 6                 6                 6
           months,            months            months
               but                 ,                 ,
              then               but               but
            stayed              then              then
            around            stayed            stayed
                as            around            around
                 a                as                as


## 3.3 Text chunker

In [90]:
# Split the input text into chunks, where each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    cur_chunk = []
    count = 0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []

    output.append(' '.join(cur_chunk))

    return output

# Define the number of words in each chunk
chunk_size = 10

chunks = chunker(text, chunk_size)
print('\nNumber of text chunks =', len(chunks), '\n')
for i, chunk in enumerate(chunks):
    print('Chunk', i+1, '==>', chunk) 


Number of text chunks = 6 

Chunk 1 ==> I dropped out of Reed College after the first 6
Chunk 2 ==> months, but then stayed around as a drop-in for another
Chunk 3 ==> 18 
months or so before I really quit. So why
Chunk 4 ==> did I drop out? It started before I was born.
Chunk 5 ==> My biological mother was 
a young, unwed college graduate student,
Chunk 6 ==> and she decided to put me up for adoption.


# 4. Stemming

In [48]:
porter_stemmer = PorterStemmer()
words_stemmed = [porter_stemmer.stem(word) for word in words_tokenized]

stemmed_sentences =[]
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [porter_stemmer.stem(word) for word in words]
    stemmed_sentences.append(words)

for index in range(len(stemmed_sentences)):
    print(str(index+1)+'.',stemmed_sentences[index])
    

1. ['I', 'drop', 'out', 'of', 'reed', 'colleg', 'after', 'the', 'first', '6', 'month', ',', 'but', 'then', 'stay', 'around', 'as', 'a', 'drop-in', 'for', 'anoth', '18', 'month', 'or', 'so', 'befor', 'I', 'realli', 'quit', '.']
2. ['So', 'whi', 'did', 'I', 'drop', 'out', '?']
3. ['It', 'start', 'befor', 'I', 'wa', 'born', '.']
4. ['My', 'biolog', 'mother', 'wa', 'a', 'young', ',', 'unw', 'colleg', 'graduat', 'student', ',', 'and', 'she', 'decid', 'to', 'put', 'me', 'up', 'for', 'adopt', '.']


In [74]:
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(names) + 1)
print('\n', formatted_text.format('INPUT WORD', *names),'\n', '='*70)

# Stem each word and display the output
for word in words_tokenized[:29]:
    output = [word, porter_stemmer.stem(word), 
            lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))


       INPUT WORD          PORTER       LANCASTER        SNOWBALL 
               I               I               i               i
         dropped            drop            drop            drop
             out             out             out             out
              of              of              of              of
            Reed            reed             ree            reed
         College          colleg          colleg          colleg
           after           after             aft           after
             the             the             the             the
           first           first           first           first
               6               6               6               6
          months           month           month           month
               ,               ,               ,               ,
             but             but             but             but
            then            then            then            then
          stayed      

# 5. Lemmatization

In [63]:
lemmatizer = WordNetLemmatizer()

words_lemmatized = [lemmatizer.lemmatize(word) for word in words_tokenized]

lemmatized_sentences =[]
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_sentences.append(words)
    
for index in range(len(lemmatized_sentences)):
    print(str(index+1)+'.',lemmatized_sentences[index]) 

1. ['I', 'dropped', 'out', 'of', 'Reed', 'College', 'after', 'the', 'first', '6', 'month', ',', 'but', 'then', 'stayed', 'around', 'a', 'a', 'drop-in', 'for', 'another', '18', 'month', 'or', 'so', 'before', 'I', 'really', 'quit', '.']
2. ['So', 'why', 'did', 'I', 'drop', 'out', '?']
3. ['It', 'started', 'before', 'I', 'wa', 'born', '.']
4. ['My', 'biological', 'mother', 'wa', 'a', 'young', ',', 'unwed', 'college', 'graduate', 'student', ',', 'and', 'she', 'decided', 'to', 'put', 'me', 'up', 'for', 'adoption', '.']


In [73]:
names = ['snowball stemmer', 'lemmatizer noun', 'lemmatizer verb']
formatted_text = '{:>20}' * (len(names) + 1)
print('\n', formatted_text.format('INPUT WORD', *names),'\n', '='*88)

for word in words_tokenized:
    output = [word, 
              snowball.stem(word), 
              lemmatizer.lemmatize(word), 
              lemmatizer.lemmatize(word, pos='v')]
    print(formatted_text.format(*output))


           INPUT WORD    snowball stemmer     lemmatizer noun     lemmatizer verb 
                   I                   i                   I                   I
             dropped                drop             dropped                drop
                 out                 out                 out                 out
                  of                  of                  of                  of
                Reed                reed                Reed                Reed
             College              colleg             College             College
               after               after               after               after
                 the                 the                 the                 the
               first               first               first               first
                   6                   6                   6                   6
              months               month               month              months
                   ,     

# 6. Bag of words

In [137]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(sentences)

vocabulary = vectorizer.get_feature_names()

print(vocabulary)
print(X.toarray())

word_nr = 1
sentence_nr = 3
print('Word \"',vocabulary[word_nr],'\" appears', X.toarray()[sentence_nr,word_nr],'times in sentence:', sentences[sentence_nr])


['18', 'adoption', 'after', 'and', 'another', 'around', 'as', 'before', 'biological', 'born', 'but', 'college', 'decided', 'did', 'drop', 'dropped', 'first', 'for', 'graduate', 'in', 'it', 'me', 'months', 'mother', 'my', 'of', 'or', 'out', 'put', 'quit', 'really', 'reed', 'she', 'so', 'started', 'stayed', 'student', 'the', 'then', 'to', 'unwed', 'up', 'was', 'why', 'young']
[[1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 1 0 0 2 0 0 1 1 1 0 1 1 1 0 1 0 1
  0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0
  1 0 0 1 1 1 1 0 1]]
Word " adoption " appears 1 times in sentence: My biological mother was 
a young, unwed college graduate student, and she decided to put me up for adoption.


In [127]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))

X2 = vectorizer2.fit_transform(sentences)

print(vectorizer2.get_feature_names())

print(X2.toarray())

['18 months', 'after the', 'and she', 'another 18', 'around as', 'as drop', 'before really', 'before was', 'biological mother', 'but then', 'college after', 'college graduate', 'decided to', 'did drop', 'drop in', 'drop out', 'dropped out', 'first months', 'for adoption', 'for another', 'graduate student', 'in for', 'it started', 'me up', 'months but', 'months or', 'mother was', 'my biological', 'of reed', 'or so', 'out of', 'put me', 'really quit', 'reed college', 'she decided', 'so before', 'so why', 'started before', 'stayed around', 'student and', 'the first', 'then stayed', 'to put', 'unwed college', 'up for', 'was born', 'was young', 'why did', 'young unwed']
[[1 1 0 1 1 1 1 0 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0 1
  0 0 1 0 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 0 0 0 0