- Tokenization
- Sequencing
- Padding
- Stemming
- Lemmatization

---
## Tokenization

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Fit a tokenizer on a one-sentence corpus. fit_on_texts scans the texts and
# builds the word -> integer-id vocabulary that is inspected in the next cell.
sentence = ['We love machine learning']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)

In [4]:
# word_index is the learned vocabulary: each lower-cased word mapped to an
# integer id, with ids starting at 1.
tokenizer.word_index

{'we': 1, 'love': 2, 'machine': 3, 'learning': 4}

In [6]:
# Repeated words: 'learning' occurs twice, so it receives the lowest id.
# As the output shows, ids follow descending word frequency, and words with
# equal counts keep their order of first appearance.
sentence = ['we love machine learning and deep learning']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)
tokenizer.word_index

{'learning': 1, 'we': 2, 'love': 3, 'machine': 4, 'and': 5, 'deep': 6}

In [7]:
# Tokenization is not case sensitive: 'LEARNING' and 'learning' are counted
# as the same word.
# Tokenization also removes special characters and punctuation ('@', '.', '!'),
# so the resulting vocabulary is identical to the previous cell's.
sentence = ['@ We love machine LEARNING .....!!! and deep learning']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)
tokenizer.word_index

{'learning': 1, 'we': 2, 'love': 3, 'machine': 4, 'and': 5, 'deep': 6}

In [8]:
# A corpus of several sentences shares ONE vocabulary: every distinct word
# across all sentences receives a single id.
sentences = [
    'We are learning natural language processing',
    'We have learned computer vision',
    'We are learning from a good trainer',
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Show the combined word -> id mapping.
tokenizer.word_index

{'we': 1,
 'are': 2,
 'learning': 3,
 'natural': 4,
 'language': 5,
 'processing': 6,
 'have': 7,
 'learned': 8,
 'computer': 9,
 'vision': 10,
 'from': 11,
 'a': 12,
 'good': 13,
 'trainer': 14}

---
## Sequencing

In [9]:
# Corpus used for the sequencing demo. The vocabulary fitted here is what
# later cells use to turn sentences into lists of integer ids.
sentences = [
    'We are learning text preprocessing',
    'Tokenization refers to representing each word as a token',
    'Sequencing refers to representing text as a sequence of tokens',
    'Padding refers to adding zeros to sequences to make them all of same length',
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Inspect the learned vocabulary (most frequent words get the smallest ids).
tokenizer.word_index

{'to': 1,
 'refers': 2,
 'text': 3,
 'representing': 4,
 'as': 5,
 'a': 6,
 'of': 7,
 'we': 8,
 'are': 9,
 'learning': 10,
 'preprocessing': 11,
 'tokenization': 12,
 'each': 13,
 'word': 14,
 'token': 15,
 'sequencing': 16,
 'sequence': 17,
 'tokens': 18,
 'padding': 19,
 'adding': 20,
 'zeros': 21,
 'sequences': 22,
 'make': 23,
 'them': 24,
 'all': 25,
 'same': 26,
 'length': 27}

In [10]:
# Sequencing: replace every word of every sentence with its vocabulary id,
# producing one list of integers per sentence (lengths differ per sentence).
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[8, 9, 10, 3, 11],
 [12, 2, 1, 4, 13, 14, 5, 6, 15],
 [16, 2, 1, 4, 3, 5, 6, 17, 7, 18],
 [19, 2, 1, 20, 21, 1, 22, 1, 23, 24, 25, 7, 26, 27]]

In [11]:
# Encoding a NEW sentence with the fitted tokenizer: words that are not in the
# vocabulary ('involves', 'and') are silently dropped, so only 5 ids come back.
tokenizer.texts_to_sequences(['Text preprocessing involves tokenization, sequencing and padding'])

[[3, 11, 12, 16, 19]]

In [12]:
# Pitfall: 'does', 'not' and 'involve' are unknown and get dropped as well, so
# this negated sentence encodes to exactly the same ids as the previous one.
tokenizer.texts_to_sequences(['Text preprocessing does not involve tokenization, sequencing and padding'])

[[3, 11, 12, 16, 19]]

In [13]:
# Out-of-vocabulary (OOV) handling: passing oov_token reserves a placeholder
# entry in the vocabulary (id 1 in the output below) that stands in for any
# word not seen during fitting, instead of dropping it.
sentences = [
    'We are learning text preprocessing',
    'Tokenization refers to representing each word as a token',
    'Sequencing refers to representing text as a sequence of tokens',
    'Padding refers to adding zeros to sequences to make them all of same length',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)

# Every real word is shifted up by one id compared with the tokenizer
# that had no OOV token.
tokenizer.word_index

{'#OOV': 1,
 'to': 2,
 'refers': 3,
 'text': 4,
 'representing': 5,
 'as': 6,
 'a': 7,
 'of': 8,
 'we': 9,
 'are': 10,
 'learning': 11,
 'preprocessing': 12,
 'tokenization': 13,
 'each': 14,
 'word': 15,
 'token': 16,
 'sequencing': 17,
 'sequence': 18,
 'tokens': 19,
 'padding': 20,
 'adding': 21,
 'zeros': 22,
 'sequences': 23,
 'make': 24,
 'them': 25,
 'all': 26,
 'same': 27,
 'length': 28}

In [14]:
# With an OOV token, unknown words ('involves', 'and') are encoded as id 1
# rather than removed, so the sequence keeps one id per input word.
tokenizer.texts_to_sequences(['Text preprocessing involves tokenization, sequencing and padding'])

[[4, 12, 1, 13, 17, 1, 20]]

In [16]:
# The negated sentence now differs from the previous one: 'does', 'not' and
# 'involve' each become id 1, preserving sentence length and word positions.
tokenizer.texts_to_sequences(['Text preprocessing does not involve tokenization, sequencing and padding'])

[[4, 12, 1, 1, 1, 13, 17, 1, 20]]

---
## Padding

In [17]:
# Corpus for the padding demo: sentences of 4 and 7 words, so the encoded
# sequences come out with unequal lengths.
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'we are learning sequencing',
    'we are learning the technique of padding',
    'Machine learning and deep learning are fun',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)

# Encode the same sentences the tokenizer was fitted on.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[3, 6, 5, 2],
 [3, 4, 2, 7],
 [3, 4, 2, 8],
 [3, 4, 2, 9, 10, 11, 12],
 [5, 2, 13, 14, 2, 4, 15]]

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Default behaviour: every sequence is left-padded with zeros up to the length
# of the longest sequence (7 here), giving a rectangular int32 array.
padded_sequences = pad_sequences(sequences)
padded_sequences

array([[ 0,  0,  0,  3,  6,  5,  2],
       [ 0,  0,  0,  3,  4,  2,  7],
       [ 0,  0,  0,  3,  4,  2,  8],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]], dtype=int32)

In [20]:
# Explicit padding='pre' puts the zeros in front of each sequence; the result
# is the same as the default call.
padded_sequences = pad_sequences(sequences, padding='pre')
padded_sequences

array([[ 0,  0,  0,  3,  6,  5,  2],
       [ 0,  0,  0,  3,  4,  2,  7],
       [ 0,  0,  0,  3,  4,  2,  8],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]], dtype=int32)

In [21]:
# padding='post' appends the zeros after each sequence instead of before it.
padded_sequences = pad_sequences(sequences, padding='post')
padded_sequences

array([[ 3,  6,  5,  2,  0,  0,  0],
       [ 3,  4,  2,  7,  0,  0,  0],
       [ 3,  4,  2,  8,  0,  0,  0],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]], dtype=int32)

In [22]:
# Same corpus as before plus one much longer sentence (14 words), used to
# show how a single long sequence affects padding and truncation.
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'we are learning sequencing',
    'we are learning the technique of padding',
    'Machine learning and deep learning are fun',
    'The purpose behind text preprocessing in to give a numerical representation to text data',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)

# Re-encode with the refitted vocabulary (ids change because word counts changed).
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[3, 9, 5, 2],
 [3, 4, 2, 10],
 [3, 4, 2, 11],
 [3, 4, 2, 6, 12, 13, 14],
 [5, 2, 15, 16, 2, 4, 17],
 [6, 18, 19, 7, 20, 21, 8, 22, 23, 24, 25, 8, 7, 26]]

In [23]:
# Without maxlen, everything is padded to the longest sequence (14 ids), so
# the short sentences end up mostly zeros.
padded_sequences = pad_sequences(sequences, padding='pre')
padded_sequences

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  9,  5,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  0,  0,  0,  0,  0,  0,  3,  4,  2,  6, 12, 13, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  5,  2, 15, 16,  2,  4, 17],
       [ 6, 18, 19,  7, 20, 21,  8, 22, 23, 24, 25,  8,  7, 26]],
      dtype=int32)

In [24]:
# maxlen=8 fixes the output width: shorter sequences are padded, and the
# 14-id sequence is cut down. By default the cut removes ids from the FRONT,
# keeping the last 8.
padded_sequences = pad_sequences(sequences, padding='pre', maxlen=8)
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  5,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  6, 12, 13, 14],
       [ 0,  5,  2, 15, 16,  2,  4, 17],
       [ 8, 22, 23, 24, 25,  8,  7, 26]], dtype=int32)

In [25]:
# truncating='pre' spelled out explicitly: leading ids of over-long sequences
# are dropped, which matches the default shown in the previous cell.
padded_sequences = pad_sequences(sequences, padding='pre', maxlen=8, truncating='pre')
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  5,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  6, 12, 13, 14],
       [ 0,  5,  2, 15, 16,  2,  4, 17],
       [ 8, 22, 23, 24, 25,  8,  7, 26]], dtype=int32)

In [26]:
# truncating='post' drops ids from the END of over-long sequences instead,
# so the long sentence keeps its first 8 ids.
padded_sequences = pad_sequences(sequences, padding='pre', maxlen=8, truncating='post')
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  5,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  6, 12, 13, 14],
       [ 0,  5,  2, 15, 16,  2,  4, 17],
       [ 6, 18, 19,  7, 20, 21,  8, 22]], dtype=int32)

---
---

In [27]:
# Complete process: raw text -> vocabulary -> integer sequences -> fixed-width array.
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'we are learning sequencing',
    'we are learning the technique of padding',
    'Machine learning and deep learning are fun',
    'The purpose behind text preprocessing in to give a numerical representation to text data',
]

# 1. Create the tokenizer; unknown words will map to the '#OOV' placeholder.
tokenizer = Tokenizer(oov_token='#OOV')

# 2. Tokenization: learn the word -> id vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)

# 3. Sequencing: convert each sentence into its list of word ids.
sequences = tokenizer.texts_to_sequences(sentences)

# 4. Padding: force every sequence to exactly 10 ids (zeros added in front,
#    over-long sequences trimmed from the front).
padded_sequences = pad_sequences(sequences, maxlen=10)
padded_sequences

array([[ 0,  0,  0,  0,  0,  0,  3,  9,  5,  2],
       [ 0,  0,  0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  0,  0,  3,  4,  2,  6, 12, 13, 14],
       [ 0,  0,  0,  5,  2, 15, 16,  2,  4, 17],
       [20, 21,  8, 22, 23, 24, 25,  8,  7, 26]], dtype=int32)

# ============================================================================

---
## Stemming

In [28]:
from nltk.stem import PorterStemmer
# Stemming reduces a word to a root form by stripping suffixes.
# The same stemmer instance is reused by the following cells.
stemmer = PorterStemmer()
stemmer.stem('breaking')

'break'

In [29]:
# Capitalised input is handled too: the returned stem is lower-case.
stemmer.stem('Writing')

'write'

In [31]:
# Stem a batch of inflected forms, one result per line, to see where suffix
# stripping works and where it does not: irregular forms ('broke', 'ran') come
# back unchanged, and some stems are not dictionary words ('chang', 'troubl',
# 'knive', 'leav').
for word in (
    'breaks', 'breaking', 'broke', 'broken',
    'changes', 'changed', 'changing',
    'writes', 'writing',
    'running', 'ran', 'run',
    'trouble', 'troubled', 'troubling',
    'cats', 'knives', 'leaves',
    'jumping', 'jumped',
):
    print(stemmer.stem(word))

break
break
broke
broken
chang
chang
chang
write
write
run
ran
run
troubl
troubl
troubl
cat
knive
leav
jump
jump


---
## Lemmatization

In [32]:
import nltk
# One-time setup: fetch the WordNet corpus used by WordNetLemmatizer below.
# Requires network access on first run; the call returns True on success.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [33]:
from nltk.stem import WordNetLemmatizer
# Lemmatization maps a word to its dictionary form (lemma) using WordNet.
# No pos argument is given here, so the library default part of speech
# (noun in NLTK) is used: 'breaks' is read as the plural of the noun 'break'.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('breaks')

'break'

In [34]:
# pos = 'v' tells the lemmatizer to treat the word as a verb.
lemmatizer.lemmatize('breaks', pos = 'v')

'break'

In [37]:
# Lemmatize the same word list used in the stemming demo, one lemma per line.
# Each word is paired with the part of speech it should be read as:
#   'v' = verb, 'n' = noun (NLTK's default when pos is omitted).
# Unlike stemming, irregular forms resolve to real dictionary words
# ('broke' -> 'break', 'ran' -> 'run', 'knives' -> 'knife', 'leaves' -> 'leaf').
#
# 'cats' is tagged as a noun like the other plural nouns. Tagging it as a verb
# only produced 'cat' because WordNet also happens to list "cat" as a verb.
for word, pos in (
    ('breaks', 'v'), ('breaking', 'v'), ('broke', 'v'), ('broken', 'v'),
    ('changes', 'v'), ('changed', 'v'), ('changing', 'v'),
    ('writes', 'v'), ('writing', 'v'),
    ('running', 'v'), ('ran', 'v'), ('run', 'v'),
    ('trouble', 'v'), ('troubled', 'v'), ('troubling', 'v'),
    ('cats', 'n'), ('knives', 'n'), ('leaves', 'n'),
    ('jumping', 'v'), ('jumped', 'v'),
):
    print(lemmatizer.lemmatize(word, pos=pos))

break
break
break
break
change
change
change
write
write
run
run
run
trouble
trouble
trouble
cat
knife
leaf
jump
jump
