# Basics of NLP
## Topics covered:
1. Tokenization
2. Removing stop words
3. Normalization

In [57]:
!pip install wordcloud



In [58]:
import wordcloud
import nltk       #natural language tool kit

import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string

In [59]:
# Fetch the NLTK data packages used by this notebook. Packages that are
# already installed are reported as up-to-date rather than downloaded again.

# English stop-word list, read below through nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet corpus -- presumably for the lemmatization section; confirm further down.
nltk.download('wordnet')
# Punkt models -- presumably what nltk.sent_tokenize relies on; confirm against NLTK docs.
nltk.download('punkt')
# POS-tagger model -- not used in the cells shown here; presumably needed later.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Tokenization
## Tokenization is the process of splitting a string of text into a list of tokens. A token can be thought of as a part of a larger unit: a word is a token in a sentence, and a sentence is a token in a paragraph.

In [60]:
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer
# Sample text shared by every tokenizer demo below: three short sentences,
# including a comma-separated list and a possessive ("bob's").
sentence_1 = (
    "ram, shyam and bob are good boys. "
    "hello world. "
    "bob's a nice boy."
)

## Sentence tokenization
### Delimiter = full stops

In [61]:
# Sentence-level split: each token is one whole sentence of the sample text.
sentence_tokens = nltk.sent_tokenize(sentence_1)
sentence_token_count = len(sentence_tokens)

print("tokens = ", sentence_tokens)
print("number of tokens = ", sentence_token_count)

tokens =  ['ram, shyam and bob are good boys.', 'hello world.', "bob's a nice boy."]
number of tokens =  3


## White space tokenization
### Delimiter = white space

In [62]:
# Split on whitespace only, so punctuation stays glued to the neighbouring
# word (e.g. 'ram,' and 'boys.').
tk = WhitespaceTokenizer()
whitespace_tokens = tk.tokenize(sentence_1)
whitespace_token_count = len(whitespace_tokens)

print("tokens = ", whitespace_tokens)
print("number of tokens = ", whitespace_token_count)

tokens =  ['ram,', 'shyam', 'and', 'bob', 'are', 'good', 'boys.', 'hello', 'world.', "bob's", 'a', 'nice', 'boy.']
number of tokens =  13


## Word punctuation tokenizer
### Separates punctuation from words

In [63]:
# Every punctuation mark becomes its own token, so "bob's" is broken
# into 'bob', "'" and 's'.
tk = WordPunctTokenizer()
word_punctuation_tokens = tk.tokenize(sentence_1)
word_punctuation_token_count = len(word_punctuation_tokens)

print("tokens = ", word_punctuation_tokens)
print("number of tokens = ", word_punctuation_token_count)

tokens =  ['ram', ',', 'shyam', 'and', 'bob', 'are', 'good', 'boys', '.', 'hello', 'world', '.', 'bob', "'", 's', 'a', 'nice', 'boy', '.']
number of tokens =  19


## Treebank Word Tokenizer


In [64]:
# Treebank-style split: commas and the possessive "'s" are separated out,
# while full stops inside the text stay attached ('boys.', 'world.') and
# only the final one is split off.
tk = TreebankWordTokenizer()
tree_bank_tokens = tk.tokenize(sentence_1)
tree_bank_token_count = len(tree_bank_tokens)

print("tokens = ", tree_bank_tokens)
print("number of tokens = ", tree_bank_token_count)

tokens =  ['ram', ',', 'shyam', 'and', 'bob', 'are', 'good', 'boys.', 'hello', 'world.', 'bob', "'s", 'a', 'nice', 'boy', '.']
number of tokens =  16


------

# Token normalization
## Text normalization is the process of transforming text into a canonical form, that is, bringing a sentence to a predefined standard. It can be done in two ways:

### 1. Stemming
### 2. Lemmatization

----

# Stop Words

## Before we normalize, we need to get rid of the stop words. Stop words are words that are common in any language, like 'the' and 'and', and which don't add value to the analysis.

In [65]:
from nltk.corpus import stopwords
# Hold NLTK's English stop-word list as a set so later membership checks
# are constant-time lookups.
stop_words = set(stopwords.words('english'))
stop_word_count = len(stop_words)

print("stop words count = ", stop_word_count)
print("stop words in english are : \n")
print(stop_words)

stop words count =  179
stop words in english are : 

{'should', 'during', 'was', "didn't", 'why', 'y', 'a', 'few', 'them', 'yourselves', 'out', "mightn't", 'after', 'is', 'can', 'by', 'me', 'd', 'ain', 'does', 'will', 'on', 'be', 'further', 'themselves', "weren't", 'couldn', 'it', 'hadn', 'and', 'below', 'then', "hadn't", "shouldn't", "you've", "doesn't", 'from', 'once', 'ourselves', 'up', 'are', "you'll", "you're", 'himself', 'were', 'any', "you'd", "aren't", 'yours', 'between', 'our', 'if', 'has', 'but', 'of', 'her', 'his', 'over', 'again', 'myself', 'under', 're', 'while', 'don', 'such', 'both', "shan't", 'more', 'isn', 'aren', 'weren', 'most', 'do', 'am', 'whom', "couldn't", 'o', 'here', "wouldn't", 'this', "mustn't", 'each', 'so', 'through', 'who', 'mightn', "it's", 's', 'shan', 'ours', 'what', 'wasn', "hasn't", 'itself', 'very', 'same', 'shouldn', "don't", 'nor', 'll', 'some', 'doing', 'doesn', 'too', 'its', 'about', 'when', "should've", 'you', 'hers', 'until', "needn't", 'your'

In [66]:
# Remove stop words: keep only the whitespace tokens that are not in the
# stop-word set. Matching is exact, so a stop word with punctuation attached
# (e.g. 'and,') or in a different case would be kept.
whitespace_tokens_op = [w for w in whitespace_tokens if w not in stop_words]

# The two lines show the token list before and after stop-word removal.
print("with tokens    :: ",whitespace_tokens)
print("without tokens :: ",whitespace_tokens_op)

with tokens    ::  ['ram,', 'shyam', 'and', 'bob', 'are', 'good', 'boys.', 'hello', 'world.', "bob's", 'a', 'nice', 'boy.']
without tokens ::  ['ram,', 'shyam', 'bob', 'good', 'boys.', 'hello', 'world.', "bob's", 'nice', 'boy.']


# Stemming
## Stemming is the process of removing or replacing the suffix of a word to get its root form. For example,
### wolf, wolves -> wolf
### talk, talks  -> talk
### bob, bob's   -> bob 

In [67]:
from nltk.stem import PorterStemmer