# Basic NLP Pipeline
1. Data Collection
2. Tokenization, Stopwords, Stemming
3. Building a common vocab
4. Vectorizing the documents
5. Performing classification/clustering

## 1. Data Collection

In [2]:
from nltk.corpus import brown

In [3]:
# List the categories available in the Brown corpus.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
# Load the Brown corpus sentences for the "editorial" category; the printed
# preview shows each sentence as a list of word/punctuation tokens.
data = brown.sents(categories="editorial")
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [5]:
# Number of sentences in the editorial subset.
print(len(data))

2997


## 2. Tokenization, Stopwords, Stemming

### Tokenization

In [6]:
# Two-sentence sample used by the tokenization and stopword examples below.
text = "It was a very pleasant day, weather was cool and there were light showers. I went to the market to buy some fruits."
print(text)

It was a very pleasant day, weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
# Split the sample text into a list of sentences.
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [11]:
print(sents[0])
# Tokenize the lowercased first sentence; punctuation comes back as separate
# tokens. (The NLTK English stopwords used in the next step are lowercase.)
words = word_tokenize(sents[0].lower())
print(words)

It was a very pleasant day, weather was cool and there were light showers.
['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


### Stopwords removal

In [14]:
from nltk.corpus import stopwords
# Keep the English stopwords in a set; filterWords tests membership against it.
sw = set(stopwords.words('english'))
# NOTE(review): this prints the entire set, and set ordering can differ between runs.
print(sw)
print(len(sw))

{'this', "shan't", 'myself', 'why', 'was', 'mustn', 'i', 'is', 'couldn', 'not', "needn't", "you'll", 'me', 'be', 'aren', 'his', 'about', "doesn't", "hadn't", "don't", "haven't", 'and', 'nor', 'as', "she's", 'who', "didn't", 'didn', 'that', 'all', "mustn't", 'itself', 'off', 'only', 'shan', 'were', "should've", 'from', 'doesn', 'more', 'than', 'theirs', 'how', 'or', 'above', 'now', "it's", 'in', 'if', 'into', 'very', 'am', 'are', 'he', 'until', 'being', "you've", 'does', 'most', 'shouldn', 'because', 'ain', 'my', 'do', 'can', 'needn', 'himself', 'for', 'again', 'what', 'wouldn', 'up', 'no', "isn't", 've', 'by', 'mightn', 'been', 'an', 'some', 'ma', "that'll", 'our', 'yourself', 'against', 'below', 'where', 'so', 'after', 'any', 'each', 'own', 'we', 'will', 're', "weren't", 'same', "won't", 'your', 'ours', 'other', 'further', 'during', 'themselves', 'doing', 's', 'whom', "mightn't", "shouldn't", 'when', 'haven', 'under', 'of', 'don', 'have', 'here', 'isn', 'over', 'a', 'but', 'him', 'wit

### Filter words from sentence

In [26]:
def filterWords(words, stop_words=None):
    """Return the tokens from ``words`` that are not stopwords.

    Args:
        words: Iterable of string tokens. Matching is exact and
            case-sensitive, so lowercase the tokens first when filtering
            against a lowercase stopword collection.
        stop_words: Optional container of tokens to drop. When omitted, the
            notebook-level ``sw`` set is used, exactly as before.

    Returns:
        A new list with the kept tokens in their original order.
    """
    if stop_words is None:
        # Looked up at call time so the current notebook-level `sw` is used.
        stop_words = sw
    return [w for w in words if w not in stop_words]

# Show the tokens before and after stopword removal. Punctuation tokens are
# not in the stopword set, so they remain in the filtered list.
print(words)
usefulWords = filterWords(words)
print(usefulWords)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']
['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


### Tokenization using Regular Expression
- Problem with the word tokenizer: it cannot handle custom tokenization rules
- So we use the `RegexpTokenizer` class in NLTK, which tokenizes with a pattern we define

In [16]:
from nltk.tokenize import RegexpTokenizer

In [18]:
text1 = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com ."
print(text1)
# A token is a maximal run of ASCII letters and '@'. Digits, commas and dots
# are not in the character class, so the numbers are dropped and the e-mail
# address is split at its '.'.
regularExp = "[a-zA-Z@]+"
# `tokenizer` is reused by the stemming example further down.
tokenizer = RegexpTokenizer(regularExp)
print(tokenizer.tokenize(text1))

Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com .
['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


### Stemming
- Process that reduces inflected words (verb forms, plurals) to their root form (stem)
- Preserves the meaning of the sentence while reducing the number of unique tokens
- jumps, jumped, jumping => jump

In [29]:
text3 = "Foxes love to make jumps.The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall"
# Reuses the letters-and-'@' RegexpTokenizer defined earlier: "jumps.The" is
# split at the '.', and the digit in "6ft" is dropped.
wordsList = tokenizer.tokenize(text3.lower())
print(wordsList)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [30]:
# Remove stopwords; wordsList is rebound to the filtered list.
wordsList = filterWords(wordsList)
print(wordsList)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


### Stemmers
- SnowballStemmer (multilingual)
- PorterStemmer
- LancasterStemmer

In [32]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer
# One instance of each stemmer. SnowballStemmer takes a language name, so an
# English and a French instance are created.
ss = SnowballStemmer('english')
ssFrench = SnowballStemmer('french')
ps = PorterStemmer()
ls = LancasterStemmer()

In [33]:
# PorterStemmer: stem a few sample words, printing one stem per line.
for word in ("jumped", "jumping", "lovely", "awesome"):
    print(ps.stem(word))

jump
jump
love
awesom


In [34]:
# PorterStemmer vs. LancasterStemmer on the same word: in the output below,
# Lancaster produces the shorter stem.
print(ps.stem("teenager"))
print(ls.stem("teenager"))

teenag
teen


In [37]:
# SnowballStemmer: English samples first, one stem per line.
for word in ('lovely', 'teenager'):
    print(ss.stem(word))
# Same stemmer family configured for French.
for word in ('courais', 'courir'):
    print(ssFrench.stem(word))

love
teenag
cour
cour
