# NLP using nltk

In [1]:
from nltk.corpus import brown

# Data Collection

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data = brown.sents(categories = 'editorial')
print(type(data),len(data))
print(data)

<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'> 2997
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


# NLP pipeline
#Data collection
#Tokenisation,Stopward Removal,Stemmning
#Building a common vocab
#vectorise the documents
#Performing Classification Clustering

# Tokenisation and Stopward Removal

In [4]:
text = "It was a very good day until you showed up and ruined it. There were many people around were pleasant"

In [5]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [6]:
sents = sent_tokenize(text)

In [7]:
print(sents)

['It was a very good day until you showed up and ruined it.', 'There were many people around were pleasant']


In [8]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'good', 'day', 'until', 'you', 'showed', 'up', 'and', 'ruined', 'it', '.']


# StopWord Removal

In [9]:
from nltk.corpus import stopwords

In [10]:
sw = set(stopwords.words('english'))

In [11]:
print(sw,len(sw))

{'all', 'what', 'from', "isn't", 'no', 'when', 'themselves', 'or', 'up', 'this', 'with', 'in', 'while', 'for', 'you', 'y', 'some', "that'll", 's', "mightn't", 'such', 'by', 'll', "she's", 'and', 'she', 'myself', 'of', 'whom', 'do', 'but', 'only', 'on', 'ain', 'haven', 'yourself', 'so', 'is', 'more', 'am', 'same', 'to', 'then', 'm', 'after', "you'll", 'can', 'at', "you're", "haven't", 'isn', 'o', 'don', 'other', 're', 'if', "won't", 'hers', 'has', 'further', 'he', 'mightn', "don't", 'ourselves', "wouldn't", 'because', 'that', 'it', "doesn't", 'd', 'over', 'your', 'were', 'during', 'his', 'about', "weren't", 'out', 'below', 'are', 'few', 'not', 'did', 'aren', 'ours', 'itself', 'i', "couldn't", 'both', 'down', "wasn't", 'yours', 'than', 'yourselves', 'weren', "you'd", 'against', 'how', 'its', 'himself', 'mustn', 'they', 'too', "hadn't", 'being', "hasn't", "it's", 'these', 'having', 'me', 'been', 'ma', 'wouldn', 'had', 'theirs', 'couldn', 'we', 'be', "mustn't", 'them', 'most', 'now', "aren

In [12]:
def filter_words(word_list):
    useful_words= [words for words in word_list if words in sw]
    return useful_words

In [13]:
useful_words= filter_words(word_list)
print(useful_words)

['it', 'was', 'a', 'very', 'until', 'you', 'up', 'and', 'it']


In [14]:
from nltk.tokenize import RegexpTokenizer

In [15]:
tokenizer = RegexpTokenizer("[a-z0-9]+")

In [16]:
sents = "send the 50 documents to abc,def,ghi ."
print(tokenizer.tokenize(sents))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


# Stemming
- Process that transforms particular words into root words
-examples jumping,jumps,jumped -> jump

In [17]:
text = "the quick brown fox jumps over the lazy dog from high wall.Foxes loves to  make jump"

In [18]:
word_list = tokenizer.tokenize(text.lower())

In [19]:
print(word_list)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'loves', 'to', 'make', 'jump']


# Type of Stemmer
- Snowball Stemmer(mutlilingual)
- Porter Stemmer
- Lancaster Stemmer

In [20]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [21]:
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer("french")

In [22]:
ps.stem("jumped")

'jump'

In [23]:
ps.stem("jumping")

'jump'

In [24]:
ps.stem("lovely")

'love'

In [25]:
ps.stem("awesome")
ls.stem("awesome")

print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag
