In [1]:
from nltk.corpus import brown, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Explore the Brown corpus from NLTK: list its categories first
categories = brown.categories()
print(categories)

# Pull the sentences of the 'humor' category and report how many there are
data = brown.sents(categories='humor')
print(len(data))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
1053


In [3]:
# A single sample sentence and a short two-sentence document, reused by later cells
sent1 = 'Dan told himself he would forget Annabelle.'
doc1 = '''It was a very pleasent day. The weather was very cool.'''

# Echo both samples, one per line
print(sent1, doc1, sep='\n')

Dan told himself he would forget Annabelle.
It was a very pleasent day. The weather was very cool.


In [4]:
# Split the sample sentence into word-level tokens
word_tokens = word_tokenize(sent1)
print(word_tokens)

# Split the sample document into its sentences
sentence_list = sent_tokenize(doc1)
print(sentence_list)

['Dan', 'told', 'himself', 'he', 'would', 'forget', 'Annabelle', '.']
['It was a very pleasent day.', 'The weather was very cool.']


In [5]:
# Tokenization using a regular expression: every token is a run of ASCII
# letters, so digits and symbols such as '+', '@' and '.' never appear in a token
sent2 = 'Contact me at +91xx or gaurav001xx@gmail.com'
regex = RegexpTokenizer('[a-zA-Z]+')

letter_tokens = regex.tokenize(sent2)
print(letter_tokens)

['Contact', 'me', 'at', 'xx', 'or', 'gaurav', 'xx', 'gmail', 'com']


In [6]:
# English stopword removal: collect NLTK's English stopword list into a set

sw = {*stopwords.words('english')}
print(sw)

def remove_stopwords(words, stopwords):
    """Return the items of ``words`` that are not in ``stopwords``.

    Order and duplicates of the surviving items are preserved. Matching is an
    exact ``in`` test, so it is case-sensitive.

    Args:
        words: iterable of tokens to filter.
        stopwords: container of tokens to drop (anything supporting ``in``).
            Inside this function the name shadows the imported
            ``nltk.corpus.stopwords`` module.

    Returns:
        A new list with the tokens that were kept.
    """
    kept = []
    for token in words:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

{'out', 'off', 'if', 'that', 'further', 'we', 'when', 'why', 's', 'be', 'wasn', 'which', 'than', 'now', "aren't", 'there', "hasn't", "you've", "won't", 'yourself', 'can', "that'll", 'because', 'aren', 'until', 'same', 'wouldn', 'very', 'shouldn', 'an', 'over', 'couldn', 'then', "you're", "needn't", 'how', 'most', 'haven', 'more', 'what', "you'll", 'down', 'won', 'has', 'only', 'under', 'so', 'just', 'needn', 'mustn', 'a', 'before', 'your', 'this', 'they', "shan't", 'while', 'with', 'whom', 'who', 'below', 'isn', 'into', 'once', 'on', 'each', 'having', 'their', 'no', 'd', "isn't", 'doesn', 'ourselves', 'his', 'is', 'am', 'are', 're', 'few', 'in', 'our', 'or', 'myself', 'above', 'll', 'ain', 'were', 'hers', 'm', 'my', 'had', 'from', 'of', 'y', 'not', 'hasn', 'both', 've', 'will', 'does', 'themselves', 'do', "couldn't", 'some', "she's", 'being', 'about', 'theirs', 'o', "mightn't", 'up', "mustn't", 'itself', 'he', 'such', 'yours', 'weren', 'ours', 'here', 'at', 'yourselves', "should've", "

In [7]:
# Whitespace-split the sample sentence, then drop the English stopwords
sent1_words = sent1.split()
print(remove_stopwords(sent1_words, sw))

['Dan', 'told', 'would', 'forget', 'Annabelle.']


In [8]:
# Stemming with the English Snowball stemmer
ss = SnowballStemmer(language='english')
stemmed = ss.stem('jumping')
print(stemmed)

# Lemmatization with the WordNet lemmatizer
wlm = WordNetLemmatizer()
lemma = wlm.lemmatize('boyss')
print(lemma)

jump
boy


In [9]:
# Building a vocabulary and vectorizing a small corpus as token counts

corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM.',
    'The nobel laurate won the hearts of the people.',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.',
]

cv = CountVectorizer()
vect_corp = cv.fit_transform(corpus)

# Show, in order: the learned token -> column index mapping, the dense count
# matrix (one row per document), and the matrix shape
for shown in (cv.vocabulary_, vect_corp.toarray(), vect_corp.shape):
    print(shown)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}
[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]
(4, 42)


In [10]:
# Reverse mapping: recover the vocabulary terms present in the third document
third_doc_vector = vect_corp[2]
sent = cv.inverse_transform(third_doc_vector)
print(sent)

[array(['the', 'nobel', 'laurate', 'won', 'hearts', 'of', 'people'],
      dtype='<U9')]


In [11]:
# Vectorization with custom tokenizer and stopword removal

def myTokenizer(doc):
    """Tokenize ``doc`` for use as a CountVectorizer ``tokenizer`` callable.

    The text is lower-cased, split with the module-level ``regex`` tokenizer,
    and then filtered through ``remove_stopwords`` using the module-level
    stopword set ``sw``.

    Args:
        doc: the document text (a string).

    Returns:
        The list of remaining tokens.
    """
    lowered = doc.lower()
    return remove_stopwords(regex.tokenize(lowered), sw)

# Vectorize the corpus with the custom tokenizer (regex tokens, stopwords removed).
# token_pattern=None: a custom tokenizer replaces the default regex pattern, and
# leaving the default in place makes recent scikit-learn warn that token_pattern
# will not be used. The resulting matrix is unaffected.
cv2 = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)
vect_corp2 = cv2.fit_transform(corpus)

# Matrix shape, then the surviving terms of each document
print(vect_corp2.shape)
print(cv2.inverse_transform(vect_corp2))

(4, 33)
[array(['indian', 'cricket', 'team', 'wins', 'world', 'cup', 'says',
       'capt', 'virat', 'kohli', 'held', 'sri', 'lanka'], dtype='<U9'), array(['indian', 'says', 'win', 'next', 'lok', 'sabha', 'elections',
       'confident', 'pm'], dtype='<U9'), array(['nobel', 'laurate', 'hearts', 'people'], dtype='<U9'), array(['indian', 'movie', 'raazi', 'exciting', 'spy', 'thriller', 'based',
       'upon', 'real', 'story'], dtype='<U9')]


In [12]:
# Testing: vectorize an unseen sentence with the already-fitted cv2
# (transform only, so the columns are those of the fitted vocabulary)

test_corpus = ['Indian team rock!']
out_corp = cv2.transform(test_corpus)
dense_counts = out_corp.toarray()
print(dense_counts)

[[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]


In [13]:
# N-grams: build a vocabulary of word bigrams and trigrams

sents = ['This is good boy.', 'This is not good girl.']

bigram_to_trigram = (2, 3)  # (min_n, max_n) for the n-gram sizes to extract
cv3 = CountVectorizer(ngram_range=bigram_to_trigram)
vect_corp3 = cv3.fit_transform(sents)

# n-gram -> column index mapping learned from the two sentences
print(cv3.vocabulary_)

{'this is': 8, 'is good': 2, 'good boy': 0, 'this is good': 9, 'is good boy': 3, 'is not': 4, 'not good': 6, 'good girl': 1, 'this is not': 10, 'is not good': 5, 'not good girl': 7}


In [14]:
# Tf-Idf normalization of the two example sentences in `sents`

tfidf = TfidfVectorizer()
# NOTE: this rebinds vect_corp, which until now held the CountVectorizer matrix
# built from `corpus`; cells that index that matrix must be run before this one.
vect_corp = tfidf.fit_transform(sents)

# Term -> column index mapping, then the dense tf-idf matrix
for shown in (tfidf.vocabulary_, vect_corp.toarray()):
    print(shown)

{'this': 5, 'is': 3, 'good': 2, 'boy': 0, 'not': 4, 'girl': 1}
[[0.63009934 0.         0.44832087 0.44832087 0.         0.44832087]
 [0.         0.53309782 0.37930349 0.37930349 0.53309782 0.37930349]]
