In [1]:
# Chapter 12 (Natural Language Processing)

In [1]:
import textblob
import nltk
from textblob import TextBlob

In [2]:
text = 'Today is a good day. Then tomorrow is likely to be a bad day.'
blob = TextBlob(text)

In [17]:
blob.sentences

[Sentence("Today is a good day."),
 Sentence("Then tomorrow is likely to be a bad day.")]

In [35]:
for sentence in blob.sentences:
    print(sentence, end = '  |  ')

Today is a good day.  |  Then tomorrow is likely to be a bad day.  |  

In [20]:
blob.words

WordList(['Today', 'is', 'a', 'good', 'day', 'Then', 'tomorrow', 'is', 'likely', 'to', 'be', 'a', 'bad', 'day'])

In [34]:
for word in blob.words:
    print(word, end = ' | ')

Today | is | a | good | day | Then | tomorrow | is | likely | to | be | a | bad | day | 

In [27]:
blob.tokens

WordList(['Today', 'is', 'a', 'good', 'day', '.', 'Then', 'tomorrow', 'is', 'likely', 'to', 'be', 'a', 'bad', 'day', '.'])

In [33]:
for token in blob.tokens:
    print(token, end = ' | ')

Today | is | a | good | day | . | Then | tomorrow | is | likely | to | be | a | bad | day | . | 

In [36]:
blob.tags

[('Today', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('day', 'NN'),
 ('Then', 'RB'),
 ('tomorrow', 'NN'),
 ('is', 'VBZ'),
 ('likely', 'JJ'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('bad', 'JJ'),
 ('day', 'NN')]

In [42]:
for tag in blob.tags:
    print(f'{tag[0]} : {tag[1]}', end = ' | ')

Today : NN | is : VBZ | a : DT | good : JJ | day : NN | Then : RB | tomorrow : NN | is : VBZ | likely : JJ | to : TO | be : VB | a : DT | bad : JJ | day : NN | 

In [43]:
blob.noun_phrases

WordList(['good day', 'bad day'])

In [45]:
for noun_phrase in blob.noun_phrases:
    print(noun_phrase, end = ' | ')

good day | bad day | 

In [46]:
blob.sentiment

Sentiment(polarity=3.700743415417188e-17, subjectivity=0.7555555555555555)

In [52]:
print(f'Polarity = {blob.sentiment.polarity}\nSubjectivity = {blob.sentiment.subjectivity}')

Polarity = 3.700743415417188e-17
Subjectivity = 0.7555555555555555


In [11]:
for sentence in blob.sentences:
    print(sentence.sentiment)

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)
Sentiment(polarity=-0.3499999999999999, subjectivity=0.8333333333333333)


In [54]:
from textblob.sentiments import NaiveBayesAnalyzer

In [55]:
blob2 = TextBlob(text, analyzer = NaiveBayesAnalyzer())
blob2.sentiment

Sentiment(classification='pos', p_pos=0.6386587215262682, p_neg=0.3613412784737319)

In [62]:
print(f'Positive = {blob2.sentiment.p_pos:.3f}\nNegative = {blob2.sentiment.p_neg:.3f}\nClassification = {blob2.sentiment.classification}')

Positive = 0.639
Negative = 0.361
Classification = pos


In [63]:
for sentence in blob2.sentences:
    print(sentence.sentiment)

Sentiment(classification='pos', p_pos=0.7265237431528468, p_neg=0.2734762568471531)
Sentiment(classification='neg', p_pos=0.4518237741969971, p_neg=0.5481762258030025)


In [None]:
# Below methods use google translate to perform such tasks, now these are not working for free
blob.detect_language()
blob.translate(to = 'es')

In [66]:
from textblob import Word

In [71]:
w = Word('similarity')
w.pluralize()

'similarities'

In [17]:
w = Word('similarities')
w.singularize()

'similarity'

In [18]:
w = Word('cacti')
w.singularize()

'cactus'

In [19]:
w = Word('cactiK')
w.spellcheck()

[('tactic', 0.3333333333333333),
 ('lactic', 0.3333333333333333),
 ('cactus', 0.3333333333333333)]

In [82]:
w = Word('yew')
w.spellcheck()

[('new', 0.42077831827658096),
 ('yes', 0.23905489923558026),
 ('yet', 0.1695621959694232),
 ('few', 0.15913829047949965),
 ('dew', 0.0034746351633078527),
 ('ye', 0.002779708130646282),
 ('pew', 0.002432244614315497),
 ('jew', 0.0010423905489923557),
 ('yer', 0.00034746351633078526),
 ('yep', 0.00034746351633078526),
 ('yea', 0.00034746351633078526),
 ('sew', 0.00034746351633078526),
 ('hew', 0.00034746351633078526)]

In [76]:
w.correct()

'varieties'

In [84]:
sent = TextBlob('Ths sentnce has soke misspelled wods')
sent.correct()

TextBlob("The sentence has some misspelled words")

In [86]:
word = Word('varieties')
word.stem()

'varieti'

In [87]:
word.lemmatize()

'variety'

In [92]:
from pathlib import Path

In [93]:
blob = TextBlob(Path('./RomeoAndJuliet.txt').read_text(encoding='utf-8'))

In [95]:
# word_counts is a dictionary containing mappings from words to their frequencies in the textblob
print(f'Type of word_counts : {type(blob.word_counts)}')
print(f'Frequency of word "romeo" in the book is {blob.word_counts['romeo']}')

Type of word_counts : <class 'collections.defaultdict'>
Frequency of word "romeo" in the book is 315


In [142]:
happy = Word('happy')
happy.definitions

['enjoying or showing or marked by joy or pleasure',
 'marked by good fortune',
 'eagerly disposed to act or to be of service',
 'well expressed and to the point']

In [146]:
happy.define(pos='n')

['enjoying or showing or marked by joy or pleasure',
 'marked by good fortune',
 'eagerly disposed to act or to be of service',
 'well expressed and to the point']

In [139]:
happy.synsets

[Synset('happy.a.01'),
 Synset('felicitous.s.02'),
 Synset('glad.s.02'),
 Synset('happy.s.04')]

In [141]:
happy.get_synsets(pos='a')

[Synset('happy.a.01'),
 Synset('felicitous.s.02'),
 Synset('glad.s.02'),
 Synset('happy.s.04')]

In [111]:
synonyms = set()
for synset in happy.synsets:
    for lemma in synset.lemmas():
        synonyms.add(lemma.name())

synonyms

{'felicitous', 'glad', 'happy', 'well-chosen'}

In [37]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [120]:
from nltk.corpus import stopwords

In [129]:
stops = stopwords.words('english')
print(stops)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she