NLTK https://pythonprogramming.net/stop-words-nltk-tutorial/?completed=/tokenizing-words-sentences-nltk-tutorial/

In [1]:
import nltk

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

In [4]:
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [5]:
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


### Stop Words
The process of converting data to something a computer can understand is referred to as "pre-processing." One of the major forms of pre-processing is going to be filtering out useless data. In natural language processing, useless words (data), are referred to as **stop words**.

In [6]:
from nltk.corpus import stopwords

In [7]:
example_sent = "This is a sample sentence, showing off the stop words filtration."


In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
word_tokens = word_tokenize(example_sent)


In [10]:
filtered_sentence = [w for w in word_tokens if not w in stop_words]

In [11]:
filtered_sentence = []

In [12]:
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### Stemming words
Many variations of words carry the same meaning

One of the most popular stemming algorithms is the Porter stemmer, which has been around since 1979.



In [13]:
from nltk.stem import PorterStemmer

In [14]:
ps = PorterStemmer()

In [15]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [16]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [17]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


### Part of Speech Tagging

labeling words in a sentence as nouns, adjectives, verbs...etc. Even more impressive, it also labels by tense, and more.


In [28]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import tkinter 

### corpora 

The NLTK corpus is a massive dump of all kinds of natural language data sets that are definitely worth taking a look at.



### Chunking 

One of the main goals of chunking is to group into what are known as "noun phrases." These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with the words that are in relation to them.

In [19]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [20]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [31]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))

In [32]:
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

(S
  At/IN
  the/DT
  start/NN
  of/IN
  2006/CD
  ,/,
  more/JJR
  than/IN
  half/PDT
  the/DT
  people/NNS
  of/IN
  our/PRP$
  world/NN
  live/VBP
  in/IN
  democratic/JJ
  nations/NNS
  ./.)
(S
  And/CC
  we/PRP
  do/VBP
  not/RB
  forget/VB
  the/DT
  other/JJ
  half/NN
  --/:
  in/IN
  places/NNS
  like/IN
  (Chunk Syria/NNP)
  and/CC
  (Chunk Burma/NNP)
  ,/,
  (Chunk Zimbabwe/NNP)
  ,/,
  (Chunk North/NNP Korea/NNP)
  ,/,
  and/CC
  (Chunk Iran/NNP)
  --/:
  because/IN
  the/DT
  demands/NNS
  of/IN
  justice/NN
  ,/,
  and/CC
  the/DT
  peace/NN
  of/IN
  this/DT
  world/NN
  ,/,
  require/VBP
  their/PRP$
  freedom/NN
  ,/,
  as/RB
  well/RB
  ./.)
(Chunk Syria/NNP)
(Chunk Burma/NNP)
(Chunk Zimbabwe/NNP)
(Chunk North/NNP Korea/NNP)
(Chunk Iran/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  (Chunk President/NNP George/NNP W./NNP Bush/NNP)
  delivers/VBZ
  his/PRP$
  State/NN
  of/IN
  the/DT
  (Chunk Union/NNP Address/NNP)
  at/IN
  the/DT
  (Chunk Capitol

(S
  (Chunk Iraqis/NNP)
  are/VBP
  showing/VBG
  their/PRP$
  courage/NN
  every/DT
  day/NN
  ,/,
  and/CC
  we/PRP
  are/VBP
  proud/JJ
  to/TO
  be/VB
  their/PRP$
  allies/NNS
  in/IN
  the/DT
  cause/NN
  of/IN
  freedom/NN
  ./.)
(Chunk Iraqis/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  Our/PRP$
  work/NN
  in/IN
  (Chunk Iraq/NNP)
  is/VBZ
  difficult/JJ
  because/IN
  our/PRP$
  enemy/NN
  is/VBZ
  brutal/JJ
  ./.)
(Chunk Iraq/NNP)
(S
  But/CC
  that/DT
  brutality/NN
  has/VBZ
  not/RB
  stopped/VBN
  the/DT
  dramatic/JJ
  progress/NN
  of/IN
  a/DT
  new/JJ
  democracy/NN
  ./.)
(S
  In/IN
  less/JJR
  than/IN
  three/CD
  years/NNS
  ,/,
  the/DT
  nation/NN
  has/VBZ
  gone/VBN
  from/IN
  dictatorship/NN
  to/TO
  liberation/NN
  ,/,
  to/TO
  sovereignty/VB
  ,/,
  to/TO
  a/DT
  constitution/NN
  ,/,
  to/TO
  national/JJ
  elections/NNS
  ./.)
(S
  At/IN
  the/DT
  same/JJ
  time/NN
  ,/,
  our/PRP$
  coalition/NN
  has/VBZ
  been/VBN
  relentl

(S (Chunk Welcome/NNP) ./.)
(Chunk Welcome/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  Our/PRP$
  nation/NN
  is/VBZ
  grateful/JJ
  to/TO
  the/DT
  fallen/VBN
  ,/,
  who/WP
  live/VBP
  in/IN
  the/DT
  memory/NN
  of/IN
  our/PRP$
  country/NN
  ./.)
(S
  We/PRP
  're/VBP
  grateful/JJ
  to/TO
  all/DT
  who/WP
  volunteer/VBP
  to/TO
  wear/VB
  our/PRP$
  nation/NN
  's/POS
  uniform/NN
  --/:
  and/CC
  as/IN
  we/PRP
  honor/VBP
  our/PRP$
  brave/NN
  troops/NNS
  ,/,
  let/VB
  us/PRP
  never/RB
  forget/VBP
  the/DT
  sacrifices/NNS
  of/IN
  (Chunk America/NNP)
  's/POS
  military/JJ
  families/NNS
  ./.)
(Chunk America/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  Our/PRP$
  offensive/JJ
  against/IN
  terror/NN
  involves/VBZ
  more/JJR
  than/IN
  military/JJ
  action/NN
  ./.)
(S
  Ultimately/RB
  ,/,
  the/DT
  only/JJ
  way/NN
  to/TO
  defeat/VB
  the/DT
  terrorists/NNS
  is/VBZ
  to/TO
  defeat/VB
  their/PRP$
  dark/JJ

(S
  Short-changing/VBG
  these/DT
  efforts/NNS
  would/MD
  increase/VB
  the/DT
  suffering/NN
  and/CC
  chaos/NN
  of/IN
  our/PRP$
  world/NN
  ,/,
  undercut/JJ
  our/PRP$
  long-term/JJ
  security/NN
  ,/,
  and/CC
  dull/VB
  the/DT
  conscience/NN
  of/IN
  our/PRP$
  country/NN
  ./.)
(S
  I/PRP
  urge/VBP
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  to/TO
  serve/VB
  the/DT
  interests/NNS
  of/IN
  (Chunk America/NNP)
  by/IN
  showing/VBG
  the/DT
  compassion/NN
  of/IN
  (Chunk America/NNP)
  ./.)
(Chunk Congress/NNP)
(Chunk America/NNP)
(Chunk America/NNP)
(S
  Our/PRP$
  country/NN
  must/MD
  also/RB
  remain/VB
  on/IN
  the/DT
  offensive/JJ
  against/IN
  terrorism/NN
  here/RB
  at/IN
  home/NN
  ./.)
(S
  The/DT
  enemy/NN
  has/VBZ
  not/RB
  lost/VBN
  the/DT
  desire/NN
  or/CC
  capability/NN
  to/TO
  attack/VB
  us/PRP
  ./.)
(S
  Fortunately/RB
  ,/,
  this/DT
  nation/NN
  has/VBZ
  superb/VBN
  professionals/NNS
  in/IN
  law/NN
  enforcement/NN
  ,/

KeyboardInterrupt: 

### Chinking
Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [36]:
def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
        
    except Exception as e:
        print(str(e))

process_content()

(S (Chunk 31/CD ,/, 2006/CD ./.))
(Chunk 31/CD ,/, 2006/CD ./.)
(S
  (Chunk White/NNP House/NNP photo/NN)
  by/IN
  (Chunk Eric/NNP DraperEvery/NNP time/NN I/PRP)
  'm/VBP
  (Chunk invited/JJ)
  to/TO
  this/DT
  (Chunk rostrum/NN ,/, I/PRP)
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  (Chunk privilege/NN ,/, and/CC mindful/NN)
  of/IN
  the/DT
  (Chunk history/NN we/PRP)
  've/VBP
  seen/VBN
  (Chunk together/RB ./.))
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN I/PRP)
(Chunk invited/JJ)
(Chunk rostrum/NN ,/, I/PRP)
(Chunk privilege/NN ,/, and/CC mindful/NN)
(Chunk history/NN we/PRP)
(Chunk together/RB ./.)
(S
  (Chunk We/PRP)
  have/VBP
  gathered/VBN
  under/IN
  this/DT
  (Chunk Capitol/NNP dome/NN)
  in/IN
  (Chunk moments/NNS)
  of/IN
  (Chunk
    national/JJ
    mourning/NN
    and/CC
    national/JJ
    achievement/NN
    ./.))
(Chunk We/PRP)
(Chunk Capitol/NNP dome/NN)
(Chunk moments/NNS)
(Chunk national/JJ mourning/NN and/CC national/JJ achievemen

(S (Chunk (/( Applause/NNP ./. )/)))
(Chunk (/( Applause/NNP ./. )/))
(S
  (Chunk Even/RB)
  in/IN
  the/DT
  (Chunk face/NN)
  of/IN
  (Chunk
    higher/JJR
    energy/NN
    prices/NNS
    and/CC
    natural/JJ
    disasters/NNS
    ,/,)
  the/DT
  (Chunk American/JJ people/NNS)
  have/VBP
  turned/VBN
  in/IN
  an/DT
  (Chunk economic/JJ performance/NN that/WDT)
  is/VBZ
  the/DT
  (Chunk envy/NN)
  of/IN
  the/DT
  (Chunk world/NN ./.))
(Chunk Even/RB)
(Chunk face/NN)
(Chunk
  higher/JJR
  energy/NN
  prices/NNS
  and/CC
  natural/JJ
  disasters/NNS
  ,/,)
(Chunk American/JJ people/NNS)
(Chunk economic/JJ performance/NN that/WDT)
(Chunk envy/NN)
(Chunk world/NN ./.)
(S
  The/DT
  (Chunk American/JJ economy/NN)
  is/VBZ
  (Chunk preeminent/JJ ,/, but/CC we/PRP can/MD not/RB)
  afford/VB
  to/TO
  be/VB
  (Chunk complacent/JJ ./.))
(Chunk American/JJ economy/NN)
(Chunk preeminent/JJ ,/, but/CC we/PRP can/MD not/RB)
(Chunk complacent/JJ ./.)
(S
  In/IN
  a/DT
  (Chunk dynamic/JJ world

(S
  (Chunk Martin/NNP Luther/NNP King/NNP could/MD)
  have/VB
  stopped/VBN
  at/IN
  (Chunk Birmingham/NNP or/CC)
  at/IN
  (Chunk Selma/NNP ,/, and/CC)
  achieved/VBD
  (Chunk only/RB half/PDT)
  a/DT
  (Chunk victory/NN)
  over/IN
  (Chunk segregation/NN ./.))
(Chunk Martin/NNP Luther/NNP King/NNP could/MD)
(Chunk Birmingham/NNP or/CC)
(Chunk Selma/NNP ,/, and/CC)
(Chunk only/RB half/PDT)
(Chunk victory/NN)
(Chunk segregation/NN ./.)
(S
  The/DT
  (Chunk United/NNP States/NNPS could/MD)
  have/VB
  accepted/VBN
  the/DT
  (Chunk permanent/JJ division/NN)
  of/IN
  (Chunk Europe/NNP ,/, and/CC)
  been/VBN
  (Chunk complicit/NNS)
  in/IN
  the/DT
  (Chunk oppression/NN)
  of/IN
  (Chunk others/NNS ./.))
(Chunk United/NNP States/NNPS could/MD)
(Chunk permanent/JJ division/NN)
(Chunk Europe/NNP ,/, and/CC)
(Chunk complicit/NNS)
(Chunk oppression/NN)
(Chunk others/NNS ./.)
(S
  (Chunk Today/NN ,/,)
  having/VBG
  come/VBN
  (Chunk far/RB)
  in/IN
  (Chunk our/PRP$ own/JJ historical/JJ j

### Named Entity Recognition
The idea is to have the machine immediately be able to pull out "entities" like people, places, things, locations, monetary figures, and more.

In [37]:
def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print(namedEnt)
           
    except Exception as e:
        print(str(e))


process_content()

(S 31/CD ,/, 2006/CD ./.)
(S
  (NE White/NNP House/NNP)
  photo/NN
  by/IN
  (NE Eric/NNP)
  DraperEvery/NNP
  time/NN
  I/PRP
  'm/VBP
  invited/JJ
  to/TO
  this/DT
  rostrum/NN
  ,/,
  I/PRP
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  privilege/NN
  ,/,
  and/CC
  mindful/NN
  of/IN
  the/DT
  history/NN
  we/PRP
  've/VBP
  seen/VBN
  together/RB
  ./.)
(S
  We/PRP
  have/VBP
  gathered/VBN
  under/IN
  this/DT
  Capitol/NNP
  dome/NN
  in/IN
  moments/NNS
  of/IN
  national/JJ
  mourning/NN
  and/CC
  national/JJ
  achievement/NN
  ./.)
(S
  We/PRP
  have/VBP
  served/VBN
  (NE America/NNP)
  through/IN
  one/CD
  of/IN
  the/DT
  most/RBS
  consequential/JJ
  periods/NNS
  of/IN
  our/PRP$
  history/NN
  --/:
  and/CC
  it/PRP
  has/VBZ
  been/VBN
  my/PRP$
  honor/NN
  to/TO
  serve/VB
  with/IN
  you/PRP
  ./.)
(S
  In/IN
  a/DT
  system/NN
  of/IN
  two/CD
  parties/NNS
  ,/,
  two/CD
  chambers/NNS
  ,/,
  and/CC
  two/CD
  elected/JJ
  branches/NNS
  ,/,
  there/EX
  will/MD
 

(S
  Raising/VBG
  up/RP
  a/DT
  democracy/NN
  requires/VBZ
  the/DT
  rule/NN
  of/IN
  law/NN
  ,/,
  and/CC
  protection/NN
  of/IN
  minorities/NNS
  ,/,
  and/CC
  strong/JJ
  ,/,
  accountable/JJ
  institutions/NNS
  that/IN
  last/JJ
  longer/JJR
  than/IN
  a/DT
  single/JJ
  vote/NN
  ./.)
(S
  The/DT
  great/JJ
  people/NNS
  of/IN
  (NE Egypt/NNP)
  have/VBP
  voted/VBN
  in/IN
  a/DT
  multi-party/JJ
  presidential/JJ
  election/NN
  --/:
  and/CC
  now/RB
  their/PRP$
  government/NN
  should/MD
  open/VB
  paths/NNS
  of/IN
  peaceful/JJ
  opposition/NN
  that/WDT
  will/MD
  reduce/VB
  the/DT
  appeal/NN
  of/IN
  radicalism/NN
  ./.)
(S
  The/DT
  (NE Palestinian/JJ)
  people/NNS
  have/VBP
  voted/VBN
  in/IN
  elections/NNS
  ./.)
(S
  And/CC
  now/RB
  the/DT
  leaders/NNS
  of/IN
  (NE Hamas/NNP)
  must/MD
  recognize/VB
  (NE Israel/NNP)
  ,/,
  disarm/NN
  ,/,
  reject/JJ
  terrorism/NN
  ,/,
  and/CC
  work/NN
  for/IN
  lasting/VBG
  peace/NN
  ./.)
(S (/( (N

(S
  And/CC
  we/PRP
  must/MD
  have/VB
  a/DT
  rational/JJ
  ,/,
  humane/JJ
  guest/JJS
  worker/NN
  program/NN
  that/WDT
  rejects/VBZ
  amnesty/JJ
  ,/,
  allows/VBZ
  temporary/JJ
  jobs/NNS
  for/IN
  people/NNS
  who/WP
  seek/VBP
  them/PRP
  legally/RB
  ,/,
  and/CC
  reduces/NNS
  smuggling/VBG
  and/CC
  crime/NN
  at/IN
  the/DT
  border/NN
  ./.)
(S (/( (NE Applause/NNP) ./. )/))
(S
  Keeping/VBG
  (NE America/NNP)
  competitive/JJ
  requires/VBZ
  affordable/JJ
  health/NN
  care/NN
  ./.)
(S (/( (NE Applause/NNP) ./. )/))
(S
  Our/PRP$
  government/NN
  has/VBZ
  a/DT
  responsibility/NN
  to/TO
  provide/VB
  health/NN
  care/NN
  for/IN
  the/DT
  poor/JJ
  and/CC
  the/DT
  elderly/JJ
  ,/,
  and/CC
  we/PRP
  are/VBP
  meeting/VBG
  that/IN
  responsibility/NN
  ./.)
(S (/( (NE Applause/NNP) ./. )/))
(S
  For/IN
  all/DT
  Americans/NNPS
  --/:
  for/IN
  all/DT
  Americans/NNPS
  ,/,
  we/PRP
  must/MD
  confront/VB
  the/DT
  rising/VBG
  cost/NN
  of/IN
  car

### Lemmatizing 

A very similar operation to stemming is called lemmatizing. The major difference between these is, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.

So, your root stem, meaning the word you end up with, is not something you can just look up in a dictionary, but you can look up a lemma.

In [38]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


### Wordnet 
 lexical database for the English language

In [39]:
from nltk.corpus import wordnet

In [40]:
syns = wordnet.synsets("program")

In [41]:
print(syns[0].name())

plan.n.01


In [42]:
print(syns[0].lemmas()[0].name())

plan


In [43]:
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [44]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [45]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'honorable', 'in_force', 'beneficial', 'adept', 'safe', 'trade_good', 'well', 'estimable', 'dependable', 'upright', 'near', 'honest', 'in_effect', 'expert', 'unspoiled', 'proficient', 'serious', 'respectable', 'good', 'skilful', 'sound', 'goodness', 'ripe', 'salutary', 'undecomposed', 'right', 'practiced', 'skillful', 'full', 'just', 'secure', 'effective', 'thoroughly', 'soundly', 'dear', 'unspoilt', 'commodity'}
{'badness', 'ill', 'evil', 'bad', 'evilness'}


we can also easily use WordNet to compare the similarity of two words and their tenses, by incorporating the Wu and Palmer method for semantic related-ness.

In [46]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [47]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [48]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


### Text Classification

The goal may be to try to classify text as about politics or the military. May try to classify it by the gender of the author who wrote it. May identify a body of text as either spam or not spam, for things like email filters. In our case, we're going to try to create a sentiment analysis algorithm.

In [49]:
import random
from nltk.corpus import movie_reviews


In each category (we have pos or neg), take all of the file IDs (each review has its own ID), then store the word_tokenized version (a list of words) for the file ID, followed by the positive or negative label in one big list.

In [51]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

print(documents[1])


(['a', 'frequent', 'error', 'is', 'the', 'categorization', 'of', 'a', 'terrorist', 'as', 'a', 'soldier', 'or', 'a', 'common', 'criminal', '.', 'a', 'soldier', 'commits', 'acts', 'of', 'violence', 'sanctioned', 'by', 'one', 'nation', '-', 'state', 'against', 'another', '.', 'a', 'common', 'criminal', 'commits', 'acts', 'of', 'violence', 'for', 'personal', 'gain', '.', 'a', 'terrorist', 'employs', 'random', 'violence', 'as', 'a', 'means', 'to', 'a', 'political', 'end', '.', 'if', 'the', 'target', 'nation', 'does', 'not', 'affect', 'the', 'desired', 'change', 'in', 'policy', ',', 'the', 'violence', 'continues', '.', 'it', 'is', 'the', 'fear', 'resulting', 'from', 'the', 'fact', 'that', 'the', 'terrorist', 'may', 'strike', 'at', 'anyone', '(', 'even', 'total', 'innocents', ')', ',', 'anywhere', ',', 'at', 'anytime', 'that', 'gives', 'the', 'figure', 'his', 'label', '.', 'edward', 'zwick', "'", 's', 'the', 'siege', 'explores', 'the', 'possibility', 'of', 'this', 'kind', 'of', 'violence', 't

In [52]:

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))


[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [53]:
print(all_words["stupid"])

253


### Converting words to Features 
feature lists of words from positive reviews and words from the negative reviews

In [54]:
word_features = list(all_words.keys())[:3000]

In [55]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [56]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



In [57]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

### Naive Bayes Classifier
Sseparate our data into training and testing sets. This is a pretty popular algorithm used in text classification.

Now shuffle data. 1900 shuffled reviews containing both positive and negative. Keep 100 for test

In [78]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]


In [59]:
classifier = nltk.NaiveBayesClassifier.train(training_set)


In [60]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 75.0


In [61]:

classifier.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
                 martian = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0

### Saving Classifiers
use the Pickle module to go ahead and serialize our classifier object, so that all we need to do is load that file in real quick.

In [63]:
import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [64]:
classifier_f = open("naivebayes.pickle", "rb")
classifier_saved = pickle.load(classifier_f)
classifier_f.close()

In [65]:
classifier_saved.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
                 martian = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0

### Scikit-Learn Sklearn

In [66]:
from nltk.classify.scikitlearn import SklearnClassifier

In [67]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB


In [93]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.8


In [92]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BernoulliNB_classifier, testing_set))


BernoulliNB accuracy percent: 0.75


In [81]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [82]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)



Original Naive Bayes Algo accuracy percent: 75.0


In [94]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)


LogisticRegression_classifier accuracy percent: 84.0


In [84]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)


SGDClassifier_classifier accuracy percent: 78.0




In [85]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)


SVC_classifier accuracy percent: 76.0


In [97]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)


LinearSVC_classifier accuracy percent: 81.0


In [96]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC_classifier accuracy percent: 87.0


### Combining Algorithms

In [88]:
from nltk.classify import ClassifierI
from statistics import mode

In [89]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [90]:
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()


In [95]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 83.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 85.71428571428571
