## Chunking

In [2]:
import nltk

# Load the CoNLL-2000 training sentences with only noun-phrase (NP) chunks kept;
# every other token stays directly under the S node.
# chunk_types is passed as a list, the form used in the NLTK documentation.
# NOTE(review): a bare string such as 'NP' appears to work only through a
# substring test inside the reader — confirm against the installed NLTK version.
trees = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['NP'])
# print() gives the bracketed text form of the fourth sentence's tree.
print(trees[3])

(S
  (NP This/DT)
  has/VBZ
  increased/VBN
  (NP the/DT risk/NN)
  of/IN
  (NP the/DT government/NN)
  being/VBG
  forced/VBN
  to/TO
  increase/VB
  (NP base/NN rates/NNS)
  to/TO
  (NP 16/CD %/NN)
  from/IN
  (NP their/PRP$ current/JJ 15/CD %/NN level/NN)
  to/TO
  defend/VB
  (NP the/DT pound/NN)
  ,/,
  (NP economists/NNS)
  and/CC
  (NP foreign/JJ exchange/NN market/NN analysts/NNS)
  say/VBP
  ./.)


In [3]:
# Same corpus and sentence as before, this time keeping only verb-phrase (VP) chunks.
# chunk_types is passed as a list, the form used in the NLTK documentation.
# NOTE(review): a bare string such as 'VP' appears to work only through a
# substring test inside the reader — confirm against the installed NLTK version.
trees = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['VP'])
print(trees[3])

(S
  This/DT
  (VP has/VBZ increased/VBN)
  the/DT
  risk/NN
  of/IN
  the/DT
  government/NN
  (VP being/VBG forced/VBN to/TO increase/VB)
  base/NN
  rates/NNS
  to/TO
  16/CD
  %/NN
  from/IN
  their/PRP$
  current/JJ
  15/CD
  %/NN
  level/NN
  (VP to/TO defend/VB)
  the/DT
  pound/NN
  ,/,
  economists/NNS
  and/CC
  foreign/JJ
  exchange/NN
  market/NN
  analysts/NNS
  (VP say/VBP)
  ./.)


Demonstrate tracing by testing two chunkers with different rule ordering.

In [4]:
# Both grammars contain the same two NP rules; only the order of the rules differs.

# Specific rule (det+adj+noun) listed first, generic DT/noun rule second.
specific_first_grammar = r'''
NP: {<DT><JJ.*><NN.*>} #Chunk det+adj+noun
    {<DT|NN.*>+}      #Chunk sequences of DT and noun
    '''

# Generic DT/noun rule listed first, specific det+adj+noun rule second.
generic_first_grammar = r'''
NP:   {<DT|NN.*>+}      #Chunk sequences of DT and noun
     {<DT><JJ.*><NN.*>} #Chunk det+adj+noun
     '''

cp1 = nltk.RegexpParser(specific_first_grammar)
cp2 = nltk.RegexpParser(generic_first_grammar)

Tokenize and POS-tag a sample sentence to run both chunkers on.

In [5]:
# Build the (word, tag) pairs in two explicit steps: tokenize, then POS-tag.
sample_tokens = nltk.word_tokenize("The enchantress clutched the beautiful hair")
tagged_tokens = nltk.pos_tag(sample_tokens)
# Bare expression so the notebook displays the tagged list.
tagged_tokens

[('The', 'DT'),
 ('enchantress', 'NN'),
 ('clutched', 'VBD'),
 ('the', 'DT'),
 ('beautiful', 'JJ'),
 ('hair', 'NN')]

In [6]:
# trace=1 makes the parser print the chunk string after each rule is applied;
# the finished tree is printed afterwards.
cp1_tree = cp1.parse(tagged_tokens, trace=1)
print(cp1_tree)

# Input:
 <DT>  <NN>  <VBD>  <DT>  <JJ>  <NN> 
# Chunk det+adj+noun:
 <DT>  <NN>  <VBD> {<DT>  <JJ>  <NN>}
# Chunk sequences of DT and noun:
{<DT>  <NN>} <VBD> {<DT>  <JJ>  <NN>}
(S
  (NP The/DT enchantress/NN)
  clutched/VBD
  (NP the/DT beautiful/JJ hair/NN))


In [7]:
# Same sentence through the chunker whose generic rule comes first; trace=1
# prints the chunk string after each rule, then the finished tree is printed.
cp2_tree = cp2.parse(tagged_tokens, trace=1)
print(cp2_tree)

# Input:
 <DT>  <NN>  <VBD>  <DT>  <JJ>  <NN> 
# Chunk sequences of DT and noun:
{<DT>  <NN>} <VBD> {<DT>} <JJ> {<NN>}
# Chunk det+adj+noun:
{<DT>  <NN>} <VBD> {<DT>} <JJ> {<NN>}
(S
  (NP The/DT enchantress/NN)
  clutched/VBD
  (NP the/DT)
  beautiful/JJ
  (NP hair/NN))


## Collocations

In [3]:
# NOTE(review): wildcard import kept in case cells outside this view rely on the
# names it exposes; this cell uses qualified names and does not depend on it.
from nltk.collocations import *
import string
import random  # not used by the cells visible here; kept in case later cells need it

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Count bigrams over the "romance" category of the Brown corpus.
finder = nltk.collocations.BigramCollocationFinder.from_words(
    nltk.corpus.brown.words(categories="romance"))

stop_words = nltk.corpus.stopwords.words('english')
# Frozen set for constant-time membership tests; stop_words itself stays a list.
stop_word_set = frozenset(stop_words)

# Discard bigrams containing a token that starts with punctuation or is a stop word.
finder.apply_word_filter(lambda w: w[0] in string.punctuation)
finder.apply_word_filter(lambda w: w.lower() in stop_word_set)

# NOTE(review): no apply_freq_filter() is applied, so bigrams seen only once can
# take the top PMI ranks; consider finder.apply_freq_filter(3) — TODO confirm intent.
finder.nbest(bigram_measures.pmi, 10)

[('A.M.', 'starring'),
 ('A40-AjK', 'Mercedes'),
 ('Air', 'Force'),
 ('Akita', 'prefectures'),
 ('Appian', 'Way'),
 ('Arc', 'de'),
 ('Armed', 'Forces'),
 ('Ash', 'Road'),
 ('Auto', 'Company'),
 ("Best's", 'Liliputian')]

In [4]:
# Count bigrams over the "news" category of the Brown corpus.
# The finder class is referenced by its qualified name so this cell does not
# depend on a wildcard import.
finder = nltk.collocations.BigramCollocationFinder.from_words(
    nltk.corpus.brown.words(categories="news"))

stop_words = nltk.corpus.stopwords.words('english')
# Frozen set for constant-time membership tests; stop_words itself stays a list.
stop_word_set = frozenset(stop_words)

# Discard bigrams containing a token that starts with punctuation or is a stop word.
finder.apply_word_filter(lambda w: w[0] in string.punctuation)
finder.apply_word_filter(lambda w: w.lower() in stop_word_set)

# NOTE(review): no apply_freq_filter() is applied, so bigrams seen only once can
# take the top chi-square ranks; consider finder.apply_freq_filter(3) — TODO confirm intent.
finder.nbest(bigram_measures.chi_sq, 20)

[('1,257,700', 'non-farm'),
 ('100-yard', 'dash'),
 ('1044', 'Chestnut'),
 ('11-7', 'collapse'),
 ('1200', 'Larimer'),
 ('13-5', 'barrage'),
 ('165-unit', 'Harbor'),
 ('1671', 'Nakoma'),
 ('182', 'scholastics'),
 ('2-and-2', 'pitches'),
 ('21-year', 'typhoon'),
 ('22-12', 'upset'),
 ('220-yard', 'par-3'),
 ('2269', 'Serra'),
 ('255', 'Brook'),
 ('2705', 'Fitzhugh'),
 ('2731', 'Pall'),
 ('3-year-old', 'filly'),
 ('325', 'crippled'),
 ('330', 'Woodland')]