# NLTK

Descargar corpus y modelos.

In [1]:
import nltk
# Called without arguments, nltk.download() starts NLTK's interactive downloader.
# From there, install the 'gutenberg' corpus and the 'punkt' model
# (tokenizer and sentence segmenter).
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

Alternativamente:

In [2]:
import nltk
# Non-interactive alternative: request exactly the two resources by name.
# 'punkt' is the tokenizer / sentence-segmenter model.
nltk.download('punkt')
# 'gutenberg' is the text corpus used in the cells below.
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /home/jmperez/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/jmperez/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
from nltk.corpus import gutenberg
# List the file identifiers of the texts available in the Gutenberg corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# Sentences of 'austen-emma.txt'; each sentence is a list of word tokens.
gutenberg.sents('austen-emma.txt')

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...]

# Estadísticas Básicas

Versión básica con diccionarios:

In [5]:
# Basic word-frequency table built with a plain dict.
count = {}

for sentence in gutenberg.sents('austen-emma.txt'):
    for token in sentence:
        # dict.get supplies 0 the first time a token is seen.
        count[token] = count.get(token, 0) + 1
count

{'[': 2,
 'Emma': 865,
 'by': 558,
 'Jane': 301,
 'Austen': 1,
 '1816': 1,
 ']': 1,
 'VOLUME': 3,
 'I': 3178,
 'CHAPTER': 55,
 'Woodhouse': 313,
 ',': 11454,
 'handsome': 37,
 'clever': 27,
 'and': 4672,
 'rich': 14,
 'with': 1187,
 'a': 3004,
 'comfortable': 34,
 'home': 130,
 'happy': 122,
 'disposition': 24,
 'seemed': 141,
 'to': 5183,
 'unite': 3,
 'some': 248,
 'of': 4279,
 'the': 4844,
 'best': 85,
 'blessings': 6,
 'existence': 8,
 ';': 2199,
 'had': 1606,
 'lived': 25,
 'nearly': 14,
 'twenty': 30,
 '-': 574,
 'one': 413,
 'years': 57,
 'in': 2118,
 'world': 81,
 'very': 1151,
 'little': 354,
 'distress': 19,
 'or': 490,
 'vex': 1,
 'her': 2381,
 '.': 6928,
 'She': 562,
 'was': 2385,
 'youngest': 4,
 'two': 171,
 'daughters': 7,
 'most': 243,
 'affectionate': 9,
 'indulgent': 2,
 'father': 207,
 'consequence': 27,
 'sister': 33,
 "'": 1007,
 's': 933,
 'marriage': 34,
 'been': 759,
 'mistress': 11,
 'his': 1088,
 'house': 95,
 'from': 535,
 'early': 40,
 'period': 18,
 'Her': 

Versión mejorada con defaultdicts:

In [6]:
from collections import defaultdict

# defaultdict(int) yields 0 for unseen keys, so no membership test is needed.
count = defaultdict(int)

# Flatten the corpus into a single stream of tokens, sentence by sentence.
emma_tokens = (
    token
    for sentence in gutenberg.sents('austen-emma.txt')
    for token in sentence
)
for token in emma_tokens:
    count[token] += 1

Mejor aún: usar FreqDist

In [17]:
sents = gutenberg.sents('austen-emma.txt')

# Frequency distribution over every token of the text.
freqs = nltk.FreqDist()

for sent in sents:
    # update() adds this sentence's tokens to the running counts in place,
    # so no temporary FreqDist has to be built and merged per sentence.
    freqs.update(sent)

In [19]:
# The ten most frequent tokens with their counts, highest count first.
freqs.most_common(10)

[(',', 11454),
 ('.', 6928),
 ('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('I', 3178),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381)]

In [23]:
# Manual ranking: order the (token, frequency) pairs from most to least frequent.
# sorted() is stable, so tokens with equal counts keep their insertion order.
ranked = sorted(count.items(), key=lambda pair: pair[1], reverse=True)
top_ten = ranked[:10]

print('10 palabras más frecuentes:', top_ten)
# Vocabulary size = number of distinct tokens.
print('Vocabulario:', len(count))
# Token count = sum of all individual frequencies.
print('Tokens:', sum(count.values()))

10 palabras más frecuentes: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulario: 7806
Tokens: 192484


Versión usando clase Counter:

In [20]:
from collections import Counter

# Counter consumes any iterable of hashable items, so it can be built directly
# from the flattened stream of tokens.
count = Counter(
    token
    for sentence in gutenberg.sents('austen-emma.txt')
    for token in sentence
)

In [21]:
# Ask Counter for just the top 10 instead of ranking the whole vocabulary
# and slicing the result afterwards; the pairs returned are the same.
print('10 palabras más frecuentes:', count.most_common(10))
# Vocabulary size = number of distinct tokens.
print('Vocabulario:', len(count))
# Token count = sum of all individual frequencies.
print('Tokens:', sum(count.values()))

10 palabras más frecuentes: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulario: 7806
Tokens: 192484


# Corpus de Texto Plano

- http://www.nltk.org/api/nltk.corpus.reader.html#nltk.corpus.reader.plaintext.PlaintextCorpusReader
- http://www.nltk.org/book/ch02.html

Primero crear archivo example.txt: "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."

In [22]:
from nltk.corpus import PlaintextCorpusReader

# Print the class documentation, including the constructor's tokenizer parameters.
help(PlaintextCorpusReader)

Help on class PlaintextCorpusReader in module nltk.corpus.reader.plaintext:

class PlaintextCorpusReader(nltk.corpus.reader.api.CorpusReader)
 |  Reader for corpora that consist of plaintext documents.  Paragraphs
 |  are assumed to be split using blank lines.  Sentences and words can
 |  be tokenized using the default tokenizers, or by custom tokenizers
 |  specificed as parameters to the constructor.
 |  
 |  This corpus reader can be customized (e.g., to skip preface
 |  sections of specific document formats) by creating a subclass and
 |  overriding the ``CorpusView`` class variable.
 |  
 |  Method resolution order:
 |      PlaintextCorpusReader
 |      nltk.corpus.reader.api.CorpusReader
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>), sent_tokenizer=<nltk.tokenize.punkt.PunktSentenceTokenizer ob

In [25]:
# Read example.txt from the current directory; no tokenizers are passed,
# so the reader's defaults are used.
corpus = PlaintextCorpusReader('.', 'example.txt')

In [26]:
# Materialize the corpus as a list of sentences, each one a list of word tokens.
list(corpus.sents())

[['hola',
  'soy',
  'el',
  'sapo',
  'pepe',
  'soy',
  'pepe',
  'y',
  'me',
  'gusta',
  'comer',
  'moscas',
  'soy',
  'pepe',
  'pero',
  'no',
  'me',
  'gusta',
  'comer',
  'sapos',
  'soy',
  'un',
  'sapo',
  'manco',
  'y',
  'me',
  'quedo',
  'dormido']]

# Tokenización

- http://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.regexp.RegexpTokenizer
- http://www.nltk.org/book/ch03.html#regular-expressions-for-tokenizing-text

De la documentación de NLTK obtenemos una expresión regular para tokenizar:

In [27]:
# Tokenizer regex adapted from the NLTK book (chapter 3), with one fix:
# in the last character class the hyphen is placed at the end so it is a
# literal '-'. Written as ':-_' it formed a range from ':' to '_', which left a
# lone '-' untokenized and accidentally matched '<', '=', '>', '@', '\' and '^'.
# The order of the alternatives matters: the first one that matches wins.
pattern = r'''(?x)    # set flag to allow verbose regexps
     (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
'''

Lo probamos:

In [28]:
from nltk.tokenize import RegexpTokenizer
# Build a regex-based word tokenizer from `pattern`.
tokenizer = RegexpTokenizer(pattern)

# Re-read example.txt, this time splitting words with the custom tokenizer;
# no sentence tokenizer is passed, so that part keeps the reader's default.
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())

[['hola',
  'soy',
  'el',
  'sapo',
  'pepe',
  'soy',
  'pepe',
  'y',
  'me',
  'gusta',
  'comer',
  'moscas',
  'soy',
  'pepe',
  'pero',
  'no',
  'me',
  'gusta',
  'comer',
  'sapos',
  'soy',
  'un',
  'sapo',
  'manco',
  'y',
  'me',
  'quedo',
  'dormido']]

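Vemos que tokeniza mal todas las abreviaciones y el número "21.234".
(Nota: las salidas mostradas en este notebook corresponden a otro contenido de example.txt, no al texto indicado más arriba.)
Mejoramos la expresión regular y probamos: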

In [29]:
# Improved tokenizer regex: thousands-separated numbers and a few Spanish
# abbreviations are tried first, so they win over the generic alternatives.
# In the last character class the hyphen is placed at the end so it is a
# literal '-'. Written as ':-_' it formed a range from ':' to '_', which left a
# lone '-' untokenized and accidentally matched '<', '=', '>', '@', '\' and '^'.
pattern = r'''(?x)    # set flag to allow verbose regexps
   (?:\d{1,3}(?:\.\d{3})+)  # numbers with '.' in the middle
   | (?:[Ss]r\.|[Ss]ra\.|art\.)  # common spanish abbreviations
   | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
'''
from nltk.tokenize import RegexpTokenizer
# Rebuild the word tokenizer from the improved `pattern`.
tokenizer = RegexpTokenizer(pattern)

# Re-read example.txt with the new word tokenizer and show the resulting sentences.
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())

[['hola',
  'soy',
  'el',
  'sapo',
  'pepe',
  'soy',
  'pepe',
  'y',
  'me',
  'gusta',
  'comer',
  'moscas',
  'soy',
  'pepe',
  'pero',
  'no',
  'me',
  'gusta',
  'comer',
  'sapos',
  'soy',
  'un',
  'sapo',
  'manco',
  'y',
  'me',
  'quedo',
  'dormido']]

¡Ahora tokeniza bien!

(La segmentación en oraciones sigue estando mal, pero resolver eso queda fuera de esta clase.)