In [2]:
import nltk

In [3]:
nltk.download('brown')

[nltk_data] Downloading package brown to /home/edmund/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [4]:
from nltk.corpus import brown

In [5]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [9]:
print('Count of words from Brown Corpus = ', 
      format(len(brown.words()), ',d'))

Count of words from Brown Corpus =  1,161,192


In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/edmund/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
input_sentence = "Seth and Becca love to run down to the playground. When the weather is nice they go there."

In [19]:
nltk.word_tokenize(input_sentence)

['Seth',
 'and',
 'Becca',
 'love',
 'to',
 'run',
 'down',
 'to',
 'the',
 'playground',
 '.',
 'When',
 'the',
 'weather',
 'is',
 'nice',
 'they',
 'go',
 'there',
 '.']

In [13]:
from nltk.tokenize import sent_tokenize

In [20]:
print(sent_tokenize(input_sentence))

['Seth and Becca love to run down to the playground.', 'When the weather is nice they go there.']


In [21]:
from nltk.probability import FreqDist

In [22]:
input_data = FreqDist(brown.words())
print(input_data)

<FreqDist with 56057 samples and 1161192 outcomes>


In [23]:
input_data.most_common(10)

[('the', 62713),
 (',', 58334),
 ('.', 49346),
 ('of', 36080),
 ('and', 27915),
 ('to', 25732),
 ('a', 21881),
 ('in', 19536),
 ('that', 10237),
 ('is', 10011)]

In [24]:
from nltk.stem import PorterStemmer 

In [25]:
word_stemmer = PorterStemmer()
word_stemmer.stem('fishing')

'fish'

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/edmund/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [27]:
from nltk.stem import WordNetLemmatizer
word_lemma = WordNetLemmatizer()
word_lemma.lemmatize('fishing')

'fishing'

In [30]:
short_list = brown.words()[:10]
for wrd in short_list:
    print('word =', wrd, ': stem =', word_stemmer.stem(wrd), 
          ': lemma =', word_lemma.lemmatize(wrd))

word = The : stem = the : lemma = The
word = Fulton : stem = fulton : lemma = Fulton
word = County : stem = counti : lemma = County
word = Grand : stem = grand : lemma = Grand
word = Jury : stem = juri : lemma = Jury
word = said : stem = said : lemma = said
word = Friday : stem = friday : lemma = Friday
word = an : stem = an : lemma = an
word = investigation : stem = investig : lemma = investigation
word = of : stem = of : lemma = of


In [31]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/edmund/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [38]:
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(input_sentence)
word_tokens

['Seth',
 'and',
 'Becca',
 'love',
 'to',
 'run',
 'down',
 'to',
 'the',
 'playground',
 '.',
 'When',
 'the',
 'weather',
 'is',
 'nice',
 'they',
 'go',
 'there',
 '.']

In [42]:
input_sentence.replace(".","").split(" ")

['Seth',
 'and',
 'Becca',
 'love',
 'to',
 'run',
 'down',
 'to',
 'the',
 'playground',
 'When',
 'the',
 'weather',
 'is',
 'nice',
 'they',
 'go',
 'there']

In [44]:
input_cleaned = [wrd for wrd in word_tokens if not wrd in stop_words]
input_cleaned

['Seth',
 'Becca',
 'love',
 'run',
 'playground',
 '.',
 'When',
 'weather',
 'nice',
 'go',
 '.']

In [45]:
input_cleaned2 = []
for wrd in word_tokens:
    if wrd not in stop_words:
        input_cleaned2.append(wrd)
        
print(input_cleaned2)

['Seth', 'Becca', 'love', 'run', 'playground', '.', 'When', 'weather', 'nice', 'go', '.']


In [46]:
nltk.download('names')

[nltk_data] Downloading package names to /home/edmund/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [47]:
from nltk.corpus import names

In [48]:
len(names.words('male.txt'))

2943

In [49]:
def gender_features(word):
    return { 'last_letter': word[-1]}

In [50]:
gender_features('Debra')

{'last_letter': 'a'}

In [55]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])
labeled_names[0:5]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male')]

In [53]:
import random

In [56]:
random.shuffle(labeled_names)
print(labeled_names[0:5])

[('Dorie', 'female'), ('Karin', 'female'), ('Verene', 'female'), ('Marty', 'female'), ('Ruddie', 'male')]


In [57]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
print(featuresets[0:5])

[({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'e'}, 'male')]


In [59]:
train_set, test_set = featuresets[500:], featuresets[:500]
print("Training obs count:",len(train_set),"Test obs count:",len(test_set))

Training obs count: 7444 Test obs count: 500


In [66]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Sarah'))

'female'