In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [5]:
example_text = "Hello redhatter, How are you? I'm fine. What about you? :)"

In [6]:
sent_tokenize(example_text)

['Hello redhatter, How are you?', "I'm fine.", 'What about you?', ':)']

In [7]:
word_tokenize(example_text)

['Hello',
 'redhatter',
 ',',
 'How',
 'are',
 'you',
 '?',
 'I',
 "'m",
 'fine',
 '.',
 'What',
 'about',
 'you',
 '?',
 ':',
 ')']

# Stop Words

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [10]:
ex = "Hey Thinkpad! You look so cool."

In [11]:
stop_word = set(stopwords.words("english"))

In [13]:
words = word_tokenize(ex)

In [19]:
filter_sentence = [w for w in words if w not in stop_word]

In [17]:
for w in words:
    if w not in stop_word:
        filter_sentence.append(w)

In [20]:
filter_sentence

['Hey', 'Thinkpad', '!', 'You', 'look', 'cool', '.']

# Stemming

In [24]:
from nltk.stem import PorterStemmer

In [22]:
from nltk.tokenize import word_tokenize

In [26]:
ps = PorterStemmer()

In [27]:
ex = ["python","pythoner","root","rootly","rooted"]

In [33]:
for i in ex:
    print(ps.stem(i))

python
python
root
rootli
root


In [47]:
ex1 = "it is very important for a Redhatly while you are using redhatting redhat server in back. All redhatter are super cool."

In [48]:
print([ps.stem(i) for i in word_tokenize(ex1)])

['it', 'is', 'veri', 'import', 'for', 'a', 'redhatli', 'while', 'you', 'are', 'use', 'redhat', 'redhat', 'server', 'in', 'back', '.', 'all', 'redhatt', 'are', 'super', 'cool', '.']


# Speech tagging

In [49]:
import nltk

In [50]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [53]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [54]:
custom_sent_token = PunktSentenceTokenizer(train_text)

In [55]:
token = custom_sent_token.tokenize(sample_text)

In [56]:
def process_content():
    try:
        for i in token:
            word = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(word)
            print(tagged)
    except:
        print(":(")
    

# Chunking

In [66]:
def process_content_chunk():
    
    for i in token:
        word = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(word)

        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?} """

        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)

#         chunked.draw()

In [67]:
process_content_chunk()

KeyboardInterrupt: 

# Chinking

In [None]:
def process_content_chink():
    
    for i in token:
        word = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(word)

        chunkGram = r"""Chunk: {<.*>+} }<VB.?|IN|DT|TO>+{ """

        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)


# Lemmatizer

In [68]:
from nltk.stem import WordNetLemmatizer

In [70]:
lemmatizer = WordNetLemmatizer()

In [71]:
print(lemmatizer.lemmatize("better",pos="a"))

good


# Corpus

In [73]:
from nltk.corpus import gutenberg

In [75]:
sample = gutenberg.raw("bible-kjv.txt")

# Word net

In [76]:
from nltk.corpus import wordnet

In [77]:
syn = wordnet.synsets("program")

In [78]:
syn

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [79]:
syn[0]

Synset('plan.n.01')

In [80]:
syn[0].lemmas()

[Lemma('plan.n.01.plan'),
 Lemma('plan.n.01.program'),
 Lemma('plan.n.01.programme')]

In [83]:
syn[0].lemmas()[0].name()

'plan'

In [84]:
syn[0].definition()

'a series of steps to be carried out or goals to be accomplished'

In [85]:
syn[0].examples()

['they drew up a six-step plan', 'they discussed plans for a new bond issue']

In [86]:
syno,anto = [],[]

In [89]:
for syn in wordnet.synsets("good"):
    for i in syn.lemmas():
        syno.append(i.name())
        if i.antonyms():
            anto.append(i.antonyms()[0].name())

In [91]:
set(syno),set(anto)

({'adept',
  'beneficial',
  'commodity',
  'dear',
  'dependable',
  'effective',
  'estimable',
  'expert',
  'full',
  'good',
  'goodness',
  'honest',
  'honorable',
  'in_effect',
  'in_force',
  'just',
  'near',
  'practiced',
  'proficient',
  'respectable',
  'right',
  'ripe',
  'safe',
  'salutary',
  'secure',
  'serious',
  'skilful',
  'skillful',
  'sound',
  'soundly',
  'thoroughly',
  'trade_good',
  'undecomposed',
  'unspoiled',
  'unspoilt',
  'upright',
  'well'},
 {'bad', 'badness', 'evil', 'evilness', 'ill'})

In [92]:
word1 = wordnet.synset("ship.n.01")
word2 = wordnet.synset("boat.n.01")

In [95]:
print(word1.wup_similarity(word2))

0.9090909090909091


In [96]:
word1 = wordnet.synset("ship.n.01")
word2 = wordnet.synset("car.n.01")
print(word1.wup_similarity(word2))

0.6956521739130435


# Text Classification

In [98]:
import nltk
import random
from nltk.corpus import movie_reviews

In [101]:
document = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

In [103]:
random.shuffle(document)

In [113]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

In [114]:
word_feature = list(all_words.keys())[:3000]

In [120]:
def find_feature(doc):
    words = set(doc)
    feature = {}
    for w in word_feature:
        feature[w] = (w in words)
        
    return feature

In [122]:
find_feature(movie_reviews.words("neg/cv000_29416.txt"))

In [124]:
featuresets = [(find_feature(rev),category) for (rev,category) in document]

In [126]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

In [130]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [131]:
nltk.classify.accuracy(classifier,testing_set)*100

87.0

In [132]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
                  turkey = True              neg : pos    =      8.5 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
                  temper = True              pos : neg    =      7.6 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
               atrocious = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0

# Scikit learn

In [133]:
from nltk.classify.scikitlearn import SklearnClassifier

In [142]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [144]:
from sklearn.svm import NuSVC

In [139]:
MNB = SklearnClassifier(MultinomialNB())
MNB.train(training_set)
nltk.classify.accuracy(MNB,testing_set)*100

85.0

In [140]:
MNB = SklearnClassifier(GaussianNB())
MNB.train(training_set)
nltk.classify.accuracy(MNB,testing_set)*100

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [141]:
MNB = SklearnClassifier(BernoulliNB())
MNB.train(training_set)
nltk.classify.accuracy(MNB,testing_set)*100

86.0

In [145]:
MNB = SklearnClassifier(NuSVC())
MNB.train(training_set)
nltk.classify.accuracy(MNB,testing_set)*100

83.0