In [90]:
%pylab inline
%precision 6

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


'%.6f'

In [91]:
import pandas as pd
import sklearn as skl
import nltk

In [92]:
pd.options.display.max_colwidth=100
np.set_printoptions(linewidth=140,edgeitems=10)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
rcParams['figure.figsize'] = (8.0, 5.0)

# Classification

In [93]:
from nltk.corpus import movie_reviews
movie_reviews.categories()

['neg', 'pos']

In [94]:
len( movie_reviews.fileids() )

2000

In [95]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
np.random.seed(0)
random.shuffle(documents)

In [96]:
documents[0][1]

'neg'

In [97]:
documents[0][0]

['arnold',
 'schwarzenegger',
 'has',
 'been',
 'an',
 'icon',
 'for',
 'action',
 'enthusiasts',
 ',',
 'since',
 'the',
 'late',
 '80',
 "'",
 's',
 ',',
 'but',
 'lately',
 'his',
 'films',
 'have',
 'been',
 'very',
 'sloppy',
 'and',
 'the',
 'one',
 '-',
 'liners',
 'are',
 'getting',
 'worse',
 '.',
 'it',
 "'",
 's',
 'hard',
 'seeing',
 'arnold',
 'as',
 'mr',
 '.',
 'freeze',
 'in',
 'batman',
 'and',
 'robin',
 ',',
 'especially',
 'when',
 'he',
 'says',
 'tons',
 'of',
 'ice',
 'jokes',
 ',',
 'but',
 'hey',
 'he',
 'got',
 '15',
 'million',
 ',',
 'what',
 "'",
 's',
 'it',
 'matter',
 'to',
 'him',
 '?',
 'once',
 'again',
 'arnold',
 'has',
 'signed',
 'to',
 'do',
 'another',
 'expensive',
 'blockbuster',
 ',',
 'that',
 'can',
 "'",
 't',
 'compare',
 'with',
 'the',
 'likes',
 'of',
 'the',
 'terminator',
 'series',
 ',',
 'true',
 'lies',
 'and',
 'even',
 'eraser',
 '.',
 'in',
 'this',
 'so',
 'called',
 'dark',
 'thriller',
 ',',
 'the',
 'devil',
 '(',
 'gabriel

In [98]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [word for (word,count) in all_words.most_common(2000)]

In [99]:
word_features

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'his',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'her',
 'all',
 '?',
 'there',
 'like',
 'so',
 'out',
 'about',
 'up',
 'more',
 'what',
 'when',
 'which',
 'or',
 'she',
 'their',
 ':',
 'some',
 'just',
 'can',
 'if',
 'we',
 'him',
 'into',
 'even',
 'only',
 'than',
 'no',
 'good',
 'time',
 'most',
 'its',
 'will',
 'story',
 'would',
 'been',
 'much',
 'character',
 'also',
 'get',
 'other',
 'do',
 'two',
 'well',
 'them',
 'very',
 'characters',
 ';',
 'first',
 '--',
 'after',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'any',
 'does',
 'really',
 'had',
 'while',
 'films',
 'how',
 'plot',
 'little',
 'where',
 'people',
 'over',
 'could',
 'then',
 'me',
 'scene',
 'man',
 'bad',
 '

In [100]:
def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [101]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(dramatic)': False, 'contains(devil)': False, 'contains(forces)': False, 'contains(ride)': False, 'contains(sent)': False, 'contains(artist)': False, 'contains(set)': False, 'contains(never)': True, 'contains(thanks)': False, 'contains(wife)': True, 'contains(later)': False, 'contains(bloody)': False, 'contains(walk)': False, 'contains(sub)': False, 'contains(gone)': False, 'contains(thomas)': False, 'contains(probably)': False, 'contains(fault)': False, 'contains(taken)': False, 'contains(complex)': False, 'contains(include)': False, 'contains(for)': True, 'contains(stupid)': False, 'contains(use)': False, 'contains(spice)': False, 'contains(harry)': False, 'contains(evil)': False, 'contains(look)': True, 'contains(truly)': False, 'contains(television)': False, 'contains(hill)': False, 'contains(changed)': False, 'contains(feature)': False, 'contains(faces)': False, 'contains(wonderfully)': False, 'contains(cinematic)': False, 'contains(damon)': False, 'contains(explain)': F

In [103]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [104]:
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [105]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.3 : 1.0
         contains(damon) = True              pos : neg    =     11.5 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
          contains(lame) = True              neg : pos    =      6.1 : 1.0


Quality metrics: 
accuracy

It is misleading in web search, where most documents are irrelevant. 
Classifier, always labelling document as irrelevant gives almost 100% accuracy.

* TP, FP, TN, FN
* Precision = TP/(labelled Positive) = TP/(TP+FP)
* Recall = TP/(really Positive) = TP/(TP+FN)
* F-Measure (or F-Score) - harmonic mean of the precision and recall: (2 × Precision × Recall) / (Precision + Recall).

confusion matrix  

cross-Validation
 * more economic use of data
 * allows to see how performance varies with training set

# Part of speech tagging

In [106]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [108]:
suffix_fdist['es']

22408

In [109]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [110]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [112]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [113]:
np.random.shuffle(featuresets)

In [114]:
train_set, test_set = featuresets[:1000], featuresets[1000:2000]

In [115]:
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [116]:
nltk.classify.accuracy(classifier, test_set)

0.570000

In [117]:
pos_features('cats')

{"endswith('')": False,
 "endswith(')": False,
 "endswith('s)": False,
 'endswith(()': False,
 'endswith())': False,
 'endswith(,)': False,
 'endswith(--)': False,
 'endswith(.)': False,
 'endswith(:)': False,
 'endswith(;)': False,
 'endswith(?)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(a)': False,
 'endswith(ad)': False,
 'endswith(al)': False,
 'endswith(an)': False,
 'endswith(and)': False,
 'endswith(are)': False,
 'endswith(as)': False,
 'endswith(at)': False,
 'endswith(ay)': False,
 'endswith(be)': False,
 'endswith(by)': False,
 'endswith(c)': False,
 'endswith(ce)': False,
 'endswith(ch)': False,
 'endswith(d)': False,
 'endswith(e)': False,
 'endswith(ed)': False,
 'endswith(en)': False,
 'endswith(ent)': False,
 'endswith(er)': False,
 'endswith(ere)': False,
 'endswith(ers)': False,
 'endswith(es)': False,
 'endswith(ey)': False,
 'endswith(f)': False,
 'endswith(for)': False,
 'endswith(g)': False,
 'endswith(h)': False,
 'endswith(had)': False,
 

In [118]:
classifier.classify(pos_features('cats'))

'NNS'

In [119]:
classifier.classify(pos_features('walked'))

'VBD'

In [120]:
print(classifier.pseudocode(depth=4))

if endswith(he) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return 'VBD'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(was) == False: return 'NNS-TL'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(he) == True: 
  if endswith(the) == False: return 'PPS'
  if endswith(the) == True: return 'AT'



In [121]:
def pos_features(sentence, i): 
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [122]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [123]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
         featuresets.append( (pos_features(untagged_sent, i), tag) )

size = int(len(featuresets) * 0.5)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.757464

greedy sequence classification

In [124]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [129]:
set([post.get('class') for post in posts])

{'Accept',
 'Bye',
 'Clarify',
 'Continuer',
 'Emotion',
 'Emphasis',
 'Greet',
 'Other',
 'Reject',
 'Statement',
 'System',
 'nAnswer',
 'whQuestion',
 'yAnswer',
 'ynQuestion'}

In [125]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return feature

In [126]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
                    for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

NameError: name 'feature' is not defined

# Dialog act types

In [None]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [None]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [None]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
set([cl for (post,cl) in featuresets])

# Textual Entailment

Challenge 3, Pair 34 (True)

T: Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.

H: China is a member of SCO.

Challenge 3, Pair 81 (False)

T: According to NC Articles of Organization, the members of LLC company are H. Nelson Beavers, III, H. Chester Beavers and Jennie Beavers Stewart.

H: Jennie Beavers Stewart is a share-holder of Carolina Analytical Laboratory.