<a href="https://colab.research.google.com/github/gksthdals/NLTK/blob/main/06.%20Learning_to_Classify_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
""" Main Topic """

# How can we identify particular features of language data that are salient for classifying it?
# How can we construct models of language that can be used to perform language processing tasks automatically?
# What can we learn about language from these models?

## 1. Supervised Classification

### Gender Identification

In [2]:
def gender_features(word):
  return {'last_letter': word[-1]}

gender_features('Shrek')

{'last_letter': 'k'}

In [4]:
import nltk
nltk.download('names')

from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')]) + ([(name, 'female') for name in names.words('female.txt')])

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [5]:
import random

random.shuffle(labeled_names)

In [6]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
classifier.classify(gender_features('Neo'))

'male'

In [8]:
classifier.classify(gender_features('Trinity'))

'female'

In [9]:
nltk.classify.accuracy(classifier, test_set)

0.782

In [10]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     36.9 : 1.0
             last_letter = 'k'              male : female =     32.2 : 1.0
             last_letter = 'f'              male : female =     25.4 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


In [11]:
# 데이터의 양이 많을 때

from nltk.classify import apply_features

train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

### Choosing The Right Features

In [12]:
def gender_features2(name):
  features = {}
  features["first_letter"] = name[0].lower()
  features["last_letter"] = name[-1].lower()

  for letter in 'abcdefghijklmnopqrstuvwxyz':
    features["count({})".format(letter)] = name.lower().count(letter)
    features["has({})".format(letter)] = (letter in name.lower())
  
  return features

In [13]:
gender_features2('John')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [14]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.788

In [15]:
# Development Set
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500] # for error analysis

# Test set
test_names = labeled_names[:500]

In [16]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.759

In [17]:
errors = []
for (name, tag) in devtest_names:
  guess = classifier.classify(gender_features(name))
  if guess != tag:
    errors.append((tag, guess, name))

In [18]:
len(errors)

241

In [19]:
for (tag, guess, name) in sorted(errors)[:10]:
  print('correct={:<8s} guess={:<8s} name={:<30s}'.format(tag, guess, name))

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ardis                         


In [20]:
def gender_features(word):
  return {'suffix1': word[-1:],
          'suffix2': word[-2:]}

In [21]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.764

### Document Classification

In [22]:
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [23]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
  document_words = set(document)
  features = {}
  for word in word_features:
    features['contain({})'.format(word)] = (word in document_words)
  
  return features

In [24]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contain(plot)': True,
 'contain(:)': True,
 'contain(two)': True,
 'contain(teen)': False,
 'contain(couples)': False,
 'contain(go)': False,
 'contain(to)': True,
 'contain(a)': True,
 'contain(church)': False,
 'contain(party)': False,
 'contain(,)': True,
 'contain(drink)': False,
 'contain(and)': True,
 'contain(then)': True,
 'contain(drive)': False,
 'contain(.)': True,
 'contain(they)': True,
 'contain(get)': True,
 'contain(into)': True,
 'contain(an)': True,
 'contain(accident)': False,
 'contain(one)': True,
 'contain(of)': True,
 'contain(the)': True,
 'contain(guys)': False,
 'contain(dies)': False,
 'contain(but)': True,
 'contain(his)': True,
 'contain(girlfriend)': True,
 'contain(continues)': False,
 'contain(see)': False,
 'contain(him)': True,
 'contain(in)': True,
 'contain(her)': False,
 'contain(life)': False,
 'contain(has)': True,
 'contain(nightmares)': False,
 'contain(what)': True,
 "contain(')": True,
 'contain(s)': True,
 'contain(deal)': False,
 'contain(

In [26]:
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [27]:
nltk.classify.accuracy(classifier, test_set)

0.77

In [28]:
classifier.show_most_informative_features(5)

Most Informative Features
  contain(unimaginative) = True              neg : pos    =      8.2 : 1.0
     contain(schumacher) = True              neg : pos    =      7.3 : 1.0
           contain(mena) = True              neg : pos    =      6.9 : 1.0
         contain(shoddy) = True              neg : pos    =      6.9 : 1.0
         contain(suvari) = True              neg : pos    =      6.9 : 1.0


### Part-of-Speech Tagging

In [29]:
nltk.download('brown')
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
  word = word.lower()
  suffix_fdist[word[-1:]] += 1
  suffix_fdist[word[-2:]] += 1
  suffix_fdist[word[-3:]] += 1

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
common_suffixes

In [31]:
def pos_features(word):
  features = {}
  for suffix in common_suffixes:
    features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
  return features

In [32]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

In [33]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [34]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [35]:
classifier.classify(pos_features('cats'))

'NNS'

In [36]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



### Exploiting Context

In [37]:
def pos_features(sentence, i):
  features = {"suffix(1)": sentence[i][-1:],
              "suffix(2)": sentence[i][-2:],
              "suffix(3)": sentence[i][-3:]}
  
  if i == 0:
    features["prev-word"] = "<START>"
  else:
    features["prev-word"] = sentence[i-1]
  
  return features

In [38]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [40]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
  untagged_sent = nltk.tag.untag(tagged_sent)
  for i, (word, tag) in enumerate(tagged_sent):
    featuresets.append((pos_features(untagged_sent, i), tag))

In [41]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [42]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

### Sequence Classification

In [43]:
def pos_features(sentence, i, history):
  features = {"suffix(1)": sentence[i][-1:],
              "suffix(2)": sentence[i][-2:],
              "suffix(3)": sentence[i][-3:]}
  
  if i == 0:
    features["prev-word"] = "<START>"
    features["prev-tag"] = "<START>"
  
  else:
    features["prev-word"] = sentence[i-1]
    features["prev-tag"] = history[i-1]
  
  return features

In [45]:
class ConsecutivePosTagger(nltk.TaggerI):

  def __init__(self, train_sents):
    train_set = []
    for tagged_sent in train_sents:
      untagged_sent = nltk.tag.untag(tagged_sent)
      history = []
      for i, (word, tag) in enumerate(tagged_sent):
        featureset = pos_features(untagged_sent, i, history)
        train_set.append((featureset, tag))
        history.append(tag)
    
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)

  def tag(self, sentence):
    history = []
    for i, word in enumerate(sentence):
      featureset = pos_features(sentence, i, history)
      tag = self.classifier.classify(featureset)
      history.append(tag)
    return zip(sentence, history)

In [46]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)

train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)

tagger.evaluate(test_sents)

0.7980528511821975

### Other Methods for Sequence Classification

## 2. Further Examples of Supervised Classification

### Sentence Segmentation

In [49]:
nltk.download('treebank')
nltk.download('punkt')
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
  tokens.extend(sent)
  offset += len(sent)
  boundaries.add(offset-1)

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [52]:
def punct_features(tokens, i):
  return {'next-word-capitalized': tokens[i+1][0].isupper(),
          'prev-word': tokens[i-1].lower(),
          'punct': tokens[i],
          'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [53]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [54]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [55]:
def segment_sentences(words):
  start = 0
  sents = []
  for i, word in enumerate(words):
    if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
      sents.append(words[start:i+1])
      start = i+1
  
  if start < len(words):
    sents.append(words[start:])
  return sents

### Identifying Dialogue Act Types

In [56]:
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


In [57]:
def dialogue_act_features(post):
  features = {}
  for word in nltk.word_tokenize(post):
    features['contain({})'.format(word.lower())] = True

  return features

In [58]:
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.668

### Recognizing Textual Entailment

In [None]:
"""
It should be emphasized that the relationship between text and hypothesis is not intended to be logical entailment, 
but rather whether a human would conclude that the text provides reasonable evidence for taking the hypothesis to be true.

"""

In [59]:
def rte_features(rtepair):
  extractor = nltk.RTEFeatureExtractor(rtepair)
  features = {}
  features['word_overlap'] = len(extractor.overlap('word'))
  features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
  features['ne_overlap'] = len(extractor.overlap('ne'))
  features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))

  return features

In [61]:
nltk.download('rte')
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)

[nltk_data] Downloading package rte to /root/nltk_data...
[nltk_data]   Unzipping corpora/rte.zip.


In [62]:
extractor.text_words

{'Asia',
 'China',
 'Co',
 'Davudi',
 'Iran',
 'Organisation',
 'Parviz',
 'Russia',
 'SCO',
 'Shanghai',
 'Soviet',
 'association',
 'at',
 'binds',
 'central',
 'fight',
 'fledgling',
 'former',
 'four',
 'meeting',
 'operation',
 'representing',
 'republics',
 'terrorism.',
 'that',
 'together',
 'was'}

In [63]:
extractor.hyp_words

{'China', 'SCO.', 'member'}

In [64]:
extractor.overlap('word')

set()

In [65]:
extractor.overlap('ne')

{'China'}

In [66]:
extractor.hyp_extra('word')

{'member'}

### Scaling Up to Large Datasets

## 3. Evaluation

### The Test Set

In [67]:
import random
from nltk.corpus import brown

tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [68]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [69]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

### Accuracy

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set)))

### Precision and Recall

In [None]:
""" 
True positives are relevant items that we correctly identified as relevant.
True negatives are irrelevant items that we correctly identified as irrelevant.
False positives (or Type I errors) are irrelevant items that we incorrectly identified as relevant.
False negatives (or Type II errors) are relevant items that we incorrectly identified as irrelevant.

"""

In [None]:
"""
Precision, which indicates how many of the items that we identified were relevant, is TP/(TP+FP).
Recall, which indicates how many of the relevant items that we identified, is TP/(TP+FN).

The F-Measure (or F-Score), which combines the precision and recall to give a single score, 
is defined to be the harmonic mean of the precision and recall: (2 × Precision × Recall) / (Precision + Recall).

"""

### Confusion Matrices

In [72]:
def tag_list(tagged_sents):
  return [tag for sent in tagged_sents for (word, tag) in sent]

def apply_tagger(tagger, corpus):
  return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))

cm = nltk.ConfusionMatrix(gold, test)
cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9)

NameError: ignored

### Cross-Validation

In [None]:
"""
One solution to this problem is to perform multiple evaluations on different test sets,
then to combine the scores from those evaluations, a technique known as cross-validation.

"""

## 4. Decision Trees

### Entropy and Information Gain

In [1]:
import nltk
import math

def entropy(labels):
  freqdist = nltk.FreqDist(labels)
  probs = [freqdist.freq(l) for l in freqdist]
  return -sum(p * math.log(p, 2) for p in probs)

In [2]:
entropy(['male', 'male', 'male', 'male'])

-0.0

In [3]:
entropy(['female', 'male', 'male', 'male'])

0.8112781244591328

In [4]:
entropy(['female', 'female', 'male', 'male'])

1.0

In [5]:
entropy(['female', 'female', 'female', 'male'])

0.8112781244591328

In [6]:
entropy(['female', 'female', 'female', 'female'])

-0.0

In [None]:
# Decision trees are especially well suited to cases where many hierarchical categorical distinctions can be made.

# The fact that decision trees require that features be checked in a specific order limits their ability to exploit features that are relatively independent of one another. 
# The naive Bayes classification method, which we'll discuss next, overcomes this limitation by allowing all features to act "in parallel."

## 5. Naive Bayes Classifiers

### Underlying Probabilistic Model

In [None]:
# P(label) is the prior probability for a given label, and each P(f|label) is the contribution of a single feature to the label likelihood.

### Zero Counts and Smoothing

In [None]:
# P(f|label) = count(f, label) / count(label)

# However, this simple approach can become problematic when a feature never occurs with a given label in the training set.
# The nltk.probability module provides support for a wide variety of smoothing techniques.

### Non-Binary Features

In [None]:
# Numeric features can be converted to binary features by binning, which replaces them with features such as "4<x<6".

### The Naivete of Independence

In [None]:
# The reason that naive Bayes classifiers are called "naive" is that it's unreasonable to assume that all features are independent of one another (given the label).

# One problem that arises is that the classifier can end up "double-counting" the effect of highly correlated features, 
# pushing the classifier closer to a given label than is justified.

### The Cause of Double-Counting

In [None]:
# However, in the next section, we'll look at a classifier that considers the possible interactions between these parameters when choosing their values.

## 6. Maximum Entropy Classifiers

In [None]:
# Therefore, Maximum Entropy classifiers choose the model parameters using iterative optimization techniques, which initialize the model's parameters to random values, 
# and then repeatedly refine those parameters to bring them closer to the optimal solution. 

### The Maximum Entropy Model

In [None]:
# The Maximum Entropy classifier model leaves it up to the user to decide what combinations of labels and features should receive their own parameters.
# Each combination of labels and features that receives its own parameter is called a joint-feature.

### Maximizing Entropy

In [None]:
# In general, the Maximum Entropy principle states that, 
# among the distributions that are consistent with what we know, 
# we should choose the distribution whose entropy is highest.

# Throughout this example, we have restricted ourselves to distributions that are consistent with what we know; among these, 
# we chose the distribution with the highest entropy.

### Generative vs Conditional Classifiers

## 7. Modeling Linguistic Patterns

In [None]:
# The models help us to understand linguistic patterns, and they can be used to make predictions about new language data.

### What do models tell us?

In [None]:
# Descriptive models capture patterns in the data but they don't provide any information about why the data contains those patterns.
# In contrast, explanatory models attempt to capture properties and relationships that cause the linguistic patterns.

# In summary, descriptive models provide information about correlations in the data, while explanatory models go further to postulate causal relationships.