# Gender Classification 

In [1]:
import nltk
nltk.download("names")
from nltk.corpus import names

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [2]:
m = names.words('male.txt')
f = names.words('female.txt')

In [3]:
import random
random.seed(1234) # Set the seed to facilitate replicability
# Keep the labelled data under its own name so that the `names` corpus reader
# imported earlier is not overwritten; otherwise the cell that calls
# `names.words(...)` can no longer be re-run after this one.
labeled_names = ([(name, 'male') for name in m] +
                 [(name, 'female') for name in f])
random.shuffle(labeled_names)
# Partition the shuffled list: the first 500 items are held out for testing,
# the next 1000 are used for development testing, and the rest for training.
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [4]:
def one_hot_character(c):
    """Return the one-hot encoding of a letter as a list of 26 integers.

    The position of the lower-cased letter in the English alphabet is set
    to 1 and every other position is 0. If `c` is not exactly one letter of
    that alphabet, a warning is printed and an all-zero list is returned.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    result = [0]*len(alphabet)
    letter = c.lower()
    # str.find/str.index search for substrings, so '' or 'ab' would be
    # "found" at position 0; only accept a single character.
    position = alphabet.find(letter) if len(letter) == 1 else -1
    if position >= 0:
        result[position] = 1
    else:
        print("Warning: unable to encode character '%s'" % c)
    return result

In [5]:
one_hot_character('a')

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [6]:
one_hot_character('c')

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [7]:
def gender_features(word):
    """Encode a name by its final two characters.

    Returns the one-hot vector of the second-to-last character followed by
    the one-hot vector of the last character, as one concatenated list.
    """
    # Encode the last character first, then the one before it.
    encoded_last, encoded_penultimate = [one_hot_character(word[offset])
                                         for offset in (-1, -2)]
    return encoded_penultimate + encoded_last

In [8]:
gender_features('Mary')

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0]

In [9]:
from sklearn.naive_bayes import MultinomialNB
# Build (feature vector, label) pairs for each partition of the names data.
train_set = [(gender_features(n), g) for n, g in train_names]
devtest_set = [(gender_features(n), g) for n, g in devtest_names]
test_set = [(gender_features(n), g) for n, g in test_names]
# Separate the pairs into a tuple of feature vectors and a tuple of labels.
train_X,train_y = zip(*train_set)
# Fit a multinomial Naive Bayes model on the training partition only.
classifier = MultinomialNB()
classifier.fit(train_X, train_y)



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
from sklearn.metrics import accuracy_score
devtest_X,devtest_y = zip(*devtest_set)
devtest_predictions = classifier.predict(devtest_X)
accuracy_score(devtest_y, devtest_predictions)

0.752

Scikit-Learn provides LabelBinarizer to encode labels using one-hot encoding:

In [11]:
from sklearn import preprocessing
alphabet = 'abcdefghijklmnopqrstuvwxyz'
lb = preprocessing.LabelBinarizer()
# Fit on the 26 lower-case letters so that each letter gets its own column.
lb.fit(list(alphabet))
def one_hot_character2(c):
    """Return the one-hot encoding of a letter using the fitted LabelBinarizer.

    The character is lower-cased first, as one_hot_character does, because
    the binarizer was fitted on lower-case letters only.
    """
    return list(lb.transform([c.lower()])[0])

In [12]:
one_hot_character2('c')

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [13]:
def gender_features2(word):
    """Encode a name by its final two characters using one_hot_character2.

    Returns the encoding of the second-to-last character followed by the
    encoding of the last character, as one concatenated list.
    """
    # Encode the last character first, then the one before it.
    encoded_last, encoded_penultimate = [one_hot_character2(word[offset])
                                         for offset in (-1, -2)]
    return encoded_penultimate + encoded_last

In [14]:
gender_features2("Mary")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0]

In [15]:
train_set2 = [(gender_features2(n), g) for n, g in train_names]
devtest_set2 = [(gender_features2(n), g) for n, g in devtest_names]
test_set2 = [(gender_features2(n), g) for n, g in test_names]
train_X2,train_y2 = zip(*train_set2)
classifier2 = MultinomialNB()
classifier2.fit(train_X2, train_y2)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
devtest_X2,devtest_y2 = zip(*devtest_set2)
# Evaluate classifier2, the model trained on the LabelBinarizer features,
# rather than the first classifier.
devtest_predictions2 = classifier2.predict(devtest_X2)
accuracy_score(devtest_y2, devtest_predictions2)

0.752

# Movie Reviews

In [17]:
import nltk
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews
movie_reviews.categories()

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


['neg', 'pos']

In [18]:
print("Number of negative reviews:", len(movie_reviews.fileids('neg')))
print("Number of positive reviews:", len(movie_reviews.fileids('pos')))

Number of negative reviews: 1000
Number of positive reviews: 1000


The following code partitions the movie review corpus into a training set (60%), a devtest set (20%) and a test set (20%).

In [19]:
import random
# One (list of word tokens, category) pair per review, for every category.
documents_words = [(list(movie_reviews.words(fileid)), category)
                   for category in movie_reviews.categories()
                   for fileid in movie_reviews.fileids(category)]
# Fixed seed so that the shuffle, and hence the partition, is repeatable.
random.seed(1234)
random.shuffle(documents_words)
# Cut points at 60% and 80% of the shuffled documents.
threshold1 = int(len(documents_words)*.6)
threshold2 = int(len(documents_words)*.8)
train_words = documents_words[:threshold1]
devtest_words = documents_words[threshold1:threshold2]
test_words = documents_words[threshold2:]

The following code finds the 2000 most frequent non-stop words.

In [20]:
import collections
nltk.download("stopwords")
from nltk.corpus import stopwords
# A set gives constant-time membership tests; the stop list is consulted once
# for every token of the training documents.
stop = set(stopwords.words('english'))
# Count lower-cased, non-stop words over the training partition only.
c = collections.Counter(w.lower() for (words, category) in train_words
                        for w in words if w.lower() not in stop)
top2000words = [w for (w, count) in c.most_common(2000)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Using NLTK

The following code implements one-hot encoding with the 2000 most frequent words.

In [21]:
def document_features(words):
    """Return the document features for an NLTK classifier.

    `words` is the sequence of tokens of one document. The result is a dict
    with one key 'has(<word>)' per entry of `top2000words`, whose value is
    True when the lower-cased document contains that word and False otherwise.
    """
    # Build a set once so each of the vocabulary lookups is constant-time
    # instead of a scan over the whole document.
    words_lower = {w.lower() for w in words}
    return {'has(%s)' % w: (w in words_lower) for w in top2000words}

And here we train an NLTK Naive Bayes classifier using the training set, and evaluate the system using the devtest set.

In [22]:
train_features = [(document_features(x), y) for (x, y) in train_words]
devtest_features = [(document_features(x), y) for (x, y) in devtest_words]
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [23]:
nltk.classify.accuracy(classifier, devtest_features)

0.7775

In [24]:
nltk.classify.accuracy(classifier, train_features)

0.8816666666666667

We can see the difference in accuracy between the devtest set and the training set: the classifier does noticeably better on the data it was trained on.

## Using Scikit-learn

The following code defines a second feature extractor that uses the same list of 2000 words but returns a vector suitable for sklearn.

In [25]:
def vector_features(words):
    """Return a vector of features for sklearn.

    `words` is the sequence of tokens of one document. The result is a list
    with one entry per word of `top2000words`, in the same order: 1 when the
    lower-cased document contains the word, 0 otherwise.
    """
    # Build a set once so each of the vocabulary lookups is constant-time
    # instead of a scan over the whole document.
    words_lower = {w.lower() for w in words}
    return [1 if w in words_lower else 0 for w in top2000words]

Below is the code that generates the vectors, trains a Multinomial Naive Bayes classifier, and evaluates the result. 

In [26]:
train_vectors = [vector_features(x) for (x, y) in train_words]
train_labels = [y for (x, y) in train_words]
devtest_vectors = [vector_features(x) for (x, y) in devtest_words]
devtest_labels = [y for (x, y) in devtest_words]

In [27]:
from sklearn.naive_bayes import MultinomialNB
sklearn_classifier = MultinomialNB()
sklearn_classifier.fit(train_vectors, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
predictions = sklearn_classifier.predict(devtest_vectors)
accuracy_score(devtest_labels, predictions)

0.83999999999999997

In [30]:
predictions = sklearn_classifier.predict(train_vectors)
accuracy_score(train_labels, predictions)

0.92000000000000004

## tf-idf as features in sklearn

In [31]:
documents_raw = [(movie_reviews.raw(fileid), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
random.seed(1234)
random.shuffle(documents_raw)
threshold1 = int(len(documents_raw)*.6)
threshold2 = int(len(documents_raw)*.8)
train_raw = documents_raw[:threshold1]
train_labels_raw = [y for x, y in train_raw]
devtest_raw = documents_raw[threshold1:threshold2]
devtest_labels_raw = [y for x, y in devtest_raw]
test_raw = documents_raw[threshold2:]
test_labels_raw = [y for x, y in test_raw]

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
train_features = tfidf.fit_transform([x for x, y in train_raw])

sklearn_classifier2 = MultinomialNB()
sklearn_classifier2.fit(train_features, train_labels_raw)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
devtest_features = tfidf.transform([x for x, y in devtest_raw])
predictions = sklearn_classifier2.predict(devtest_features)
accuracy_score(devtest_labels_raw, predictions)

0.81000000000000005

In [34]:
predictions = sklearn_classifier2.predict(train_features)
accuracy_score(train_labels_raw, predictions)

0.98083333333333333

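And below is the code that uses Support Vector Machines (SVM) instead. You can see that the interface is the same. SVMs typically give very good results, especially when the amount of training data is large enough (in this case it wasn't). Note that with the default settings used here, accuracy is close to 50% even on the training set, so the SVM's hyperparameters (for example the kernel, `C` and `gamma`) would need tuning before drawing conclusions.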

In [35]:
from sklearn.svm import SVC
sklearn_classifier3 = SVC()
sklearn_classifier3.fit(train_features, train_labels_raw)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
predictions3 = sklearn_classifier3.predict(devtest_features)
accuracy_score(devtest_labels_raw, predictions3)

0.51500000000000001

In [37]:
predictions3 = sklearn_classifier3.predict(train_features)
accuracy_score(train_labels_raw, predictions3)

0.50083333333333335

## 10-fold Cross Validation using sklearn

In [38]:
from sklearn.model_selection import cross_val_score
crossval_classifier = MultinomialNB()
scores = cross_val_score(crossval_classifier, train_features, train_labels_raw, cv=10, scoring="accuracy")
scores

array([ 0.79338843,  0.80833333,  0.76666667,  0.79166667,  0.80833333,
        0.79166667,  0.825     ,  0.83333333,  0.79166667,  0.80672269])

In [39]:
print("Mean of accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())

Mean of accuracy: 0.801677778549
Standard deviation of accuracy: 0.018042179595


In [40]:
import numpy as np
from sklearn.model_selection import KFold
fold = 0
# Ten shuffled folds; the fixed random_state makes the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
# Convert the labels to a NumPy array so they can be indexed with the
# index arrays produced by kf.split.
train_labels_array = np.array(train_labels_raw)
for kv_train, kv_test in kf.split(train_raw):
    # kv_train and kv_test are arrays of row indices into the training data
    # (train_raw, train_features and train_labels_array share the same order)
    print("Fold %i:" % fold)
    fold += 1
    # Train a fresh classifier on this fold's training rows and score it
    # on the rows held out for this fold.
    cv_classifier = MultinomialNB()
    cv_classifier.fit(train_features[kv_train], train_labels_array[kv_train])
    test_predictions = cv_classifier.predict(train_features[kv_test])
    test_accuracy = accuracy_score(train_labels_array[kv_test], test_predictions)
    print("Accuracy: %.3f" % test_accuracy)

Fold 0:
Accuracy: 0.683
Fold 1:
Accuracy: 0.792
Fold 2:
Accuracy: 0.817
Fold 3:
Accuracy: 0.717
Fold 4:
Accuracy: 0.833
Fold 5:
Accuracy: 0.750
Fold 6:
Accuracy: 0.833
Fold 7:
Accuracy: 0.850
Fold 8:
Accuracy: 0.808
Fold 9:
Accuracy: 0.800


# Sentence Segmentation

The following code splits the Brown corpus into a training and test set. **Note that now we cannot shuffle the sentences since we will need information from text from previous and following sentences.**

In [41]:
nltk.download("brown")
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [42]:
sents = brown.sents(categories='news')
size = int(len(sents)*0.1)
train_sents, test_sents = sents[size:], sents[:size]

The following code extracts the boundary information of tokenised sentences. This can be used for our annotations.

In [43]:
def extract_boundaries(sents):
    """Flatten tokenised sentences and record where each sentence ends.

    Returns a pair (tokens, boundaries): `tokens` is the concatenation of
    all sentences, and `boundaries` holds, for each sentence in order, the
    index in `tokens` of that sentence's final token.
    """
    flat_tokens = []
    end_positions = []
    for sentence in sents:
        flat_tokens += sentence
        # The sentence just added ends at the last index of the flat list.
        end_positions.append(len(flat_tokens) - 1)
    return flat_tokens, end_positions

In [44]:
train_tokens, train_boundaries = extract_boundaries(train_sents)
test_tokens, test_boundaries = extract_boundaries(test_sents)

In [45]:
train_tokens[:50]

['He',
 'assured',
 'Mr.',
 'Martinelli',
 'and',
 'the',
 'council',
 'that',
 'he',
 'would',
 'study',
 'the',
 'correct',
 'method',
 'and',
 'report',
 'back',
 'to',
 'the',
 'council',
 'as',
 'soon',
 'as',
 'possible',
 '.',
 'Mr.',
 'Martinelli',
 'said',
 'yesterday',
 'that',
 'the',
 'Citizens',
 'Group',
 'of',
 'Johnston',
 'will',
 'meet',
 'again',
 'July',
 '24',
 'to',
 'plan',
 'further',
 'strategy',
 'in',
 'the',
 'charter',
 'movement',
 '.',
 'He']

In [46]:
train_boundaries[:10]

[24, 48, 77, 96, 115, 149, 181, 202, 239, 252]

In [47]:
train_tokens[21:26]

['soon', 'as', 'possible', '.', 'Mr.']

We now define context-based features for all tokens that are candidates for sentence endings.

In [48]:
def segmenter_features(tokens, i):
    """Return the features of tokens[i], a candidate sentence-ending token.

    The features describe the token itself and its immediate neighbours:
      'next-word-capitalized': whether the following token starts with an
          upper-case character (False when there is no following token);
      'prev-word': the preceding token, lower-cased ('' when there is none);
      'punct': the token itself;
      'prev-word-is-one-char': whether the preceding token has length 1.
    """
    # Guard both ends of the list: tokens[i+1] would raise IndexError on the
    # last token, and tokens[i-1] would silently wrap around to the final
    # token when i is 0.
    prev_word = tokens[i-1] if i > 0 else ''
    next_word = tokens[i+1] if i + 1 < len(tokens) else ''
    return {'next-word-capitalized':
                  next_word[:1].isupper(),
            'prev-word': prev_word.lower(),
            'punct': tokens[i],
            'prev-word-is-one-char':
                  len(prev_word) == 1}

With the tokens, boundaries, and feature extractor, we can prepare the training and test sets.

In [49]:
candidates = '.?!'
# Sets make each `i in ...` boundary check constant-time instead of a scan
# over the whole list of boundary positions.
train_boundary_set = set(train_boundaries)
test_boundary_set = set(test_boundaries)
# One (features, is_boundary) pair per candidate token. The first and last
# tokens are skipped so that every candidate has a neighbour on both sides.
train_features = [(segmenter_features(train_tokens, i),
                   (i in train_boundary_set))
                  for i in range(1, len(train_tokens)-1)
                  if train_tokens[i] in candidates]
test_features = [(segmenter_features(test_tokens, i),
                  (i in test_boundary_set))
                 for i in range(1, len(test_tokens)-1)
                 if test_tokens[i] in candidates]


In [50]:
train_features[:3]

[({'next-word-capitalized': True,
   'prev-word': 'possible',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'movement',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True),
 ({'next-word-capitalized': False,
   'prev-word': 'comes',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True)]

In [51]:
len(train_features), len(test_features)

(3749, 407)

Now we can train a classifier that can be used for sentence segmentation

In [52]:
segmenter = nltk.NaiveBayesClassifier.train(train_features)
nltk.classify.accuracy(segmenter, test_features)

1.0

Looks impressive! But let's check what would happen if we introduced a majority baseline classifier.

In [53]:
from collections import Counter
train_counter = Counter([f[1] for f in train_features])
train_counter

Counter({False: 62, True: 3687})

Since most training samples are labelled as `True`, the majority baseline is a classifier that always outputs `True`. In that case, accuracy in the test set is:

In [54]:
test_counter = Counter([f[1] for f in test_features])
test_counter

Counter({False: 2, True: 405})

In [55]:
405/407

0.995085995085995

So the classifier's perfect score is less impressive than it first looked: the majority baseline already reaches 99.5% accuracy on the test set. The finished segmenter that uses the trained classifier is:

In [56]:
def segment_sentences(tokens):
    """Segment a list of tokens into sentences.

    Every token found in `candidates` is passed to the trained `segmenter`;
    when it is classified as a boundary, the tokens up to and including it
    form one sentence. Any tokens left after the last detected boundary are
    returned as a final sentence. Returns a list of token lists.
    """
    start = 0
    sents = []
    last_index = len(tokens) - 1
    for i, token in enumerate(tokens):
        if token not in candidates:
            continue
        if i == last_index:
            # The final token has no following word for the feature extractor
            # to inspect; the remaining tokens are emitted below in any case.
            break
        if segmenter.classify(segmenter_features(tokens, i)):
            sents.append(tokens[start:i+1])
            start = i+1
    if start < len(tokens):
        sents.append(tokens[start:])
    return sents

In [57]:
segment_sentences(["This", "is", "a", "sentence", ".", "This", 
                    "is", "another", "one"])

[['This', 'is', 'a', 'sentence', '.'], ['This', 'is', 'another', 'one']]

# The Reuters-21578 Corpus
The Reuters-21578 is an example of a corpus for multi-label text classification. Each news item in the corpus has been annotated with the labels of all the topics for which the news item is relevant. This means that one news item could have several labels, or even none at all.

In [58]:
import nltk
nltk.download("reuters")
from nltk.corpus import reuters

[nltk_data] Downloading package reuters to /root/nltk_data...


In [59]:
reuters.categories()

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc']

The following code shows the list of files that have the label "corn".

In [60]:
reuters.fileids(categories='corn')

['test/14832',
 'test/14858',
 'test/15033',
 'test/15043',
 'test/15106',
 'test/15287',
 'test/15341',
 'test/15618',
 'test/15648',
 'test/15676',
 'test/15686',
 'test/15720',
 'test/15845',
 'test/15856',
 'test/15860',
 'test/15863',
 'test/15871',
 'test/15875',
 'test/15877',
 'test/15890',
 'test/15904',
 'test/15906',
 'test/15910',
 'test/15911',
 'test/15917',
 'test/15952',
 'test/15999',
 'test/16012',
 'test/16071',
 'test/16099',
 'test/16147',
 'test/16525',
 'test/16624',
 'test/16751',
 'test/16765',
 'test/17503',
 'test/17509',
 'test/17722',
 'test/18035',
 'test/18482',
 'test/18614',
 'test/18954',
 'test/18973',
 'test/19165',
 'test/19721',
 'test/19821',
 'test/20018',
 'test/20366',
 'test/20637',
 'test/20645',
 'test/20649',
 'test/20723',
 'test/20763',
 'test/21091',
 'test/21243',
 'test/21493',
 'training/10120',
 'training/10139',
 'training/10172',
 'training/10175',
 'training/10319',
 'training/10339',
 'training/10487',
 'training/10489',
 'traini

### Split data for three independent classifiers
We will encode the labels of each document as a list of Boolean elements that indicate whether the document has a label or not.

In [61]:
corn_fileids = reuters.fileids(categories='corn')
gold_fileids = reuters.fileids(categories='gold')
grain_fileids = reuters.fileids(categories='grain')

# Sets give constant-time membership tests when labelling every file below.
corn_ids = set(corn_fileids)
gold_ids = set(gold_fileids)
grain_ids = set(grain_fileids)

# File ids are prefixed with the partition they belong to.
all_fileids = reuters.fileids()
training_fileids = [f for f in all_fileids if f.startswith('training')]
test_fileids = [f for f in all_fileids if f.startswith('test')]

# Each item is (fileid, (has_corn, has_gold, has_grain)).
training_set = [(f, (f in corn_ids, f in gold_ids, f in grain_ids)) for f in training_fileids]
test_set = [(f, (f in corn_ids, f in gold_ids, f in grain_ids)) for f in test_fileids]

In [62]:
len(training_set), len(test_set)

(7769, 3019)

In [63]:
test_set[:10]

[('test/14826', (False, False, False)),
 ('test/14828', (False, False, True)),
 ('test/14829', (False, False, False)),
 ('test/14832', (True, False, True)),
 ('test/14833', (False, False, False)),
 ('test/14839', (False, False, False)),
 ('test/14840', (False, False, False)),
 ('test/14841', (False, False, True)),
 ('test/14842', (False, True, False)),
 ('test/14843', (False, False, False))]

### Extract features

In [64]:
import collections
all_words = collections.Counter(w.lower() \
        for w in reuters.words(fileids=training_fileids))
word_features = [w for (w, c) in all_words.most_common(500)]
word_features[:10]

['.', ',', 'the', 'to', 'of', 'in', 'and', 'said', 'a', 'mln']

In [65]:
def document_features(fileid):
    """Return the NLTK feature dict of one Reuters document.

    The result has one key 'contains(<word>)' per entry of `word_features`,
    whose value says whether the lower-cased document contains that word.

    Note: this reuses the name of the movie-review feature extractor defined
    earlier in the notebook and replaces it from this cell onwards.
    """
    document_words = {token.lower()
                      for token in reuters.words(fileids=[fileid])}
    return {'contains(%s)' % word: (word in document_words)
            for word in word_features}

training_features = [(document_features(f),t) for (f,t) in training_set]
test_features = [(document_features(f),t) for (f,t) in test_set]

### Training

In [66]:
classifier_corn = nltk.NaiveBayesClassifier.train((f, t[0]) for f, t in training_features)
classifier_corn.show_most_informative_features(5)

Most Informative Features
          contains(corn) = True             True : False  =    182.5 : 1.0
             contains(&) = True            False : True   =     69.0 : 1.0
            contains(lt) = True            False : True   =     69.0 : 1.0
             contains(>) = True            False : True   =     67.5 : 1.0
             contains(;) = True            False : True   =     41.6 : 1.0


In [67]:
classifier_gold = nltk.NaiveBayesClassifier.train((f, t[1]) for f, t in training_features)
classifier_gold.show_most_informative_features(5)

Most Informative Features
          contains(gold) = True             True : False  =     94.9 : 1.0
           contains(the) = False           False : True   =     22.0 : 1.0
           contains(cts) = True            False : True   =     17.6 : 1.0
            contains(vs) = True            False : True   =     16.7 : 1.0
           contains(net) = True            False : True   =     15.8 : 1.0


In [68]:
classifier_grain = nltk.NaiveBayesClassifier.train((f, t[2]) for f, t in training_features)
classifier_grain.show_most_informative_features(5)

Most Informative Features
         contains(wheat) = True             True : False  =    233.1 : 1.0
             contains(>) = True            False : True   =    166.6 : 1.0
         contains(grain) = True             True : False  =    108.3 : 1.0
          contains(corn) = True             True : False  =    106.3 : 1.0
             contains(&) = True            False : True   =    102.2 : 1.0


### Testing the Accuracy

In [71]:
nltk.classify.accuracy(classifier_corn, [(f, t[0]) for f, t in test_features])

0.8509440211990725

In [72]:
nltk.classify.accuracy(classifier_gold, [(f, t[1]) for f, t in test_features])

0.9744948658496191

In [73]:
nltk.classify.accuracy(classifier_grain, [(f, t[2]) for f, t in test_features])

0.8373633653527658

### Macro-averaged Evaluation of F1

In [74]:
def f1(y_true, y_pred):
    """Return the F1 score of Boolean predictions for a single label.

    y_true[i] is the true annotation and y_pred[i] the prediction for item i;
    both sequences must have the same length. Recall, precision and F1 are
    each defined as 0.0 when their denominator is zero (for example when
    there are no positive annotations or no positive predictions).
    """
    assert len(y_true) == len(y_pred)
    tp, fp, fn = 0, 0, 0
    for truth, predicted in zip(y_true, y_pred):
        # Values are compared with True explicitly, so only True-equal
        # values count as positive.
        if truth == True:
            if predicted == True:
                tp += 1
            else:
                fn += 1
        elif predicted == True:
            fp += 1
    # Test the denominators explicitly instead of catching every exception.
    recall = tp/(tp+fn) if tp+fn else 0.0
    precision = tp/(tp+fp) if tp+fp else 0.0
    if recall+precision == 0:
        return 0.0
    return 2*recall*precision/(recall+precision)

In [75]:
predictions_corn = [classifier_corn.classify(f) for f, t in test_features]
predictions_corn[:10]

[False, True, False, True, True, True, True, True, False, False]

In [76]:
true_corn = [t[0] for f, t in test_features]
true_corn[:10]

[False, False, False, True, False, False, False, False, False, False]

In [77]:
predictions_gold = [classifier_gold.classify(f) for f, t in test_features]
predictions_grain = [classifier_grain.classify(f) for f, t in test_features]
true_gold = [t[1] for f, t in test_features]
true_grain = [t[2] for f, t in test_features]

In [78]:
totalf1 = 0
thef1 = f1(true_corn, predictions_corn)
print('corn f1:', thef1)
totalf1 += thef1
thef1 = f1(true_gold, predictions_gold)
print('gold f1:', thef1)
totalf1 += thef1
thef1 = f1(true_grain, predictions_grain)
print('grain f1:', thef1)
totalf1 += thef1
print("Macro-average f1: %1.4f" % (totalf1/3))

corn f1: 0.18181818181818182
gold f1: 0.3636363636363636
grain f1: 0.3598435462842242
Macro-average f1: 0.3018


### Micro-averaged Evaluation

In [79]:
def f1_micro(y_true, y_pred):
    """Return the micro-averaged F1 score over several labels.

    y_true[i] is the list of true annotations for label i
    y_pred[i] is the list of predictions for label i

    True positives, false positives and false negatives are pooled over all
    labels before recall, precision and F1 are computed. Any number of
    labels is accepted. Recall, precision and F1 are each defined as 0.0
    when their denominator is zero.
    """
    assert len(y_true) == len(y_pred)
    tp, fp, fn = 0, 0, 0
    # Iterate over every label supplied rather than a fixed count of three.
    for label_true, label_pred in zip(y_true, y_pred):
        assert len(label_true) == len(label_pred)
        for truth, predicted in zip(label_true, label_pred):
            if truth == True:
                if predicted == True:
                    tp += 1
                else:
                    fn += 1
            elif predicted == True:
                fp += 1
    # Test the denominators explicitly instead of catching every exception.
    recall = tp/(tp+fn) if tp+fn else 0.0
    precision = tp/(tp+fp) if tp+fp else 0.0
    if recall+precision == 0:
        return 0.0
    return 2*recall*precision/(recall+precision)

In [80]:
print("Micro-average f1: %1.4f" % f1_micro((true_corn, true_gold, true_grain),
                                           (predictions_corn, predictions_gold, predictions_grain)))

Micro-average f1: 0.2921
