In [72]:
import os
import re
import numpy
import datetime
import nltk
# To print all outputs, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Most frequent tagging , NLTK tagging

In [82]:
class BrownCorpus(object):
    def __init__(self,dirname):
        self.dirname = dirname
        self.pos_ct = {}
        self.final_pos = {}
        self.final_ct = {}
        total = len(os.listdir(self.dirname))
        self.train_size = int(total*0.8)
        self.test_size = total-self.train_size
    def compute_pos_ct(self):
        for indx,fname in enumerate(os.listdir(self.dirname)):
            # Run on training set
            if indx>=self.train_size:
                break
            if re.match(r'c.+',fname)==None:
                continue
            fname = os.path.join(self.dirname,fname)
            if not os.path.isfile(fname):
                continue
            for line in open(fname):
                token_tags = [t.split('/') for t in line.split() if len(t.split('/'))==2]
                for token,tag in token_tags:
                    token = str(token).lower()
                    try:
                        self.pos_ct[(token,tag)] +=1
                    except:
                        self.pos_ct[(token,tag)] = 1
    def build_pos_oracle(self):
        for token_tag in self.pos_ct:
            ct = self.pos_ct[token_tag]
            token = token_tag[0]
            tag = token_tag[1]
            try:
                if self.final_ct[token]<ct:
                    self.final_pos[token]=tag
                    self.final_ct[token]=ct
            except:
                self.final_pos[token]=tag
                self.final_ct[token]=ct
    def get_accuracy(self):
        total_tokens = 0
        error_ct = 0
        for indx,fname in enumerate(os.listdir(self.dirname)):
            if indx<self.train_size:
                continue
            if re.match(r'c.+',fname)==None:
                continue
            fname = os.path.join(self.dirname,fname)
            if not os.path.isfile(fname):
                continue
            for line in open(fname):
                token_tags = [t.split('/') for t in line.split() if len(t.split('/'))==2]
                total_tokens += len(token_tags)
                for token,tag in token_tags:
                    token = str(token).lower()
                    try:
                        if tag!=self.final_pos[token]:
                            error_ct+=1
                    except:
                        pass
        return 1-error_ct/float(total_tokens)
    def nltk_tagger_accuracy(self):
        total_tokens = 0
        error_ct = 0
        for indx,fname in enumerate(os.listdir(self.dirname)):
            if indx<self.train_size:
                continue
            if re.match(r'c.+',fname)==None:
                continue
            fname = os.path.join(self.dirname,fname)
            if not os.path.isfile(fname):
                continue
            for line in open(fname):
                token_tags = [nltk.tag.str2tuple(t) for t in line.split() if len(t.split('/'))==2]
                total_tokens += len(token_tags)
                tokens = [tt[0] for tt in token_tags]
                tokens_txt = " ".join(tokens)
                nltk_tags = nltk.pos_tag(tokens_txt)
                for i in range(len(token_tags)):
                    nltk_tag = nltk_tags[i][1]
                    given_tag = token_tags[i][1]
                    if nltk_tag.lower()[:2] != given_tag.lower()[:2]:
                        error_ct += 1
        return 1-error_ct/float(total_tokens)

In [83]:
brown = BrownCorpus('/home/debojyoti/brown/')
brown.compute_pos_ct()
brown.build_pos_oracle()
brown.get_accuracy()
brown.nltk_tagger_accuracy()

2016-10-16 21:05:47.238152
2016-10-16 21:05:48.028085
2016-10-16 21:05:48.075190


0.8422427445175061

## NLTK tagger

In [71]:
import nltk


[(u'The', u'AT'),
 (u'Fulton', u'NP-TL'),
 (u'County', u'NN-TL'),
 (u'Grand', u'JJ-TL'),
 (u'Jury', u'NN-TL'),
 (u'said', u'VBD'),
 (u'Friday', u'NR'),
 (u'an', u'AT'),
 (u'investigation', u'NN'),
 (u'of', u'IN')]