In [1]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from nltk.corpus import treebank, brown, conll2000, reuters
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [2]:
nltk.download('punkt')
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('reuters')
tagged_treebank = treebank.tagged_sents()
tagged_brown = brown.tagged_sents()
tagged_conll2000 = conll2000.tagged_sents()
reuters_files = reuters.fileids()

# Tokenize the raw text into sentences and apply POS tagging
tagged_sentences_reuters = []
for file_id in reuters_files:
    raw_text = reuters.raw(file_id)
    sentences = sent_tokenize(raw_text)
    for sentence in sentences:
        words = word_tokenize(sentence)
        tagged_sentence = pos_tag(words, tagset='universal')
        tagged_sentences_reuters.append(tagged_sentence)
        
# print(treebank)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\blkeu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\blkeu\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\blkeu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\blkeu\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\blkeu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [3]:
def word_features(sentence, i):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0, #if the word is a first word
        'is_last': i == len(sentence) - 1,  #if the word is a last word
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,      #word is in uppercase
        'is_all_lower': word.lower() == word,      #word is in lowercase
         #prefix of the word
        'prefix-1': word[0],   
        'prefix-2': word[:2],
        'prefix-3': word[:3],
         #suffix of the word
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
         #extracting previous word
        'prev_word': '' if i == 0 else sentence[i-1][0],
         #extracting next word
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,    #if word has hypen
        'is_numeric': word.isdigit(),  #if word is in numeric
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

In [4]:
def feature_extract(corpus):
    X = []
    y = []
    for sentence in corpus:
    	X_sentence = []
    	y_sentence = []
    	for i in range(len(sentence)):
    		X_sentence.append(word_features(sentence, i))
    		y_sentence.append(sentence[i][1])
    	X.append(X_sentence)
    	y.append(y_sentence)
    
    
    # Split the data into training and testing sets
    split = int(0.8 * len(X))
    X_train = X[:split]
    y_train = y[:split]
    X_test = X[split:]
    y_test = y[split:]
    return X_train, y_train, X_test, y_test

In [5]:
def train_and_pred(X_train, y_train, X_test, y_test):
    crf = sklearn_crfsuite.CRF(
    	algorithm='lbfgs',
    	c1=0.1,
    	c2=0.1,
    	max_iterations=100,
    	all_possible_transitions=True
    )
    crf.fit(X_train, y_train)
    
    # Make predictions on the test data and evaluate the performance
    y_pred = crf.predict(X_test)
    
    print(f"Accuracy: {metrics.flat_accuracy_score(y_test, y_pred)}")

In [6]:
def pipeline(corpus):
    print(f"CRF {corpus}\n\n")
    X_train, y_train, X_test, y_test = feature_extract(corpus)
    train_and_pred(X_train, y_train, X_test, y_test)

In [7]:
pipeline(tagged_treebank)
# pipeline(tagged_brown)
pipeline(tagged_conll2000)
pipeline(tagged_sentences_reuters)

CRF [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


Accuracy: 0.9632716203403363
CRF [[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'),

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Accuracy: 0.9832842396121162


In [None]:
%%timeit
pipeline(tagged_brown)

CRF [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('At

In [None]:
# ^ 5:20PM

In [10]:
# import pycrfsuite

# # Train a CRF model suing pysrfsuite
# trainer = pycrfsuite.Trainer(verbose=False)
# for x, y in zip(X_train, y_train):
# 	trainer.append(x, y)
# trainer.set_params({
# 	'c1': 1.0,
# 	'c2': 1e-3,
# 	'max_iterations': 50,
# 	'feature.possible_transitions': True
# })
# trainer.train('pos.crfsuite')

# # Tag a new sentence
# tagger = pycrfsuite.Tagger()
# tagger.open('pos.crfsuite')
# sentence = 'Geeksforgeeks is a best platform for students.'.split()
# features = [word_features(sentence, i) for i in range(len(sentence))]
# tags = tagger.tag(features)
# print(list(zip(sentence, tags)))
