In [None]:
from sklearn.model_selection import train_test_split
from src.dataset import load_training_data
import pycrfsuite
import json
from src.pipelines import SentenceChunker

# Import from the public IPython.display module; importing these names from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

## Load, transform, and split the dataset

In [None]:
# preserve
# Fixed seed so that the train/test split (and every number derived from it)
# is identical after Restart Kernel -> Run All.
RANDOM_SEED = 42

training_set = load_training_data()
# Relabel every gold label that is exactly 'f' as 'n'.
training_set['real_label'] = training_set['real_label'].replace('f', 'n')

# Group consecutive rows that share the same index value into one document.
# Each document is a list of per-word dicts; the index value is stored on every
# word as 'sentence_id'.
documents = []
current_doc = []
previous_id = -1
for sentence_id, row in training_set.iterrows():
    # A change of index value closes the document collected so far.
    if sentence_id != previous_id and current_doc:
        documents.append(current_doc)
        current_doc = []
    word_record = row.to_dict()
    word_record['sentence_id'] = sentence_id
    current_doc.append(word_record)
    previous_id = sentence_id

# Flush the last document, which no index change has closed yet.
if current_doc:
    documents.append(current_doc)

# Show the first word of the first document as a sanity check of the schema.
print(json.dumps( documents[0][:1], indent=2 ))

train_docs, test_docs = train_test_split(documents, random_state=RANDOM_SEED)
print()
print(f'Training docs: {len(train_docs)}')
print(f'Testing docs: {len(test_docs)}')

[
  {
    "offer_len": 44,
    "token": "\u00a1",
    "loc": 0,
    "pos": "faa",
    "pos_left": "<p>",
    "pos_right": "np00000",
    "token_len": 1,
    "all_upper": false,
    "n_tokens": 11,
    "real_label": "n",
    "sentence_id": 0
  }
]

Training docs: 170
Testing docs: 57


## Extractor functions

In [None]:
def extract_features(doc):
    """Return one feature list per word of ``doc``, in document order."""
    per_word_features = []
    for position in range(len(doc)):
        per_word_features.append(word2features(doc, position))
    return per_word_features

def extract_labels(doc):
    """Return the ``'real_label'`` value of every word in ``doc``, in order."""
    return [word['real_label'] for word in doc]

def extract_tokens(doc):
    """Return the ``'token'`` value of every word in ``doc``, in order."""
    return [word['token'] for word in doc]

def is_numeric(token):
    """Tell whether ``token`` parses as a number once commas are removed.

    Commas are stripped first, so ``"1,234.5"`` counts as numeric. Anything
    ``float()`` accepts is treated as numeric.

    Args:
        token: The token text. Values without a usable ``replace`` method
            (e.g. ``None``) are reported as non-numeric.

    Returns:
        ``True`` if the comma-stripped token converts with ``float()``,
        ``False`` otherwise.
    """
    try:
        float(token.replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "not numeric"; KeyboardInterrupt and
        # SystemExit must propagate instead of being swallowed.
        return False
    return True
    

def word2features(doc, i):
    """Build the CRF feature list for the word at position ``i`` of ``doc``.

    The list always starts with ``'bias'`` followed by the numeric flag and
    POS tag of the word itself. It then holds the same two features for the
    previous word (prefixed ``-1:``) or ``'BOS'`` when ``i`` is the first
    position, and finally the same two features for the next word (prefixed
    ``+1:``) or ``'EOS'`` when ``i`` is the last position.

    Args:
        doc: Sequence of word dicts, each with a ``'token'`` and a ``'pos'`` entry.
        i: Index of the word inside ``doc``.

    Returns:
        A list of feature strings.
    """
    def describe(position, prefix=''):
        # Numeric flag and POS tag of one word, optionally prefixed with its
        # offset relative to the current word.
        entry = doc[position]
        token = entry['token']
        postag = entry['pos']
        return [
            prefix + 'word.isdigit=%s' % is_numeric(token),
            prefix + 'postag=' + postag,
        ]

    # Lexical features (lowercased word, suffixes, casing, location) are
    # currently disabled; add more features here for a custom use case.
    features = ['bias']
    features.extend(describe(i))

    # Left context, or a marker for the beginning of the document.
    if i > 0:
        features.extend(describe(i - 1, '-1:'))
    else:
        features.append('BOS')

    # Right context, or a marker for the end of the document.
    if i < len(doc) - 1:
        features.extend(describe(i + 1, '+1:'))
    else:
        features.append('EOS')

    return features

In [None]:
%%time
y_train = [extract_labels(s) for s in train_docs]
X_train = [extract_features(s) for s in train_docs]

y_test = [extract_labels(s) for s in test_docs]
X_test = [extract_features(s) for s in test_docs]

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# Training hyper-parameters handed to the CRF trainer.
crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [None]:
%%time
# Train on the sequences appended to the trainer and write the model to 'model.crfsuite'.
trainer.train('model.crfsuite')

In [None]:
# Open the model file written by the training cell in a tagger used for prediction.
tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')

In [None]:
# Evaluate on the held-out split only: print every test sentence whose
# predicted label sequence differs from its gold labels, then the error count.
# (Iterating test_docs directly, rather than indexing into `documents`, keeps
# training sentences out of this report.)
incorrect = 0
for example_sent in test_docs:
    predicted = tagger.tag(extract_features(example_sent))
    correct = extract_labels(example_sent)
    if predicted != correct:
        incorrect += 1
        tokens = extract_tokens(example_sent)
        lengths = [len(t) for t in tokens]
        print("%4d" %  example_sent[0]['sentence_id'], ' '.join(tokens))

        # Each label is followed by as many spaces as its token is long, so a
        # one-character label lines up under the first character of its token.
        print('P:   ', end='')
        for position, label in enumerate(predicted):
            print(label + ( " " * lengths[position]), end='')
        print()
        print('C:   ', end='')
        for position, label in enumerate(correct):
            print(label + ( " " * lengths[position]), end='')
        print('\n\n')

print(f'Incorrectly predicted: {incorrect} out of {len(test_docs)}')

In [None]:
# Run the tagger over every document (training and test sentences together)
# and print each one whose predicted labels differ from the gold labels.
incorrect = 0
for example_sent in documents:
    predicted = tagger.tag(extract_features(example_sent))
    correct = extract_labels(example_sent)
    if predicted == correct:
        continue

    incorrect += 1
    tokens = extract_tokens(example_sent)
    widths = [len(token) for token in tokens]
    print("%4d" %  example_sent[0]['sentence_id'], ' '.join(tokens))

    # Each label is followed by as many spaces as its token is long, so a
    # one-character label lines up under the first character of its token.
    predicted_row = ''.join(
        label + " " * widths[position] for position, label in enumerate(predicted)
    )
    correct_row = ''.join(
        label + " " * widths[position] for position, label in enumerate(correct)
    )
    print('P:   ' + predicted_row)
    print('C:   ' + correct_row + '\n\n')

print(f'Incorrectly predicted: {incorrect} out of {len(documents)}')