In [20]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Location of the annotated Reuters corpus; kept in one named constant so the
# path can be changed without touching the parsing code below.
REUTERS_XML_PATH = "D:\\Projects\\ARI5121_NLP\\75\\data\\n3_collection_master\\reuters.xml"

# Read data file and parse the XML
with codecs.open(REUTERS_XML_PATH, "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

# One entry per <document>: a list of (word, label) pairs, where the label is
# "N" for words inside a named entity and "I" for every other word.
docs = []
for elem in soup.find_all("document"):
    container = elem.find("textwithnamedentities")
    if container is None:
        # find() returns None when the element is absent; skip such documents
        # instead of failing on `.children` with an AttributeError.
        continue

    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in container.children:
        # Only tag children carry text we label; bare strings between tags
        # are ignored.
        if isinstance(c, Tag):
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                # Consecutive spaces produce empty strings; drop them.
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)

In [21]:
import nltk
# One entry per document: a list of (word, POS tag, label) triples.
data = []
for doc in docs:

    # Obtain the list of tokens in the document
    tokens = [token for token, _ in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label. `tagged` is aligned with `doc`
    # token by token, so only the tag is taken from it.
    data.append([(token, pos, label) for (token, label), (_, pos) in zip(doc, tagged)])

In [22]:
def word2features(doc, i):
    """Return the list of feature strings describing the token at index ``i``.

    ``doc[i][0]`` is read as the word and ``doc[i][1]`` as its POS tag; both
    are used as strings. The result holds, in order:

    1. the features of the token itself,
    2. the features of the previous token, or the marker ``'BOS'`` when
       ``i`` is not greater than 0,
    3. the features of the next token, or the marker ``'EOS'`` when ``i`` is
       not smaller than ``len(doc) - 1``.
    """
    token, tag = doc[i][0], doc[i][1]

    def neighbour_features(prefix, entry):
        # Both neighbours contribute the same five features; only the
        # positional prefix ('-1' or '+1') differs.
        text, pos = entry[0], entry[1]
        return [
            prefix + ':word.lower=' + text.lower(),
            prefix + ':word.istitle=' + str(text.istitle()),
            prefix + ':word.isupper=' + str(text.isupper()),
            prefix + ':word.isdigit=' + str(text.isdigit()),
            prefix + ':postag=' + pos,
        ]

    # Features that every token gets, regardless of position
    feats = ['bias']
    feats.append('word.lower=' + token.lower())
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=' + str(token.isupper()))
    feats.append('word.istitle=' + str(token.istitle()))
    feats.append('word.isdigit=' + str(token.isdigit()))
    feats.append('postag=' + tag)

    # Left context, or the 'beginning of a document' marker
    has_previous = i > 0
    if not has_previous:
        feats.append('BOS')
    else:
        feats += neighbour_features('-1', doc[i - 1])

    # Right context, or the 'end of a document' marker
    has_next = i < len(doc) - 1
    if not has_next:
        feats.append('EOS')
    else:
        feats += neighbour_features('+1', doc[i + 1])

    return feats

In [23]:
from sklearn.model_selection import train_test_split

# Build the feature list of every token in a document
def extract_features(doc):
    """Return one ``word2features`` result per position of ``doc``, in order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the third element of each (token, postag, label) triple in ``doc``."""
    labels = []
    for _token, _postag, tag in doc:
        labels.append(tag)
    return labels

# Per-document feature sequences and the matching label sequences
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. The fixed random_state makes the
# shuffle, and therefore the trained model and its scores, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=False)

# Sanity-check the training data by previewing only the first few tokens of
# the first sequence; printing the whole sequence floods the cell output.
print(X_train[0][:3])
print(y_train[0][:3])

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('prototype_4.model')

[['bias', 'word.lower=west', 'word[-3:]=est', 'word[-2:]=st', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NNP', 'BOS', '+1:word.lower=germanys', '+1:word.istitle=True', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=NNP'], ['bias', 'word.lower=germanys', 'word[-3:]=nys', 'word[-2:]=ys', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NNP', '-1:word.lower=west', '-1:word.istitle=True', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=NNP', '+1:word.lower=total', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=JJ'], ['bias', 'word.lower=total', 'word[-3:]=tal', 'word[-2:]=al', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=JJ', '-1:word.lower=germanys', '-1:word.istitle=True', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=NNP', '+1:word.lower=net', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:post

0....1....2....3....4....5....6....7....8....9....10
Number of features: 14425
Seconds required: 0.072

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5881.908120
Feature norm: 1.000000
Error norm: 6404.341733
Active features: 13993
Line search trials: 1
Line search step: 0.000039
Seconds required for this iteration: 0.018

***** Iteration #2 *****
Loss: 4782.091977
Feature norm: 0.850093
Error norm: 5757.284758
Active features: 14041
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #3 *****
Loss: 4362.540064
Feature norm: 0.811332
Error norm: 13465.640974
Active features: 9200
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.018

***** Iteration #4 *****
Loss: 2659.112037
Feature norm: 0.919782
Error norm: 2839.409243
Active f

***** Iteration #98 *****
Loss: 235.236538
Feature norm: 46.235923
Error norm: 8.329072
Active features: 1907
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #99 *****
Loss: 235.194522
Feature norm: 46.245677
Error norm: 5.628109
Active features: 1907
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #100 *****
Loss: 235.163602
Feature norm: 46.279617
Error norm: 9.968572
Active features: 1907
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #101 *****
Loss: 235.114577
Feature norm: 46.293366
Error norm: 10.007253
Active features: 1908
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #102 *****
Loss: 235.086911
Feature norm: 46.325185
Error norm: 12.378794
Active features: 1908
Line search trials: 1
Line search step: 1.000000
Seconds required for thi