In [1]:
import nltk
import nltk.tag.hmm as hmm
from nltk.corpus import conll2002
nltk.download('conll2002')
import spacy
from spacy.tokenizer import Tokenizer
import pandas as pd
from sklearn_crfsuite import CRF
from conll import evaluate
import es_core_news_sm
nlp = es_core_news_sm.load()

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

nlp = spacy.load("en_core_web_sm")
print("\033[1mconll2002 Corpus\033[0m")
print(conll2002.iob_sents('esp.train')[0])

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\farih\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


[1mconll2002 Corpus[0m
[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [2]:


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

nlp.tokenizer = Tokenizer(nlp.vocab)  # to use white space tokenization (generally a bad idea for unknown data)
def sent2spacy_features(sent):
    spacy_sent = nlp(" ".join(sent2tokens(sent)))
    feats = []
    for token in spacy_sent:
        token_feats = {
            'bias': 1.0,
            'word.lower()': token.lower_,
            'pos': token.pos_,
            'lemma': token.lemma_
        }
        feats.append(token_feats)
    
    return feats



In [3]:
# let's get only word and iob-tag
trn_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
trn_label = [sent2labels(s) for s in trn_sents]
trn_feats = [sent2spacy_features(s) for s in trn_sents]

tst_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]
tst_feats = [sent2spacy_features(s) for s in tst_sents]

crf = CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

try:
    crf.fit(trn_feats, trn_label)
except AttributeError:
    pass

pred = crf.predict(tst_feats)

hyp = [[(tst_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred)]

print("\033[1mBaseline using the fetures in sent2spacy_features:\033[0m")
results = evaluate(tst_sents, hyp)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

[1mBaseline using the fetures in sent2spacy_features:[0m


Unnamed: 0,p,r,f,s
ORG,0.792,0.541,0.643,1700
MISC,0.645,0.371,0.471,445
LOC,0.714,0.747,0.73,985
PER,0.849,0.452,0.59,1222
total,0.766,0.545,0.637,4352


In [4]:
# nlp = spacy.load("es_core_news_sm")
nlp.tokenizer = Tokenizer(nlp.vocab)  # to use white space tokenization (generally a bad idea for unknown data)

def sent2spacy_features_with_suffix(sent):
    spacy_sent = nlp(" ".join(sent2tokens(sent)))
    feats = []
    for token in spacy_sent:
        token_feats = {
            'bias': 1.0,
            'word.lower()': token.lower_,
            'pos': token.pos_,
            'lemma': token.lemma_,
            'suffix': token.suffix_
        }
        feats.append(token_feats)
    
    return feats

In [5]:
# let's get only word and iob-tag
trn_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
trn_label = [sent2labels(s) for s in trn_sents]
trn_feats = [sent2spacy_features_with_suffix(s) for s in trn_sents]

tst_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]
tst_feats = [sent2spacy_features_with_suffix(s) for s in tst_sents]

crf = CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

try:
    crf.fit(trn_feats, trn_label)
except AttributeError:
    pass

pred = crf.predict(tst_feats)

hyp = [[(tst_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred)]

print("\033[1mAdd the 'suffix' feature:\033[0m")
results = evaluate(tst_sents, hyp)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

[1mAdd the 'suffix' feature:[0m


Unnamed: 0,p,r,f,s
ORG,0.794,0.549,0.65,1700
MISC,0.589,0.38,0.462,445
LOC,0.699,0.744,0.721,985
PER,0.847,0.552,0.669,1222
total,0.759,0.577,0.656,4352


In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]


In [7]:
# let's get only word and iob-tag
trn_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
trn_label = [sent2labels(s) for s in trn_sents]
trn_feats = [sent2spacy_features_with_suffix(s) for s in trn_sents]

tst_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]
tst_feats = [sent2spacy_features_with_suffix(s) for s in tst_sents]


X_train = [sent2features(s) for s in trn_sents]
y_train = [sent2labels(s) for s in trn_sents]

X_test = [sent2features(s) for s in tst_sents]
y_test = [sent2labels(s) for s in tst_sents]


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


pred = crf.predict(X_test)

hyp = [[(tst_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred)]

print("\033[1mAdd all the features used in the tutorial on CoNLL dataset:\033[0m")
results = evaluate(tst_sents, hyp)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

[1mAdd all the features used in the tutorial on CoNLL dataset:[0m


Unnamed: 0,p,r,f,s
ORG,0.785,0.728,0.755,1700
MISC,0.556,0.458,0.502,445
LOC,0.638,0.79,0.706,985
PER,0.873,0.775,0.821,1222
total,0.745,0.728,0.736,4352


In [8]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > -1:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent):
        word1 = sent[i][0]
        postag1 = sent[i][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [9]:
# let's get only word and iob-tag
trn_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
trn_label = [sent2labels(s) for s in trn_sents]
trn_feats = [sent2spacy_features_with_suffix(s) for s in trn_sents]

tst_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]
tst_feats = [sent2spacy_features_with_suffix(s) for s in tst_sents]


X_train = [sent2features(s) for s in trn_sents]
y_train = [sent2labels(s) for s in trn_sents]

X_test = [sent2features(s) for s in tst_sents]
y_test = [sent2labels(s) for s in tst_sents]


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


pred = crf.predict(X_test)

hyp = [[(tst_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred)]

print("\033[1mfeature window [-1, +1]:\033[0m")
results = evaluate(tst_sents, hyp)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

[1mfeature window [-1, +1]:[0m


Unnamed: 0,p,r,f,s
ORG,0.791,0.723,0.756,1700
MISC,0.591,0.512,0.549,445
LOC,0.663,0.804,0.727,985
PER,0.873,0.81,0.84,1222
total,0.759,0.744,0.752,4352


In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > -2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)+1:
        word1 = sent[i][0]
        postag1 = sent[i][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [11]:
trn_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
trn_label = [sent2labels(s) for s in trn_sents]
trn_feats = [sent2spacy_features_with_suffix(s) for s in trn_sents]

tst_sents = [[(text, pos, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]
tst_feats = [sent2spacy_features_with_suffix(s) for s in tst_sents]


X_train = [sent2features(s) for s in trn_sents]
y_train = [sent2labels(s) for s in trn_sents]

X_test = [sent2features(s) for s in tst_sents]
y_test = [sent2labels(s) for s in tst_sents]


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


pred = crf.predict(X_test)

hyp = [[(tst_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred)]

print("\033[1mfeature window [-2, +2]:\033[0m")
results = evaluate(tst_sents, hyp)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

[1mfeature window [-2, +2]:[0m


Unnamed: 0,p,r,f,s
ORG,0.791,0.723,0.756,1700
MISC,0.591,0.512,0.549,445
LOC,0.663,0.804,0.727,985
PER,0.873,0.81,0.84,1222
total,0.759,0.744,0.752,4352
