In [3]:
import nltk
import sklearn_crfsuite
import eli5

## Load data & extract features

In [4]:
from trainCRF_NLTK import split_train_test_set
%timeit
(X_train, y_train), (X_test, y_test) = split_train_test_set()

## Train a CRF model

In [5]:
# Baseline CRF: L-BFGS optimizer capped at 20 iterations, with light
# regularization (c1 is the L1 penalty, c2 the L2 penalty).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', max_iterations=20,
    c1=0.1, c2=0.1,
    all_possible_transitions=False,
)
# Trailing semicolon keeps the notebook from echoing the value returned by fit().
crf.fit(X_train, y_train);

### Inspect model weights!

In [7]:
# Render the fitted CRF's transition table and per-label feature weights;
# `top=30` is handed to eli5 to limit how many features are listed.
eli5.show_weights(crf, top=30)

From \ To,I,O
I,0.0,-0.236
O,-0.04,0.358

Weight?,Feature
Weight?,Feature
+1.471,-1:word.lower=for
+1.452,-1:word.lower=word
+1.087,-1:word.lower=saying
+0.880,-1:word.lower=say
+0.879,-1:postag=VBG
+0.658,-1:postag=``
+0.658,-1:word.lower=``
+0.575,-1:word.lower=of
+0.543,+1:word.lower=or
+0.514,postag=JJ

Weight?,Feature
+1.471,-1:word.lower=for
+1.452,-1:word.lower=word
+1.087,-1:word.lower=saying
+0.880,-1:word.lower=say
+0.879,-1:postag=VBG
+0.658,-1:postag=``
+0.658,-1:word.lower=``
+0.575,-1:word.lower=of
+0.543,+1:word.lower=or
+0.514,postag=JJ

Weight?,Feature
+2.171,BOS
+1.817,postag=.
+1.319,word[-3:]=.
+1.319,word.lower=.
+1.319,word[-2:]=.
+0.836,+1:postag=NNP
+0.828,word.lower=a
+0.627,-1:postag=CC
+0.597,-1:postag=JJ
+0.593,-1:postag=NNS


If we regularize the CRF more, we can expect that **only generic features will remain** and **memorized tokens will go**. With L1 regularization (the `c1` parameter), the coefficients of most features should be driven to zero. Let’s check what effect regularization has on the CRF weights:

In [8]:
# Refit with a much stronger L1 penalty (c1=200) to see which feature
# weights survive; all other settings match a lightly regularized run.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', max_iterations=20,
    c1=200, c2=0.1,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)

# Last expression of the cell, so the weight tables are displayed.
eli5.show_weights(crf, top=30)

From \ To,I,O
I,0.0,0.0
O,0.0,1.125

Weight?,Feature
Weight?,Feature
+0.320,-1:word.lower=``
+0.320,-1:postag=``
+0.289,postag=NN
-0.084,+1:word.isupper=False
-0.112,word.istitle=False
-0.126,+1:word.isdigit=False
-0.165,word.isupper=False
-0.228,word.isdigit=False
-0.253,bias
+0.253,bias

Weight?,Feature
0.32,-1:word.lower=``
0.32,-1:postag=``
0.289,postag=NN
-0.084,+1:word.isupper=False
-0.112,word.istitle=False
-0.126,+1:word.isdigit=False
-0.165,word.isupper=False
-0.228,word.isdigit=False
-0.253,bias

Weight?,Feature
0.253,bias
0.228,word.isdigit=False
0.165,word.isupper=False
0.126,+1:word.isdigit=False
0.112,word.istitle=False
0.084,+1:word.isupper=False
-0.289,postag=NN
-0.32,-1:postag=``
-0.32,-1:word.lower=``


Memorized tokens are mostly gone and the model now relies on word shapes and POS tags. There are only a few non-zero features remaining. In our example the change probably made the quality worse, but that’s a separate question.

In [9]:
# Back to light regularization, this time with all_possible_transitions
# switched on (the one setting added relative to the earlier cells).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', max_iterations=20,
    c1=0.1, c2=0.1,
    all_possible_transitions=True,  # added
)
# Trailing semicolon keeps the notebook from echoing the value returned by fit().
crf.fit(X_train, y_train);

# Display only the transition-feature table.
eli5.show_weights(crf, show=['transition_features'], top=5)

From \ To,I,O
I,-0.623,-0.201
O,-0.022,0.263


## Customize

In [18]:
# Restrict the explanation to the 'I' label; `top=10` limits the length of
# the feature listing.
eli5.show_weights(crf, top=10, targets=['I'])

From \ To,I
I,-0.623

Weight?,Feature
+1.698,-1:word.lower=say
+1.515,-1:word.lower=for
+1.472,-1:word.lower=word
+1.173,-1:word.lower=saying
+1.018,-1:word.lower=spelling
+0.948,-1:word.lower=of
+0.933,-1:postag=VBG
+0.725,-1:postag=``
… 1174 more positive …,… 1174 more positive …
… 256 more negative …,… 256 more negative …


Check how word-shape features are used by the model using the `feature_re` argument, and hide the transition table.

In [11]:
# Keep only features whose names start with "word.is" (the word-shape flags).
# The pattern is a raw string: in a normal literal `\.` is an invalid escape
# sequence, which warns on current Python versions. The regex is unchanged.
# show=['targets'] leaves the transition table out of the output.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
-0.21,word.istitle=True
-0.274,word.istitle=False
-0.296,word.isupper=False
-0.318,word.isdigit=False
-0.429,word.isupper=True

Weight?,Feature
0.663,word.isdigit=True
0.429,word.isupper=True
0.318,word.isdigit=False
0.296,word.isupper=False
0.274,word.istitle=False
0.21,word.istitle=True


###  Formatting in console

In [13]:
# Build the explanation object for labels 'O' and 'I' (in that order),
# then render it as plain text instead of the HTML view.
expl = eli5.explain_weights(
    crf,
    targets=['O', 'I'],
    top=5,
)
print(eli5.format_as_text(expl))

Explained as: CRF

Transition features:
         O       I
--  ------  ------
O    0.263  -0.022
I   -0.201  -0.623

y='O' top features
Weight  Feature          
------  -----------------
+2.803  BOS              
+2.376  postag=.         
+1.539  word.lower=a     
 … 2574 more positive …  
  … 791 more negative …  
-1.515  -1:word.lower=for
-1.698  -1:word.lower=say

y='I' top features
    … 1177 more positive …    
    … 258 more negative …     
Weight  Feature               
------  ----------------------
+1.698  -1:word.lower=say     
+1.515  -1:word.lower=for     
+1.472  -1:word.lower=word    
+1.173  -1:word.lower=saying  
+1.018  -1:word.lower=spelling

