In [2]:
import nltk
import sklearn_crfsuite
import eli5

## Load data & feature extraction

In [3]:
from trainCRF_NLTK import split_train_test_set
%timeit
(X_train, y_train), (X_test, y_test) = split_train_test_set()

'load_data' ((), {}) 3.03 sec


## Train a CRF model

In [6]:
# Baseline model: L-BFGS training with light regularization (c1 is the L1
# coefficient, c2 the second regularization coefficient) and a short
# 20-iteration budget.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 20,
    'all_possible_transitions': False,
}
crf = sklearn_crfsuite.CRF(**crf_params)

# The trailing semicolon keeps the cell from echoing the return value of fit().
crf.fit(X_train, y_train);

### Inspect model weights!

In [8]:
# Display the transition table and the 30 strongest features per label.
# (The function is `show_weights`; `show_sweights` does not exist in eli5.)
eli5.show_weights(crf, top=30)

From \ To,I,O
I,0.0,-0.288
O,0.094,0.345

Weight?,Feature
Weight?,Feature
+1.477,-1:word.lower=say
+1.386,-1:word.lower=for
+1.366,-1:word.lower=word
+1.129,-1:word.lower=saying
+0.952,-1:postag=VBG
+0.800,-1:postag=VB
+0.683,-1:word.lower=spelling
+0.662,-1:postag=``
+0.662,-1:word.lower=``
+0.617,-1:postag=IN

Weight?,Feature
+1.477,-1:word.lower=say
+1.386,-1:word.lower=for
+1.366,-1:word.lower=word
+1.129,-1:word.lower=saying
+0.952,-1:postag=VBG
+0.800,-1:postag=VB
+0.683,-1:word.lower=spelling
+0.662,-1:postag=``
+0.662,-1:word.lower=``
+0.617,-1:postag=IN

Weight?,Feature
+1.047,BOS
+0.956,postag=.
+0.710,+1:postag=NNP
+0.680,-1:postag=JJ
+0.663,-1:word.lower=in
+0.655,-1:postag=CC
+0.647,-1:postag=NNP
+0.619,word.lower=a
+0.580,postag=PRP
+0.573,postag=''


If we regularize the CRF more, we can expect that only generic features will remain and memoized tokens will go. With L1 regularization (the `c1` parameter), the coefficients of most features should be driven to zero. Let’s check what effect regularization has on the CRF weights:

In [9]:
# Same setup as the baseline, but with a very large L1 coefficient (c1=200)
# so that most feature weights are pushed to zero.
crf_params = dict(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf = sklearn_crfsuite.CRF(**crf_params)
crf.fit(X_train, y_train)

# Last expression of the cell, so the weights table is rendered as output.
eli5.show_weights(crf, top=30)

From \ To,I,O
I,0.0,-0.011
O,0.0,0.962

Weight?,Feature
Weight?,Feature
+0.329,-1:word.lower=``
+0.329,-1:postag=``
+0.322,postag=NN
-0.106,word.istitle=False
-0.169,+1:word.isupper=False
-0.177,+1:word.isdigit=False
-0.200,word.isupper=False
-0.221,word.isdigit=False
-0.243,bias
+0.243,bias

Weight?,Feature
0.329,-1:word.lower=``
0.329,-1:postag=``
0.322,postag=NN
-0.106,word.istitle=False
-0.169,+1:word.isupper=False
-0.177,+1:word.isdigit=False
-0.2,word.isupper=False
-0.221,word.isdigit=False
-0.243,bias

Weight?,Feature
0.243,bias
0.221,word.isdigit=False
0.2,word.isupper=False
0.177,+1:word.isdigit=False
0.169,+1:word.isupper=False
0.106,word.istitle=False
-0.322,postag=NN
-0.329,-1:postag=``
-0.329,-1:word.lower=``


Memoized tokens are mostly gone, and the model now relies on word shapes and POS tags. There are only a few non-zero features remaining. In our example the change probably made the quality worse, but that’s a separate question.

In [11]:
# Back to the baseline regularization, this time with
# all_possible_transitions switched on (the only change from the first model).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=20,
    all_possible_transitions=True,  # added
)
crf.fit(X_train, y_train);

# Show only the transition-feature table, limited to the top 5 entries.
eli5.show_weights(
    crf,
    top=5,
    show=['transition_features'],
)

From \ To,I,O
I,-0.412,-0.274
O,0.0,0.364


## Customize

In [13]:
# Restrict the explanation to the 'I' label and its ten strongest features.
eli5.show_weights(
    crf,
    top=10,
    targets=['I'],
)

From \ To,I
I,-0.412

Weight?,Feature
+1.629,-1:word.lower=for
+1.566,-1:word.lower=word
+1.408,-1:word.lower=say
+1.191,-1:word.lower=saying
+0.876,-1:postag=VBG
+0.867,-1:word.lower=of
+0.703,-1:postag=``
… 1189 more positive …,… 1189 more positive …
… 250 more negative …,… 250 more negative …
-0.715,+1:postag=NNP


Check how word shape features are used by the model using the `feature_re` argument, and hide the transition table.

In [14]:
# Keep only features whose names start with "word.is" (the word-shape flags).
# The pattern is a raw string so `\.` reaches the regex engine as an escaped
# dot instead of being an invalid string escape sequence.
# show=['targets'] drops the transition table from the output.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
-0.122,word.isupper=True
-0.16,word.istitle=True
-0.217,word.istitle=False
-0.325,word.isdigit=False
-0.33,word.isupper=False

Weight?,Feature
0.336,word.isdigit=True
0.33,word.isupper=False
0.325,word.isdigit=False
0.217,word.istitle=False
0.16,word.istitle=True
0.122,word.isupper=True


###  Formatting in console

In [17]:
# Build the explanation object explicitly (top 5 features for labels 'O' and
# 'I', in that order), then render it as plain text for console display.
expl = eli5.explain_weights(
    crf,
    top=5,
    targets=['O', 'I'],
)
print(
    eli5.format_as_text(expl)
)

Explained as: CRF

Transition features:
         O       I
--  ------  ------
O    0.364   0.000
I   -0.274  -0.412

y='O' top features
Weight  Feature             
------  --------------------
   … 2544 more positive …   
   … 791 more negative …    
-0.876  -1:postag=VBG       
-1.191  -1:word.lower=saying
-1.408  -1:word.lower=say   
-1.566  -1:word.lower=word  
-1.629  -1:word.lower=for   

y='I' top features
   … 1191 more positive …   
   … 253 more negative …    
Weight  Feature             
------  --------------------
+1.629  -1:word.lower=for   
+1.566  -1:word.lower=word  
+1.408  -1:word.lower=say   
+1.191  -1:word.lower=saying
+0.876  -1:postag=VBG       

