In [76]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/florianbreton/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/florianbreton/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [77]:
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')


In [78]:
tagged_sentence[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [79]:
print("Number of Tagged Sentences ",len(tagged_sentence))
tagged_words=[tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


In [80]:
train_set, test_set = train_test_split(tagged_sentence,test_size=0.2,random_state=1234)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [82]:

def features(sentence,index):
    return {
        'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper()==sentence[index]),
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        'prefix_1':sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3':sentence[index][:3],
        'prefix_4':sentence[index][:4],
        'suffix_1':sentence[index][-1],
        'suffix_2':sentence[index][-2:],
        'suffix_3':sentence[index][-3:],
        'suffix_4':sentence[index][-4:],
        'contains-': 1 if '-' in sentence[index] else 0 
    }


In [83]:
def untag(sentence):
    return [word for word,tag in sentence]


def prepareData(tagged_sentences):
    X,y=[],[]
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index) for index in range(len(sentences))])
        y.append([tag for word,tag in sentences])
    return X,y

In [84]:
X_train,y_train = prepareData(train_set)
X_test,y_test = prepareData(test_set)

In [85]:
train_set[6]

[('The', 'DET'),
 ('competitive', 'ADJ'),
 ('rates', 'NOUN'),
 ('were', 'VERB'),
 ('generally', 'ADV'),
 ('offset', 'VERB'),
 ('*-1', 'X'),
 ('by', 'ADP'),
 ('hefty', 'ADJ'),
 ('fees', 'NOUN'),
 ('on', 'ADP'),
 ('various', 'ADJ'),
 ('services', 'NOUN'),
 ('.', '.')]

In [86]:
y_train[6]

['DET',
 'ADJ',
 'NOUN',
 'VERB',
 'ADV',
 'VERB',
 'X',
 'ADP',
 'ADJ',
 'NOUN',
 'ADP',
 'ADJ',
 'NOUN',
 '.']

In [87]:
X_train[6]

[{'is_first_capital': 1,
  'is_first_word': 1,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': '',
  'next_word': 'competitive',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'T',
  'prefix_2': 'Th',
  'prefix_3': 'The',
  'prefix_4': 'The',
  'suffix_1': 'e',
  'suffix_2': 'he',
  'suffix_3': 'The',
  'suffix_4': 'The',
  'contains-': 0},
 {'is_first_capital': 0,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'The',
  'next_word': 'rates',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'c',
  'prefix_2': 'co',
  'prefix_3': 'com',
  'prefix_4': 'comp',
  'suffix_1': 'e',
  'suffix_2': 've',
  'suffix_3': 'ive',
  'suffix_4': 'tive',
  'contains-': 0},
 {'is_first_capital': 0,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'competitive',
  'next_word': 'were',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'r',
  'prefix_2': 'ra',
  'prefix_3': 'rat',
  'prefix_4

In [70]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.01, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [71]:
y_pred=crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_)


0.9738471726864286

In [109]:
from itertools import chain

from sklearn.model_selection import learning_curve,GridSearchCV



import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve,GridSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
#from sklearn.grid_search import RandomizedSearchCV

import scipy
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)


# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=crf.classes_)

# search
rs = GridSearchCV(
    crf, {'c1': [1e-5, 0.01, 0.1, 0.5, 0.8, 1],'c1': [1e-5, 0.01, 0.1, 0.5, 0.8, 1]}, verbose=3)
rs.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] c1=1e-05 ........................................................
[CV] ............................ c1=1e-05, score=0.967, total=   5.3s
[CV] c1=1e-05 ........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s


[CV] ............................ c1=1e-05, score=0.966, total=   5.3s
[CV] c1=1e-05 ........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.6s remaining:    0.0s


[CV] ............................ c1=1e-05, score=0.967, total=   5.5s
[CV] c1=0.01 .........................................................
[CV] ............................. c1=0.01, score=0.967, total=   5.4s
[CV] c1=0.01 .........................................................
[CV] ............................. c1=0.01, score=0.966, total=   5.5s
[CV] c1=0.01 .........................................................
[CV] ............................. c1=0.01, score=0.967, total=   5.6s
[CV] c1=0.1 ..........................................................
[CV] .............................. c1=0.1, score=0.966, total=   5.3s
[CV] c1=0.1 ..........................................................
[CV] .............................. c1=0.1, score=0.965, total=   5.3s
[CV] c1=0.1 ..........................................................
[CV] .............................. c1=0.1, score=0.966, total=   5.5s
[CV] c1=0.5 ..........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.6min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                           all_possible_transitions=True, averaging=None,
                           c=None, c1=None, c2=None,
                           calibration_candidates=None, calibration_eta=None,
                           calibration_max_trials=None, calibration_rate=None,
                           calibration_samples=None, delta=None, epsilon=None,
                           error_sensitive=None, g...,
                           keep_tempfiles=None, linesearch=None,
                           max_iterations=100, max_linesearch=None,
                           min_freq=None, model_filename=None,
                           num_memories=None, pa_type=None, period=None,
                           trainer_cls=None, variance=None, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'c1': [1e-05, 0.01, 0.1, 0.5, 0.8, 1]},
     

In [107]:
grid = GridSearchCV(
    crf, {'c1': [1e-5, 0.01, 0.1, 0.5, 0.8, 1],'c1': [1e-5, 0.01, 0.1, 0.5, 0.8, 1]}, verbose=3)


<scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa0007fee48>


In [110]:
crf.get_params


<bound method BaseEstimator.get_params of CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)>