<a href="https://colab.research.google.com/github/deepak1195/NaturalLanguageProcessing/blob/main/017_ConditionalRandomField.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sklearn-crfsuite



In [2]:
import warnings

import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [3]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [4]:
# Load the NLTK 'treebank' corpus as tagged sentences: each sentence is a
# sequence of (word, tag) pairs, as the preview displayed below shows.
corpus=nltk.corpus.treebank.tagged_sents()
corpus

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [5]:
def wFeatures(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        The sentence, as a sequence of items whose element ``[0]`` is the
        word (e.g. ``(word, tag)`` pairs). Only element ``[0]`` is read.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Feature name -> value. Contains the word itself, position flags,
        casing flags, prefixes/suffixes of up to three characters, the
        neighbouring words ('' at a sentence boundary), and hyphen/digit
        flags.

    Notes
    -----
    The casing flags compare the text with its upper/lower-cased form, so
    they are also True for tokens without letters (punctuation, digits).
    An empty word yields empty-string prefixes/suffixes instead of raising
    ``IndexError``.
    """
    word = sent[i][0]
    last_index = len(sent) - 1
    features = {
        'word': word,
        'is_first': i == 0,  # the word opens the sentence
        'is_last': i == last_index,  # the word closes the sentence
        # Slices (word[:1], word[-1:]) rather than word[0] / word[-1]: same
        # value for any non-empty word, but '' instead of an IndexError
        # when the token is empty.
        'is_capitalized': word[:1].upper() == word[:1],
        'is_all_caps': word.upper() == word,  # unchanged by upper-casing
        'is_all_lower': word.lower() == word,  # unchanged by lower-casing
        # prefixes of the word
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # suffixes of the word
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # previous word ('' for the first token)
        'prev_word': '' if i == 0 else sent[i - 1][0],
        # next word ('' for the last token)
        'next_word': '' if i == last_index else sent[i + 1][0],
        'has_hyphen': '-' in word,  # the word contains a hyphen
        'is_numeric': word.isdigit(),  # the word consists only of digits
        # an upper-case character appears after the first position
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

In [6]:
# Build two parallel, per-sentence lists in a single pass over the corpus:
# X[k] holds one feature dict per token of sentence k, and Y[k] holds the
# tag of each of those tokens in the same order.
X, Y = [], []
for sent in corpus:
    X.append([wFeatures(sent, i) for i in range(len(sent))])
    Y.append([token[1] for token in sent])

In [7]:
X[0]

[{'word': 'Pierre',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': False,
  'prefix-1': 'P',
  'prefix-2': 'Pi',
  'prefix-3': 'Pie',
  'suffix-1': 'e',
  'suffix-2': 're',
  'suffix-3': 'rre',
  'prev_word': '',
  'next_word': 'Vinken',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': False},
 {'word': 'Vinken',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': False,
  'prefix-1': 'V',
  'prefix-2': 'Vi',
  'prefix-3': 'Vin',
  'suffix-1': 'n',
  'suffix-2': 'en',
  'suffix-3': 'ken',
  'prev_word': 'Pierre',
  'next_word': ',',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': False},
 {'word': ',',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': True,
  'is_all_lower': True,
  'prefix-1': ',',
  'prefix-2': ',',
  'prefix-3': ',',
  'suffix-1': ',',
  'suffix-2': ',',
  'suffix-3': ',',
  'prev

In [8]:
Y[0]

['NNP',
 'NNP',
 ',',
 'CD',
 'NNS',
 'JJ',
 ',',
 'MD',
 'VB',
 'DT',
 'NN',
 'IN',
 'DT',
 'JJ',
 'NN',
 'NNP',
 'CD',
 '.']

In [9]:
# Split the data into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
y_train = Y[:split]
X_test = X[split:]
y_test = Y[split:]

In [10]:
# Configure the CRF tagger: L-BFGS training with c1 / c2 regularisation
# coefficients, capped at 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError as exc:
    # NOTE(review): this notebook deliberately tolerates an AttributeError
    # from fit() -- presumably a library version incompatibility that is
    # raised although the model is usable afterwards; confirm against the
    # installed sklearn-crfsuite / scikit-learn versions.
    # The error is surfaced as a warning instead of being dropped silently,
    # so an unexpected failure is visible before predict() is called.
    warnings.warn(
        f"crf.fit raised AttributeError and it was ignored: {exc}",
        RuntimeWarning,
    )

In [11]:
# Predict a tag sequence for every held-out sentence, then compare the
# predictions with the reference tags in y_test; the score is displayed
# as the cell output.
y_pred = crf.predict(X_test)
metrics.flat_accuracy_score(y_test, y_pred)

0.9632716203403363