# Conditional Random Field Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, get_sentence, is_number, extract_words,get_label,partial_tags

#Model
from sklearn_crfsuite import CRF

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

In [2]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [3]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])   

## Data

In [4]:
tags = partial_tags(tags)
words=list(words)
n_words = len(set(words))
n_tags = len(set(tags))

setlabel=list(set(tags))
setlabel.remove('O')

print("Number of in the dataset: ", len(words))
print("Number of unique words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of in the dataset:  250056
Number of unique words in the dataset:  27316
Number of tags in the dataset:  5


## Feature Engineering

In [5]:
def word2features(sentence,i):
    word = sentence[i]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.islower()': word.islower(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha()
    }
    if i > 0:
        word1 = sentence[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.islower()': word1.lower(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isalpha()': word1.isalpha()
        })
    else:
        features['BOS'] = True
        
    if i < len(sentence)-1:
        word1 = sentence[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.islower()': word1.lower(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isalpha()': word1.isalpha()
        })
    else:
        features['EOS'] = True
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_all_sentences(dataset):
    sentences=[]
    for i in range(len(dataset)):
        sentences.append(get_sentence(dataset,i+1))
    return sentences

def get_all_labels(dataset):
    labels=[]
    for i in range(len(dataset)):
        labels.append(partial_tags(get_label(dataset,i+1)))
    return labels

In [17]:
#Get all the sentences in the form of a list
sentences = get_all_sentences(train)
labels = get_all_labels(train)

#get the first sentence
sent = get_sentence(train,1)
label = partial_tags(get_label(train,1))

#print out first sentence
print(sent)
print(label)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']


In [29]:
#Applt feature engineering
X = [sent2features(s) for s in sentences]
y = labels


## Condtional Random Field Model

### Model 1
- Using Stochastic Gradient descent 

In [9]:
crf1 = CRF(algorithm='l2sgd',
          max_iterations=100,
          all_possible_transitions=False)

In [10]:
%%time
#training model
crf1.fit(X=X, y=y)

CPU times: user 44.2 s, sys: 1.78 s, total: 46 s
Wall time: 48 s


CRF(algorithm='l2sgd', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [11]:
#generate predictions
pred = crf1.predict(X)
#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)
score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

              precision    recall  f1-score   support

         ORG       0.97      0.94      0.95     10025
         LOC       0.96      0.96      0.96      8297
        MISC       0.96      0.93      0.95      4593
         PER       0.97      0.98      0.97     11128

   micro avg       0.96      0.96      0.96     34043
   macro avg       0.96      0.95      0.96     34043
weighted avg       0.96      0.96      0.96     34043

0.9600097604051628


### Model 2:
- Algorithm :Stochastic Gradient descent
- L2 regularization (c2=0.1)

In [12]:
%%time
crf2 = CRF(algorithm='l2sgd',
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf2.fit(X=X, y=y)
         
#generate predictions
pred = crf2.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

              precision    recall  f1-score   support

         ORG       1.00      0.99      0.99     10025
         LOC       1.00      0.99      0.99      8297
        MISC       1.00      1.00      1.00      4593
         PER       1.00      1.00      1.00     11128

   micro avg       1.00      1.00      1.00     34043
   macro avg       1.00      1.00      1.00     34043
weighted avg       1.00      1.00      1.00     34043

0.9960750447217874
CPU times: user 51.2 s, sys: 2.01 s, total: 53.2 s
Wall time: 54.3 s


## Model 3

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0)
- L2 regularization (c2=0)

In [13]:
%%time
crf3 = CRF(algorithm='lbfgs',
          #c1=0.1,
          #c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf3.fit(X=X, y=y)

#generate predictions
pred = crf3.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

              precision    recall  f1-score   support

         ORG       0.93      0.91      0.92     10025
         LOC       0.94      0.94      0.94      8297
        MISC       0.94      0.89      0.91      4593
         PER       0.95      0.97      0.96     11128

   micro avg       0.94      0.93      0.94     34043
   macro avg       0.94      0.93      0.93     34043
weighted avg       0.94      0.93      0.94     34043

0.9373407048558303
CPU times: user 46.2 s, sys: 1.61 s, total: 47.8 s
Wall time: 47.6 s


## Model 4

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0.1)
- L2 regularization (c2=0.1)

In [14]:
%%time
crf4 = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf4.fit(X=X, y=y)

#generate predictions
pred = crf4.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

              precision    recall  f1-score   support

         ORG       1.00      0.99      0.99     10025
         LOC       0.99      0.99      0.99      8297
        MISC       1.00      0.99      1.00      4593
         PER       1.00      1.00      1.00     11128

   micro avg       1.00      0.99      1.00     34043
   macro avg       1.00      0.99      1.00     34043
weighted avg       1.00      0.99      1.00     34043

0.9956931694778802
CPU times: user 42.8 s, sys: 512 ms, total: 43.3 s
Wall time: 42 s


# Predict on Test dataset

In [15]:
#Get all the sentences in the form of a list
sentences = get_all_sentences(test)
labels = get_all_labels(test)

X = [sent2features(s) for s in sentences]
y = labels

In [16]:
#prediction with best performaning model
pred = crf4.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

              precision    recall  f1-score   support

         ORG       0.79      0.77      0.78      2496
         LOC       0.86      0.82      0.83      1925
        MISC       0.77      0.77      0.77       918
         PER       0.86      0.89      0.88      2773

   micro avg       0.83      0.82      0.83      8112
   macro avg       0.82      0.81      0.82      8112
weighted avg       0.83      0.82      0.82      8112

0.8246638067655477
