# Conditional Random Field Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, readstring, get_sentence, is_number, extract_words,get_label

#Model
from sklearn_crfsuite import CRF

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

In [2]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [3]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])   

## Tokenization

In [4]:
words=list(words)
n_words = len(set(words))
n_tags = len(set(tags))

print("Number of in the dataset: ", len(words))
print("Number of unique words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of in the dataset:  250056
Number of unique words in the dataset:  27316
Number of tags in the dataset:  9


In [5]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 218128


In [6]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 250026


In [7]:
def tokenize(word_list):
    new_list= []
    for word in word_list:
        if is_number(word2idx[word]):
            new_list.append(word2idx[word])
    else:
        None
    return(new_list)

In [8]:
#training set
train_words, train_tags = extract_words(train)

#tokenize words into tokens
tr_words = tokenize(train_words)

## Feature Engineering

In [9]:
def word2features(sentence,i):
    word = sentence[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sentence[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
        
    if i < len(sentence)-1:
        word1 = sentence[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_all_sentences(dataset):
    sentences=[]
    for i in range(len(dataset)):
        sentences.append(get_sentence(dataset,i+1))
    return sentences

def get_all_labels(dataset):
    labels=[]
    for i in range(len(dataset)):
        labels.append(get_label(dataset,i+1))
    return labels

In [10]:
#Get all the sentences in the form of a list
sentences = get_all_sentences(train)
labels = get_all_labels(train)

#get the first sentence
sent = get_sentence(train,1)
label = get_label(train,1)

#print out first sentence
print(sent)
print(label)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [11]:
X = [sent2features(s) for s in sentences]
y = labels

In [12]:
get_sentence(train,1)
get_label(train,1)

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
#Remove the 'O' label
setlabel = list(set(tags))
setlabel.remove('O')
setlabel


## Condtional Random Field Model

### Model 1
- Using Stochastic Gradient descent 

In [13]:
crf1 = CRF(algorithm='l2sgd',
          max_iterations=100,
          all_possible_transitions=False)

In [14]:
%%time
#training model
crf1.fit(X=X, y=y)

Wall time: 16.9 s


CRF(algorithm='l2sgd', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [26]:
#generate predictions
pred = crf1.predict(X)
#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)
score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

             precision    recall  f1-score   support

      I-ORG       0.48      0.56      0.52      3704
     B-MISC       0.46      0.42      0.44      3438
      B-PER       0.66      0.66      0.66      6600
      I-LOC       0.54      0.48      0.51      1157
      B-ORG       0.61      0.48      0.54      6321
      B-LOC       0.61      0.61      0.61      7140
     I-MISC       0.35      0.40      0.37      1155
      I-PER       0.74      0.84      0.79      4528

avg / total       0.60      0.59      0.59     34043

0.590959066309224


### Model 2:
- Algorithm :Stochastic Gradient descent
- L2 regularization (c2=0.1)

In [27]:
%%time
crf2 = CRF(algorithm='l2sgd',
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf2.fit(X=X, y=y)
         
#generate predictions
pred = crf2.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

             precision    recall  f1-score   support

      I-ORG       0.61      0.41      0.49      3704
     B-MISC       0.45      0.43      0.44      3438
      B-PER       0.56      0.72      0.63      6600
      I-LOC       0.68      0.34      0.45      1157
      B-ORG       0.59      0.45      0.51      6321
      B-LOC       0.65      0.54      0.59      7140
     I-MISC       0.54      0.24      0.33      1155
      I-PER       0.59      0.93      0.73      4528

avg / total       0.59      0.57      0.56     34043

0.5611813191452248
Wall time: 17.6 s


## Model 3

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0)
- L2 regularization (c2=0)

In [28]:
%%time
crf3 = CRF(algorithm='lbfgs',
          #c1=0.1,
          #c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf3.fit(X=X, y=y)

#generate predictions
pred = crf3.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

             precision    recall  f1-score   support

      I-ORG       0.47      0.59      0.52      3704
     B-MISC       0.51      0.35      0.41      3438
      B-PER       0.67      0.65      0.66      6600
      I-LOC       0.58      0.46      0.51      1157
      B-ORG       0.60      0.48      0.54      6321
      B-LOC       0.60      0.62      0.61      7140
     I-MISC       0.42      0.30      0.35      1155
      I-PER       0.74      0.85      0.79      4528

avg / total       0.60      0.58      0.59     34043

0.5885676690347503
Wall time: 14.8 s


## Model 4

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0.1)
- L2 regularization (c2=0.1)

In [29]:
%%time
crf4 = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#training model
crf4.fit(X=X, y=y)

#generate predictions
pred = crf4.predict(X)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y,labels=setlabel)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print(score)

             precision    recall  f1-score   support

      I-ORG       0.48      0.56      0.52      3704
     B-MISC       0.50      0.36      0.42      3438
      B-PER       0.67      0.65      0.66      6600
      I-LOC       0.57      0.44      0.50      1157
      B-ORG       0.61      0.48      0.54      6321
      B-LOC       0.60      0.62      0.61      7140
     I-MISC       0.41      0.30      0.35      1155
      I-PER       0.73      0.85      0.79      4528

avg / total       0.60      0.58      0.59     34043

0.5874768725299797
Wall time: 16 s


In [35]:
#Overal Model performance
score1 =flat_f1_score(y, pred,average='weighted')
print("Overalll F1 Score:",score1)
#weightedt Model performance
score2=flat_f1_score(y_pred=pred, y_true=y,average='weighted',labels=setlabel)
print("Weighted F1 Score:",score2)

Overalll F1 Score: 0.9105776068585635
Weighted F1 Score: 0.5874768725299797
