## Conditional Random Field Model (Window 2)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, get_sentence, is_number, extract_words,get_label,partial_tags

#Model
from sklearn_crfsuite import CRF

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

In [2]:
#import data from my github repo
test = readfile("test.txt")
train =readfile("train.txt")

# filenames = ['train.txt', 'valid.txt']
# with open('Combined.txt', 'w') as outfile:
#     for fname in filenames:
#         with open(fname) as infile:
#             for line in infile:
#                 outfile.write(line)
                
# train =readfile("Combined.txt")

## Orthographic Feature Engineering

In [3]:
#Special Features:
def count_vowel(word):
    '''
    Function returns the number of vowels in token
    '''
    return sum(list(map(word.lower().count, "aeiou")))

def dash(word):
    '''
    The Function returns whether or not the token contains a dash
    '''
    return 1 if "-" in word else 0

def count_consonants(word):
    '''
    The Function returns the number of consonants in a token
    '''
    vowels="aeiou"
    return sum(i not in vowels for i in word)

In [4]:
def word2features(sentence,i):
    word = sentence[i]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'Prefix_2': word[:2],
        'Prefix_3': word[:3], 
        'word.isupper()': word.isupper(),
        'word.islower()': word.islower(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha(),
        'word.isalnum()': word.isalnum(), 
        "vowels": count_vowel(word),
        "dash": dash(word),
        "consonants": count_consonants(word),
        'Suffix_2': word[-2:],
        'Suffix_3': word[-3:],
        
    }
    if  i == 1 :
        '''
        Get the token before in the sentence
        '''
        word1 = sentence[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word Prefix_2': word1[:2],
            '-1:word Prefix_3': word1[:3], 
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.islower()': word1.lower(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isalpha()': word1.isalpha(),
            '-1:word.isalnum()': word1.isalnum(), 
            "-1:word.vowels": count_vowel(word1),
            "-1:word.dash": dash(word1),
            "-1:word.consonants": count_consonants(word1),
            '-1:word.Suffix_2': word1[-2:],
            '-1:word.Suffix_3': word1[-3:],
        })
        
    elif i>1 :
        word1 = sentence[i-1]
        word2 = sentence[i-2]
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word Prefix_2': word1[:2],
            '-1:word Prefix_3': word1[:3], 
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.islower()': word1.lower(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isalpha()': word1.isalpha(),
            '-1:word.isalnum()': word1.isalnum(), 
            "-1:word.vowels": count_vowel(word1),
            "-1:word.dash": dash(word1),
            "-1:word.consonants": count_consonants(word1),
            '-1:word.Suffix_2': word1[-2:],
            '-1:word.Suffix_3': word1[-3:],
            
            '-2:word.lower()': word2.lower(),
            '-2:word Prefix_2': word2[:2],
            '-2:word Prefix_3': word2[:3], 
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.islower()': word2.lower(),
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.isalpha()': word2.isalpha(),
            '-2:word.isalnum()': word2.isalnum(), 
            "-2:word.vowels": count_vowel(word2),
            "-2:word.dash": dash(word2),
            "-2:word.consonants": count_consonants(word2),
            '-2:word.Suffix_2': word2[-2:],
            '-2:word.Suffix_3': word2[-3:],
        })
        
    else:
        features['Start'] = True
        
    if i == len(sentence)-2:
        '''
        Get the token after in the sentence
        '''
        word1 = sentence[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.Prefix_2': word1[:2],
            '+1:word.Prefix_3': word1[:3], 
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.islower()': word1.lower(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isalpha()': word1.isalpha(),
            '+1:word.isalnum()': word1.isalnum(), 
            "+1:word.vowels": count_vowel(word1),
            "+1:word.dash": dash(word1),
            "+1:word.consonants": count_consonants(word1),
            '+1:word.Suffix_2': word1[-2:],
            '+1:word.Suffix_3': word1[-3:],
        })
        
    elif i < len(sentence)-2:
        
        word1 = sentence[i+1]
        word2 = sentence[i+2]
        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.Prefix_2': word1[:2],
            '+1:word.Prefix_3': word1[:3], 
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.islower()': word1.lower(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isalpha()': word1.isalpha(),
            '+1:word.isalnum()': word1.isalnum(), 
            "+1:word.vowels": count_vowel(word1),
            "+1:word.dash": dash(word1),
            "+1:word.consonants": count_consonants(word1),
            '+1:word.Suffix_2': word1[-2:],
            '+1:word.Suffix_3': word1[-3:],
            
            '+2:word.lower()': word2.lower(),
            '+2:word.Prefix_2': word2[:2],
            '+2:word.Prefix_3': word2[:3], 
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.islower()': word2.lower(),
            '+2:word.isdigit()': word2.isdigit(),
            '+2:word.isalpha()': word2.isalpha(),
            '+2:word.isalnum()': word2.isalnum(), 
            "+2:word.vowels": count_vowel(word2),
            "+2:word.dash": dash(word2),
            "+2:word.consonants": count_consonants(word2),
            '+2:word.Suffix_2': word2[-2:],
            '+2:word.Suffix_3': word2[-3:],
        })
        
    else:
        features['End'] = True
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_all_sentences(dataset):
    sentences=[]
    for i in range(len(dataset)):
        sentences.append(get_sentence(dataset,i+1))
    return sentences

def get_all_labels(dataset):
    labels=[]
    for i in range(len(dataset)):
        #labels.append(partial_tags(get_label(dataset,i+1)))
        labels.append(get_label(dataset,i+1))
    return labels

In [5]:
#get the first sentence
sent = get_sentence(train,1)
#label = partial_tags(get_label(train,1))
label = get_label(train,1)

#print out first sentence
print(sent)
print(label)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


In [6]:
#Applt feature engineering
train_sents = get_all_sentences(train)
train_labels = get_all_labels(train)
test_sents = get_all_sentences(test)
test_labels = get_all_labels(test)

X_train = [sent2features(s) for s in train_sents]
y_train = train_labels
X_test = [sent2features(s) for s in test_sents]
y_test = test_labels


sub_labels=list(set([item for sublist in train_labels for item in sublist]))
sub_labels.remove("O")


## Condtional Random Field Model

### Model 1
- Using Stochastic Gradient descent 

In [7]:
%%time
crf1 = CRF(algorithm='l2sgd',
          max_iterations=50,
          all_possible_transitions=False)

#training model
crf1.fit(X=X_train, y=y_train)

#generate predictions
pred = crf1.predict(X_train)
#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_train,labels=sub_labels)
print(report)
score=flat_f1_score(y_pred=pred, y_true=y_train,average='micro',labels=sub_labels)
print(score)

              precision    recall  f1-score   support

      I-MISC       0.98      0.96      0.97      4556
      B-MISC       1.00      0.51      0.68        37
       I-LOC       0.99      0.97      0.98      8286
       B-LOC       1.00      0.82      0.90        11
       I-ORG       0.97      0.98      0.98     10001
       B-ORG       1.00      1.00      1.00        24
       I-PER       0.99      0.99      0.99     11128

   micro avg       0.98      0.98      0.98     34043
   macro avg       0.99      0.89      0.93     34043
weighted avg       0.98      0.98      0.98     34043

0.9810832659660469
Wall time: 56.1 s


### Model 2:
- Algorithm :Stochastic Gradient descent
- L2 regularization (c2=0.1)

In [8]:
%%time
crf2 = CRF(algorithm='l2sgd',
          c2=0.1,
          max_iterations=50,
          all_possible_transitions=False)

#training model
crf2.fit(X=X_train, y=y_train)
         
#generate predictions
pred = crf2.predict(X_train)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_train,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_train,average='micro',labels=sub_labels)
print(score)

              precision    recall  f1-score   support

      I-MISC       0.99      0.98      0.99      4556
      B-MISC       1.00      0.84      0.91        37
       I-LOC       0.99      0.99      0.99      8286
       B-LOC       1.00      1.00      1.00        11
       I-ORG       1.00      0.97      0.98     10001
       B-ORG       1.00      1.00      1.00        24
       I-PER       0.99      0.99      0.99     11128

   micro avg       0.99      0.98      0.99     34043
   macro avg       1.00      0.97      0.98     34043
weighted avg       0.99      0.98      0.99     34043

0.9886407222730358
Wall time: 55.8 s


## Model 3

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0)
- L2 regularization (c2=0)

In [9]:
%%time
crf3 = CRF(algorithm='lbfgs',
          #c1=0.1,
          #c2=0.1,
          max_iterations=50,
          all_possible_transitions=False)

#training model
crf3.fit(X=X_train, y=y_train)

#generate predictions
pred = crf3.predict(X_train)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_train,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_train,average='micro',labels=sub_labels)
print(score)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      I-MISC       0.74      0.56      0.64      4556
      B-MISC       0.00      0.00      0.00        37
       I-LOC       0.73      0.79      0.76      8286
       B-LOC       0.00      0.00      0.00        11
       I-ORG       0.75      0.69      0.72     10001
       B-ORG       0.00      0.00      0.00        24
       I-PER       0.83      0.83      0.83     11128

   micro avg       0.77      0.74      0.76     34043
   macro avg       0.44      0.41      0.42     34043
weighted avg       0.77      0.74      0.75     34043

0.7555003070287108
Wall time: 47.4 s


## Model 4

- Algorithm : Gradient descent using the L-BFGS method
- L1 regularization (c2=0.1)
- L2 regularization (c2=0.1)

In [10]:
%%time
crf4 = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=50,
          all_possible_transitions=False)

#training model
crf4.fit(X=X_train, y=y_train)

#generate predictions
pred = crf4.predict(X_train)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_train,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_train,average='micro',labels=sub_labels)
print(score)

              precision    recall  f1-score   support

      I-MISC       1.00      0.98      0.99      4556
      B-MISC       0.89      0.86      0.88        37
       I-LOC       1.00      0.99      1.00      8286
       B-LOC       1.00      0.91      0.95        11
       I-ORG       0.99      1.00      0.99     10001
       B-ORG       1.00      1.00      1.00        24
       I-PER       1.00      1.00      1.00     11128

   micro avg       0.99      0.99      0.99     34043
   macro avg       0.98      0.96      0.97     34043
weighted avg       0.99      0.99      0.99     34043

0.9944754628269175
Wall time: 45.3 s


# Predict on Test dataset

In [11]:
#prediction with best performaning model
pred = crf4.predict(X_test)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_test,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_test,average='micro',labels=sub_labels)
print(score)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      I-MISC       0.82      0.75      0.78       909
      B-MISC       0.00      0.00      0.00         9
       I-LOC       0.89      0.84      0.86      1919
       B-LOC       0.00      0.00      0.00         6
       I-ORG       0.80      0.79      0.79      2491
       B-ORG       0.00      0.00      0.00         5
       I-PER       0.87      0.91      0.89      2773

   micro avg       0.85      0.84      0.84      8112
   macro avg       0.48      0.47      0.48      8112
weighted avg       0.84      0.84      0.84      8112

0.8405383613471438
