## Conditional Random Field Model (Zero Window)

In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from collections import OrderedDict

#Custom models
from prepro import readfile, get_sentence, is_number, extract_words,get_label,partial_tags

#Model
from sklearn_crfsuite import CRF

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

In [135]:
#import data from my github repo
test = readfile("test.txt")
train =readfile("train.txt")


## Orthographic Feature Engineering

In [136]:
#Engineered Features:
def mix(word):
    if word.isalnum():
        if re.search("^(?=.*[a-zA-Z])(?=.*[0-9])",word): return True
        else: return False
    else:
        return False

def non_intial(word):
    '''
    The Function the boolean if a non-initial captital letter is present
    '''
    return not word[1:].islower()

def contain_punct(word):
    '''
    The Function returns the boolean if punctuations is present in token
    '''
    if re.match(r'^\w+$',word):return False
    else: return True
    
def apostrophe(word):
    '''
    The Function returns boolean if "'s" is present in token 
    '''
    if word =="'s":return True
    else:return False
    
def word_pattern(word):
    '''
    The Function returns word patter feature
    Upper Case = "A"
    Lower Case = "a"
    Digit = "0"
    '''
    token=""
    for i in word:
        if i.isupper():
            token +="A"
        elif i.islower():
            token +="a"
        elif i.isdigit():
            token +="0"
        else:
            token +=str(i)
    return token

def pattern_sum(word):
    '''
    The Function returns the word patern without consectutive duplicates
    '''
    return ''.join(OrderedDict.fromkeys(word))
    

In [137]:
def word2features(sentence,i):
    word = sentence[i]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        
        'Prefix_2': word[:2], # prefix
        'Prefix_3': word[:3], # prefix
        
        'word.istitle()': word.istitle(), #starts with caps
        'word.isupper()': word.isupper(), #all caps
        'word.islower()': word.islower(), #all lower case
        'word.isdigit()': word.isdigit(), #all digits
        'word.isalpha()': word.isalpha(), #all letters
        
        'word.punct()': contain_punct(word),#contains punctuation
        'word.apost()': apostrophe(word), #is an apostrophe
        'word.non_intial': non_intial(word), #non-initial capitals
        'word.mix': mix(word), #mixture of letters and digits
        'word.pattern()': word_pattern(word), #word pattern
        'word.pattern_sum()': pattern_sum(word_pattern(word)), #word pattern summary
        
        'Suffix_2': word[-2:], # suffix
        'Suffix_3': word[-3:], # suffix
        
    }
    if i == 0:
        features['START'] = True
        
    if i == len(sentence)-1:
        features['END'] = True
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_all_sentences(dataset):
    sentences=[]
    for i in range(len(dataset)):
        sentences.append(get_sentence(dataset,i+1))
    return sentences

def get_all_labels(dataset):
    labels=[]
    for i in range(len(dataset)):
        #labels.append(partial_tags(get_label(dataset,i+1)))
        labels.append(get_label(dataset,i+1))
    return labels

In [138]:
#get the first sentence
sent = get_sentence(train,1)
label = get_label(train,1)

#print out first sentence
print(sent)
print(label)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


In [139]:
#Applt feature engineering
train_sents = get_all_sentences(train)
train_labels = get_all_labels(train)
test_sents = get_all_sentences(test)
test_labels = get_all_labels(test)

X_train = [sent2features(s) for s in train_sents]
y_train = train_labels
X_test = [sent2features(s) for s in test_sents]
y_test = test_labels


sub_labels=list(set([item for sublist in train_labels for item in sublist]))
sub_labels.remove("O")


## Condtional Random Field Model


In [140]:
%%time
crf4 = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.2,
          max_iterations=50,
          all_possible_transitions=False)

#training model
crf4.fit(X=X_train, y=y_train)

#generate predictions
pred = crf4.predict(X_train)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_train,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_train,average='micro',labels=sub_labels)
print("F1=%.2f"%(score*100))

              precision    recall  f1-score   support

      B-MISC       1.00      0.22      0.36        37
       B-LOC       1.00      0.82      0.90        11
       I-PER       0.98      0.99      0.99     11128
       B-ORG       1.00      1.00      1.00        24
       I-ORG       0.95      0.93      0.94     10001
      I-MISC       0.96      0.95      0.95      4556
       I-LOC       0.95      0.96      0.96      8286

   micro avg       0.96      0.96      0.96     34043
   macro avg       0.98      0.84      0.87     34043
weighted avg       0.96      0.96      0.96     34043

F1=96.03
CPU times: user 22.7 s, sys: 256 ms, total: 23 s
Wall time: 22.1 s


# Predict on Test dataset

In [141]:
#prediction with best performaning model
pred = crf4.predict(X_test)

#generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=y_test,labels=sub_labels)
print(report)

score=flat_f1_score(y_pred=pred, y_true=y_test,average='micro',labels=sub_labels)
print("F1=%.2f"%(score*100))

              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         9
       B-LOC       0.00      0.00      0.00         6
       I-PER       0.85      0.81      0.83      2773
       B-ORG       0.00      0.00      0.00         5
       I-ORG       0.71      0.74      0.72      2491
      I-MISC       0.74      0.76      0.75       909
       I-LOC       0.79      0.83      0.81      1919

   micro avg       0.78      0.79      0.78      8112
   macro avg       0.44      0.45      0.45      8112
weighted avg       0.78      0.79      0.78      8112

F1=78.26
