<a href="https://colab.research.google.com/github/deanhoperobertson/Named-Enitty-Recognition/blob/master/Models/Final%20Models/CRF/Conditional%20Random%20Fields%20(Window%201).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install sklearn-crfsuite, which provides the CRF estimator imported below.
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 11.4MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:
import pandas as pd
import numpy as np
import urllib.request
import re
from collections import OrderedDict

#grid search
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

#Model
from sklearn_crfsuite import CRF

#Evaluation
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score, flat_recall_score, flat_precision_score

In [None]:
def readstring(filename, meth):
    '''
    Parse CoNLL-style text into a list of sentences.

    Parameters
    ----------
    filename : str
        Despite the name, this is the already-loaded file *content* (one
        string), not a path. Each non-blank line holds space-separated
        columns; the first column is the token and the last is its tag.
    meth : str
        Case-insensitive mode switch. "numbers" passes every token through
        ``hasNumbers`` before storing it; any other value keeps the token
        unchanged.
        NOTE(review): ``hasNumbers`` is not defined in the visible part of
        this notebook - it must exist before calling with meth="numbers".

    Returns
    -------
    list of sentences, each a list of ``[token, tag]`` pairs. Sentences are
    delimited by blank (or whitespace-only) lines and ``-DOCSTART`` lines.
    '''
    use_numbers = meth.lower() == "numbers"
    sentences = []
    sentence = []
    for raw_line in filename.split('\n'):
        # Drop a trailing carriage return so CRLF files behave like LF files.
        line = raw_line.rstrip('\r')
        # Blank / whitespace-only lines and document markers end a sentence.
        if not line.strip() or line.startswith('-DOCSTART'):
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        token = hasNumbers(splits[0]) if use_numbers else splits[0]
        sentence.append([token, splits[-1].strip()])
    # Flush the final sentence when the text does not end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

In [None]:
#import data from my github repo
train_url = "https://raw.githubusercontent.com/deanhoperobertson/Named-Enitty-Recognition/master/Data/train.txt"
test_url = "https://raw.githubusercontent.com/deanhoperobertson/Named-Enitty-Recognition/master/Data/test.txt"

# Download each split inside a context manager so the HTTP response is
# always closed, then decode the raw bytes as UTF-8 text.
with urllib.request.urlopen(train_url) as response:
    train = response.read().decode('utf-8')
with urllib.request.urlopen(test_url) as response:
    test = response.read().decode('utf-8')

#preprocess the txt file: parse the text into sentences of [token, tag]
#pairs ("NONE" leaves the tokens unchanged)
train = readstring(train,"NONE")
test = readstring(test, "NONE")


## Orthographic Feature Engineering

In [None]:
#Engineered Features:
def mix(word):
    '''
    The Function returns the boolean if the token is purely alphanumeric
    and holds at least one ASCII letter together with at least one ASCII digit
    '''
    return bool(
        word.isalnum()
        and re.search("^(?=.*[a-zA-Z])(?=.*[0-9])", word)
    )

def non_intial(word):
    '''
    The Function returns the boolean if a non-initial capital letter is present

    Only the characters after the first one are inspected. Tokens with no
    upper-case character beyond position 0 (single characters, numbers,
    punctuation, ordinary title-case words) give False; the previous
    ``not word[1:].islower()`` test wrongly gave True for any token whose
    tail had no cased character at all.
    '''
    return any(char.isupper() for char in word[1:])

def contain_punct(word):
    '''
    The Function returns the boolean if the token is anything other than
    a run of word characters (as matched by the regex class \\w)
    An empty token also gives True because the pattern needs one character
    '''
    only_word_chars = re.match(r'^\w+$',word)
    return only_word_chars is None
    
def apostrophe(word):
    '''
    The Function returns the boolean if the token is exactly "'s"
    '''
    return word == "'s"
    
def word_pattern(word):
    '''
    The Function returns the word pattern feature, mapping each character:
    Upper Case = "A"
    Lower Case = "a"
    Digit = "0"
    Any other character is kept as it is
    '''
    def shape_of(char):
        # The order of these checks mirrors the mapping listed above.
        if char.isupper():
            return "A"
        if char.islower():
            return "a"
        if char.isdigit():
            return "0"
        return str(char)

    return "".join(shape_of(char) for char in word)

def pattern_sum(word):
    '''
    The Function returns the word pattern summary: every distinct character
    is kept once, in order of first appearance
    Note that any repeated character is dropped, not only consecutive
    repeats (e.g. "AaAa" becomes "Aa")
    '''
    seen = set()
    kept = []
    for char in word:
        if char not in seen:
            seen.add(char)
            kept.append(char)
    return ''.join(kept)

In [None]:
def _token_features(word, prefix=''):
    '''
    Build the orthographic feature dict for a single token.

    Every key is ``prefix`` followed by the base feature name, so the same
    code serves the current token (prefix ''), the previous token ('-1')
    and the next token ('+1').
    '''
    pattern = word_pattern(word)  # computed once, reused for the summary
    base = {
        'word.lower()': word.lower(),

        'Prefix_2': word[:2], # prefix
        'Prefix_3': word[:3], # prefix

        'word.istitle()': word.istitle(), #starts with caps
        'word.isupper()': word.isupper(), #all caps
        'word.islower()': word.islower(), #all lower case
        'word.isdigit()': word.isdigit(), #all digits
        'word.isalpha()': word.isalpha(), #all letters

        'word.punct()': contain_punct(word), #contains punctuation
        'word.apost()': apostrophe(word), #is an apostrophe
        'word.non_intial': non_intial(word), #non-initial capitals
        'word.mix': mix(word), #mixture of letters and digits
        'word.pattern()': pattern, #word pattern
        'word.pattern_sum()': pattern_sum(pattern), #word pattern summary

        'Suffix_2': word[-2:], # suffix
        'Suffix_3': word[-3:], # suffix
    }
    return {prefix + name: value for name, value in base.items()}


def word2features(sentence,i):
    '''
    Build the CRF feature dict for the token at position ``i``.

    Parameters
    ----------
    sentence : sequence of str
        The tokens of one sentence.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict holding a constant 'bias', the token's own features, the same
    features for the previous token (keys prefixed '-1') and the next token
    (keys prefixed '+1'). 'START' / 'END' flags replace the neighbour
    features at the sentence boundaries.
    '''
    features = {'bias': 1.0}
    features.update(_token_features(sentence[i]))

    if i > 0:
        # Features of the token before in the sentence
        features.update(_token_features(sentence[i-1], '-1'))
    else:
        features['START'] = True

    if i < len(sentence)-1:
        # Features of the token after in the sentence
        features.update(_token_features(sentence[i+1], '+1'))
    else:
        features['END'] = True

    return features

def get_sentence(dataset,sentence_number):
    '''
    The Function returns the tokens (first item of every pair) of one sentence
    sentence_number is 1-based: 1 selects dataset[0]
    '''
    return [pair[0] for pair in dataset[sentence_number-1]]


def get_label(dataset,sentence_number):
    '''
    The Function returns the tags (second item of every pair) of one sentence
    sentence_number is 1-based: 1 selects dataset[0]
    '''
    return [pair[1] for pair in dataset[sentence_number-1]]

def sent2features(sent):
    '''
    The Function returns one feature dict per token of the sentence,
    in token order
    '''
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def get_all_sentences(dataset):
    '''
    The Function returns the token list of every sentence in the dataset
    '''
    # get_sentence expects 1-based sentence numbers
    return [get_sentence(dataset, number) for number in range(1, len(dataset)+1)]

def get_all_labels(dataset):
    '''
    The Function returns the tag list of every sentence in the dataset
    '''
    # get_label expects 1-based sentence numbers
    return [get_label(dataset, number) for number in range(1, len(dataset)+1)]

In [None]:
#Apply feature engineering
# Separate every parsed sentence into its token sequence and its tag sequence.
train_sents, train_labels = get_all_sentences(train), get_all_labels(train)
test_sents, test_labels = get_all_sentences(test), get_all_labels(test)

# One list of per-token feature dicts per sentence; labels are the raw tag lists.
X_train = list(map(sent2features, train_sents))
y_train = train_labels
X_test = list(map(sent2features, test_sents))
y_test = test_labels


# Distinct tags seen in the training labels, in descending order and without
# the "O" tag; passed as `labels` to the evaluation metrics further down.
sub_labels = sorted({tag for tags in train_labels for tag in tags}, reverse=True)
sub_labels.remove("O")

## Train Model

In [None]:
# Hyper-parameters of the CRF, gathered in one place so they are easy to tune.
crf_params = {
    'algorithm': 'lbfgs',               # training algorithm
    'max_iterations': 50,               # cap on optimisation iterations
    'c1': 0.1,                          # L1 regularisation coefficient
    'c2': 0.3,                          # L2 regularisation coefficient
    'all_possible_transitions': False,
}
crf = CRF(**crf_params)


In [None]:
%%time
# Fit the CRF on the training features and labels; the %%time cell magic
# (which has to stay on the first line of the cell) reports how long it took.
crf.fit(X_train, y_train)

CPU times: user 34.4 s, sys: 210 ms, total: 34.6 s
Wall time: 34.6 s




CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.3, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Predict On Training Set

In [None]:
#predict
# Scores on the data the model was fitted on.
y_preds = crf.predict(X_train)

# Per-label report restricted to sub_labels.
report = flat_classification_report(y_true=y_train, y_pred=y_preds, labels=sub_labels)
print(report)

#F1 Score
score = flat_f1_score(y_true=y_train, y_pred=y_preds, labels=sub_labels, average='micro')
print("F1 score: %.2f" % round(score*100, 3))

              precision    recall  f1-score   support

       I-PER       1.00      1.00      1.00     11128
       I-ORG       0.99      0.99      0.99     10001
      I-MISC       0.99      0.99      0.99      4556
       I-LOC       0.99      0.99      0.99      8286
       B-ORG       1.00      1.00      1.00        24
      B-MISC       0.96      0.73      0.83        37
       B-LOC       1.00      0.91      0.95        11

   micro avg       0.99      0.99      0.99     34043
   macro avg       0.99      0.94      0.96     34043
weighted avg       0.99      0.99      0.99     34043

F1 score: 99.26


## Predict On Test Set

In [None]:
#predict
y_preds = crf.predict(X_test)

# Per-label report restricted to sub_labels.
report = flat_classification_report(y_true=y_test, y_pred=y_preds, labels=sub_labels)
print(report)

#F1 Score
# All three micro-averaged metrics share the same arguments.
micro_args = dict(y_true=y_test, y_pred=y_preds, average='micro', labels=sub_labels)
f1_score = flat_f1_score(**micro_args)
recall_score = flat_recall_score(**micro_args)
pre_score = flat_precision_score(**micro_args)

for caption, value in (
    ("Precision score: %.2f", pre_score),
    ("Recall score: %.2f", recall_score),
    ("F1 score: %.2f", f1_score),
):
    print(caption % round(value*100, 3))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       I-PER       0.86      0.89      0.88      2773
       I-ORG       0.79      0.78      0.79      2491
      I-MISC       0.77      0.77      0.77       909
       I-LOC       0.87      0.87      0.87      1919
       B-ORG       0.00      0.00      0.00         5
      B-MISC       0.00      0.00      0.00         9
       B-LOC       0.00      0.00      0.00         6

   micro avg       0.83      0.84      0.84      8112
   macro avg       0.47      0.47      0.47      8112
weighted avg       0.83      0.84      0.83      8112

Precision score: 83.21
Recall score: 83.88
F1 score: 83.54


In [None]:
from google.colab import files
from sklearn.metrics import confusion_matrix

# Label order for the matrix: the entity tags followed by "O". Built as a new
# list so sub_labels is not mutated - re-running this cell (or an evaluation
# cell afterwards) therefore keeps giving the same result.
matrix_labels = [label for label in sub_labels if label != "O"] + ["O"]

# NOTE: y_preds must hold the test-set predictions, i.e. the test evaluation
# cell has to be the last one that assigned it.
flat_preds = [item for sublist in y_preds for item in sublist]
flat_true = [item for sublist in y_test for item in sublist]

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true labels and columns the predicted labels. Keyword arguments make the
# orientation explicit (the arguments were previously passed in reverse).
matrix = confusion_matrix(y_true=flat_true, y_pred=flat_preds, labels=matrix_labels)

cm = pd.DataFrame(matrix, columns=matrix_labels, index=matrix_labels)

# Save the matrix and trigger a browser download (Colab only).
cm.to_csv('W1_matrix_.csv')
files.download('W1_matrix_.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>