<a href="https://colab.research.google.com/github/farnaz-orooji/nlp/blob/main/NER_with_CRF_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# loaded later in this notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
from sklearn.metrics import classification_report

In [None]:
%%time
# Load the token-level NER dataset (one row per token) from the mounted Drive.
ds = pd.read_csv("//content/drive/MyDrive/nlp_dataset/ner_dataset.csv", encoding = 'ISO-8859-1')
# Forward-fill missing cells with the last seen value in each column.
# Presumably this propagates the sentence id, which seems to be present only on
# the first token of each sentence -- confirm against the CSV.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
ds = ds.ffill()

In [None]:
# This class retrieves sentences together with their POS tags and NER tags.
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) triples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Sentence #'``, ``'Word'``, ``'POS'`` and
        ``'Tag'``, one row per token.

    Attributes
    ----------
    grouped : pandas.Series
        Maps each ``'Sentence #'`` value to its list of (word, POS, tag) tuples.
    sentences : list
        The values of ``grouped`` as a plain list, in groupby (sorted key) order.
    n_sent : int
        1-based number of the sentence that ``get_next`` will return next.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        # NOTE: misspelled and not read anywhere in this file; kept so that any
        # external code relying on the attribute keeps working.
        self.empttyt = False

        def agg_func(s):
            # Zip the three per-token columns of one sentence into triples.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` when there is none left.

        Sentences are looked up by the key ``'Sentence: <n_sent>'`` (assumes the
        ``'Sentence #'`` column uses that format -- confirm against the data).
        ``n_sent`` is advanced only after a successful lookup.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # No sentence with this number: signal exhaustion to the caller.
            return None
        self.n_sent += 1
        return s
        
# Group the token table into sentences; `sentences` is a list in which every
# item is one sentence given as a list of (word, POS, tag) tuples.
getter = SentenceGetter(ds)
sentences = getter.sentences

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of token records whose first field is the word and
    whose second field is its POS tag. The result combines features of the
    token itself with features of its left and right neighbours; a missing
    neighbour is signalled with a ``'BOS'`` / ``'EOS'`` flag instead.
    """
    token, pos = sent[i][0], sent[i][1]

    # Features of the current token. Longer suffixes (word[-5:], word[-4:])
    # and POS-prefix features (postag[:2], postag[:3]) are currently disabled.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

In [None]:
def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every (token, POS, label) triple."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every (token, POS, label) triple."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [None]:
# Feature sequences: one list of per-token feature dicts per sentence.
X = list(map(sent2features, sentences))

In [None]:
# Label sequences: one list of tags per sentence, aligned with X.
y = list(map(sent2labels, sentences))

In [None]:
# Hold out 30% of the sentences for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# Linear-chain CRF trained with L-BFGS for at most 50 iterations.
# c1 / c2 are the L1 / L2 regularisation coefficients; with
# all_possible_transitions the model also gets transition features for label
# pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=50,
    all_possible_transitions=True,
    verbose=1,
)
crf.fit(X_train, y_train)

In [None]:
# Evaluation: predict a tag sequence for every held-out sentence.
y_pred = crf.predict(X_test)

In [None]:
# Per-label precision / recall / F1 on the held-out sentences.
# `new_classes` was not defined anywhere in this notebook (NameError), so it is
# built here: every tag in the dataset except 'O'. Presumably the intent was to
# keep the 'O' tag out of the report -- confirm.
new_classes = sorted(set(ds["Tag"]) - {"O"})
# `labels` is passed by keyword instead of as a third positional argument.
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

In [None]:
# Helper for inspecting what the classifier learned.
def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` row per transition entry.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; rows are printed in the order given.
    """
    for transition, weight in trans_features:
        source, target = transition
        print("%-6s -> %-7s %0.6f" % (source, target, weight))

In [None]:
# The 10 label-to-label transitions with the highest learned weights.
print("Top likely features : ")
print_transitions(Counter(crf.transition_features_).most_common(10))

In [None]:
# The 10 label-to-label transitions with the lowest learned weights
# (the tail of the descending most_common() ordering).
print("\n Top unlikely features : ")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

In [None]:
# Use eli5 to display the model's weights, limited to the top 10 entries.
import eli5

eli5.show_weights(crf, top= 10)

In [None]:
# Same view, restricted to three target labels.
eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])

In [None]:
# Show only features whose name matches ^word\.is, i.e. the current token's
# word.isupper() / word.istitle() / word.isdigit() flags.
# The pattern is a raw string: in a normal string literal `\.` is an invalid
# escape sequence (SyntaxWarning on Python 3.12+). The pattern value itself
# is unchanged.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])