In [None]:
! pip install python-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install -q datasets
!pip install -q nltk

from datasets import load_dataset
import nltk

# Fetch the NLTK resources this notebook asks for, one after the other.
for resource_name in ("averaged_perceptron_tagger", "punkt"):
    nltk.download(resource_name)

# Load the "conll2003" dataset; later cells read its "train", "validation"
# and "test" splits.
dataset = load_dataset("conll2003")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def extract_features_1(sentence, idx):
    """Describe the token at ``idx`` together with its direct neighbours.

    Every element of ``sentence`` is indexed with ``[0]`` to obtain the word,
    so elements are expected to be word-first sequences such as
    ``(word, label)`` pairs.

    Args:
        sentence: Sequence of word-first items making up one sentence.
        idx: Position of the item to build features for.

    Returns:
        dict: Lower-cased form, 3- and 2-character suffixes, casing/digit
        flags and POS tag of the word; the lower-cased form, casing flags and
        POS tag of the previous and next words when they exist; ``"BOS"`` /
        ``"EOS"`` set to ``True`` when there is no previous / next word.
    """
    def tag_alone(token):
        # The tagger only ever sees a one-token list, so the tag is
        # independent of the surrounding sentence.
        return nltk.pos_tag([token])[0][1]

    current = sentence[idx][0]
    current_tag = tag_alone(current)

    feats = dict()
    feats["word.lower()"] = current.lower()
    feats["word[-3:]"] = current[-3:]
    feats["word[-2:]"] = current[-2:]
    feats["word.isupper()"] = current.isupper()
    feats["word.istitle()"] = current.istitle()
    feats["word.isdigit()"] = current.isdigit()
    feats["pos"] = current_tag

    # Left context, or the beginning-of-sentence marker.
    if idx <= 0:
        feats["BOS"] = True
    else:
        left = sentence[idx - 1][0]
        left_tag = tag_alone(left)
        feats["-1:word.lower()"] = left.lower()
        feats["-1:word.istitle()"] = left.istitle()
        feats["-1:word.isupper()"] = left.isupper()
        feats["-1:pos"] = left_tag

    # Right context, or the end-of-sentence marker.
    if idx >= len(sentence) - 1:
        feats["EOS"] = True
    else:
        right = sentence[idx + 1][0]
        right_tag = tag_alone(right)
        feats["+1:word.lower()"] = right.lower()
        feats["+1:word.istitle()"] = right.istitle()
        feats["+1:word.isupper()"] = right.isupper()
        feats["+1:pos"] = right_tag

    return feats

# Context-free POS tags keyed by token. Tokens are always tagged one at a
# time, so a token's tag never depends on its sentence and can be reused
# across calls instead of re-running the tagger for every occurrence.
_POS_TAG_CACHE = {}


def _isolated_pos_tag(token):
    """Return the POS tag NLTK gives ``token`` when tagged on its own (memoised)."""
    tag = _POS_TAG_CACHE.get(token)
    if tag is None:
        tag = nltk.pos_tag([token])[0][1]
        _POS_TAG_CACHE[token] = tag
    return tag


def _neighbour_features(prefix, token):
    """Return the context features of ``token``, with keys starting with ``prefix``."""
    return {
        f"{prefix}:word.lower()": token.lower(),
        f"{prefix}:word.istitle()": token.istitle(),
        f"{prefix}:word.isupper()": token.isupper(),
        f"{prefix}:word.isdigit()": token.isdigit(),
        f"{prefix}:pos": _isolated_pos_tag(token),
    }


def extract_features(sentence, idx, window_size=2):
    """Build the CRF feature dict for the token at position ``idx``.

    Args:
        sentence: Sequence of token strings forming one sentence.
        idx: Position of the token to describe.
        window_size: How many tokens on each side contribute context
            features (default 2). Offsets that fall outside the sentence are
            simply omitted.

    Returns:
        dict: The token itself, its POS tag, lower-cased form and
        title/upper/digit flags, plus for every in-range offset ``i`` the
        lower-cased form, the same three flags and the POS tag of the
        neighbour under keys prefixed ``-i:`` / ``+i:``.

    Note:
        POS tags are obtained by tagging each token in isolation; the result
        is cached per distinct token, which yields the same features as
        re-tagging while avoiding repeated tagger calls.
    """
    token = sentence[idx]

    features = {
        "word": token,
        "pos": _isolated_pos_tag(token),
        "word.lower()": token.lower(),
        "word.istitle()": token.istitle(),
        "word.isupper()": token.isupper(),
        "word.isdigit()": token.isdigit(),
    }

    for offset in range(1, window_size + 1):
        if idx - offset >= 0:
            features.update(
                _neighbour_features(f"-{offset}", sentence[idx - offset])
            )
        if idx + offset < len(sentence):
            features.update(
                _neighbour_features(f"+{offset}", sentence[idx + offset])
            )

    return features


def extract_sentences_features(sentence):
    """Return the ``extract_features`` dict of every position in ``sentence``, in order."""
    per_token = []
    for position in range(len(sentence)):
        per_token.append(extract_features(sentence, position))
    return per_token


def extract_labels(sentence):
    """Return the labels of ``sentence`` converted to strings, in the same order."""
    return list(map(str, sentence))


In [None]:
# One list of per-token feature dicts for every sentence of each split.
X_train = list(map(extract_sentences_features, dataset["train"]["tokens"]))
X_val = list(map(extract_sentences_features, dataset["validation"]["tokens"]))
X_test = list(map(extract_sentences_features, dataset["test"]["tokens"]))



In [None]:
# Stringified NER tag ids, one list per sentence, for each split.
y_train = list(map(extract_labels, dataset["train"]["ner_tags"]))
y_val = list(map(extract_labels, dataset["validation"]["ner_tags"]))
y_test = list(map(extract_labels, dataset["test"]["ner_tags"]))


In [None]:
import pycrfsuite
import warnings

# --- Train -------------------------------------------------------------
# Register one (feature sequence, label sequence) pair per training sentence.
trainer = pycrfsuite.Trainer(verbose=False)

for x_seq, y_seq in zip(X_train, y_train):
    trainer.append(x_seq, y_seq)

trainer.set_params({
    "max_iterations": 50,
    "feature.possible_transitions": True
})
trainer.train("conll2003_ner_crf.crfsuite")

# --- Predict -----------------------------------------------------------
# Open the model file named in the train() call above and tag the test split.
tagger = pycrfsuite.Tagger()
tagger.open("conll2003_ner_crf.crfsuite")

y_pred = [tagger.tag(x_seq) for x_seq in X_test]

# --- Evaluate ----------------------------------------------------------
target_names = dataset["test"].features["ner_tags"].feature.names
# extract_labels() stringifies the tag ids, so the gold labels are "0", "1",
# ... rather than ints (predictions are assumed to come back in the same
# string form, since the model was trained on those strings). The label list
# handed to classification_report must use the same type, otherwise sklearn
# has to reconcile int labels with str targets through implicit coercion.
ner_labels = [str(label_id) for label_id in range(len(target_names))]

y_test_flattened = [label for sentence in y_test for label in sentence]
y_pred_flattened = [label for sentence in y_pred for label in sentence]

from sklearn.metrics import classification_report

# NOTE(review): this filter is kept from the original cell; it presumably hid
# the FutureWarning raised by the int-vs-str label comparison and can likely
# be dropped now that the label types agree -- confirm on a fresh run.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    report = classification_report(
        y_test_flattened,
        y_pred_flattened,
        labels=ner_labels,
        target_names=target_names,
    )
print(report)

              precision    recall  f1-score   support

           O       0.99      0.98      0.99     38323
       B-PER       0.81      0.83      0.82      1617
       I-PER       0.86      0.92      0.89      1156
       B-ORG       0.76      0.71      0.73      1661
       I-ORG       0.70      0.76      0.73       835
       B-LOC       0.77      0.85      0.81      1668
       I-LOC       0.72      0.69      0.71       257
      B-MISC       0.71      0.71      0.71       702
      I-MISC       0.64      0.67      0.65       216

   micro avg       0.95      0.95      0.95     46435
   macro avg       0.77      0.79      0.78     46435
weighted avg       0.95      0.95      0.95     46435

