# Named Entity Recognition Dataset

## Dataset

In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re 
import json
import spacy

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset paths used later in
# this notebook are located below that directory.
drive.mount('/content/drive')

Mounted at /content/drive


## Cleaning Entities

### Processing Indexes

In [21]:
# import logging
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export (one JSON object per line) to spaCy format.

    Each record is read for its ``content`` (the text) and ``annotation``
    (a list of annotations, or ``None``). Newlines in the text are replaced by
    single spaces, which keeps every character offset valid.

    Args:
        dataturks_JSON_FilePath: Path to the JSON-lines export file.

    Returns:
        list: ``(text, {"entities": [(start, end, label), ...]})`` tuples.
        ``end`` is the annotated end offset plus one. Leading and trailing
        whitespace of the annotated text is excluded from the span. An
        annotation carrying several labels yields one entity per label.
    """
    training_data = []
    # The export is read as UTF-8 explicitly so that non-ASCII characters do
    # not depend on the platform's default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue

            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Only the first point of each annotation is used.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    point_text = point['text']
                    # Shrink the span by the amount of surrounding whitespace
                    # contained in the annotated text.
                    leading = len(point_text) - len(point_text.lstrip())
                    trailing = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + leading
                    point_end = point['end'] - trailing

                    for label in labels:
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    The start offset is moved right and the end offset is moved left while
    the character at the respective edge is whitespace. The two edges never
    cross: a span that consists of whitespace only (or is empty) is dropped
    instead of being returned with ``start > end``. End offsets that lie
    beyond the text are clamped to the text length.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. an
            iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {"entities": [[start, end, label],
        ...]}]`` lists.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so that an end offset past the text cannot index out of
            # range in the trimming loop below.
            valid_end = min(end, len(text))
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            # Nothing but whitespace was annotated: there is no entity left.
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [33]:
# Convert the Dataturks export to spaCy format first, then trim whitespace
# from the entity spans.
raw_data = convert_dataturks_to_spacy("/content/drive/MyDrive/Colab Notebooks/datasets/RP/NER_RESUME.json")
data = trim_entity_spans(raw_data)
data[3]

['Alok Khandai Operational Analyst (SQL DBA) Engineer - UNISYS  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Alok-Khandai/5be849e443b8f467  ❖ Having 3.5 Years of IT experience in SQL Database Administration, System Analysis, Design, Development & Support of MS SQL Servers in Production, Development environments & Replication and Cluster Server Environments. ❖ Working Experience with relational database such as SQL. ❖ Experience in Installation, Configuration, Maintenance and Administration of SQL Server. ❖ Experience in upgrading SQL Server. ❖ Good experience with implementing DR solution, High Availability of database servers using Database mirroring and replications and Log Shipping. ❖ Experience in implementing SQL Server security and Object permissions like maintaining Database authentication modes, creation of users, configuring permissions and assigning roles to users. ❖ Experience in creating Jobs, Alerts, SQL Mail Agent ❖ Experience in performing integrity checks. Me

### Overlapping Entities

In [13]:
len(data)

693

In [26]:
import pickle
import spacy
import random
import re
nlp = spacy.blank('en')

# train data
# NOTE: unpickling can execute arbitrary code; only load files from a
# trusted source. The context manager closes the file handle after loading.
with open('/content/drive/MyDrive/Colab Notebooks/datasets/d.pkl', 'rb') as pickle_file:
    data1 = pickle.load(pickle_file)

In [15]:
# for i in range (len(data)):
#   data[i][0]=data[i][0].replace("\n", " ")
# print('Done!')

Done!


In [37]:
def _overlaps_not_longer(entity, other):
    """Return True if ``entity`` overlaps ``other`` and is not longer than it.

    Overlap is tested with inclusive bounds: ``entity`` overlaps when its
    start or its end lies within ``[other start, other end]``.
    """
    e_start, e_end = entity[0], entity[1]
    oe_start, oe_end = other[0], other[1]
    overlaps = (oe_start <= e_start <= oe_end) or (oe_start <= e_end <= oe_end)
    return overlaps and (e_end - e_start) <= (oe_end - oe_start)


def clean_entities(training_data):
    """Remove entity spans that overlap a span of equal or greater length.

    For every ``(text, annotation)`` pair each entity is compared with every
    other entity of the same text. An entity is removed when it overlaps
    another one and is not strictly longer than it. Two overlapping entities
    of equal length are therefore both removed.

    Note:
        The ``entities`` list of each input annotation is modified in place,
        and that same list object is placed in the returned structure. Other
        cells in this notebook read the input again after this call, so the
        side effect is kept on purpose.
        NOTE(review): the bounds are inclusive, so spans that merely touch
        (one ends where the next starts) count as overlapping — confirm this
        is intended for the offset convention of the input.

    Args:
        training_data: Iterable of ``(text, {"entities": [...]})`` pairs
            where each entity starts with ``start, end``.

    Returns:
        list: ``(text, {"entities": entities})`` tuples with the overlapping
        entities removed.
    """
    clean_data = []
    for text, annotation in training_data:
        entities = annotation.get('entities')
        # Compare against a snapshot so that removals do not disturb the
        # iteration or the comparisons.
        snapshot = list(entities)

        for i, entity in enumerate(snapshot):
            dominated = any(
                _overlaps_not_longer(entity, other)
                for j, other in enumerate(snapshot)
                if i != j  # skip self
            )
            # Each snapshot position removes at most one equal element, so
            # remove() always finds one and no exception has to be swallowed.
            if dominated:
                entities.remove(entity)
        clean_data.append((text, {'entities': entities}))

    return clean_data

# Note: clean_entities removes the overlapping spans from the entity lists of
# data1 in place, so data1 itself is modified by this call as well.
data = clean_entities(data1)

### Entity Mapping

In [38]:
from spacy.lang.en import English  # Or whichever language you need
from spacy.gold import biluo_tags_from_offsets

def bilou_tags(data):
    """Tokenize every text and derive one tag per token from its entity offsets.

    The tags returned by ``biluo_tags_from_offsets`` are rewritten so that a
    leading ``U`` becomes ``B`` and a leading ``L`` becomes ``I``; all other
    tags are kept as returned. Afterwards the tag of any token that is at most
    one character long and not alphanumeric is set to ``"O"``.

    Args:
        data: Iterable of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` holds the entity offsets.

    Returns:
        pandas.DataFrame: one row per text with the columns ``docs`` (list of
        token strings) and ``annots`` (list of tags, same length).
    """
    tokenizer = English()
    token_lists = []
    tag_lists = []

    for text, annotations in data:
        entity_offsets = annotations["entities"]
        doc = tokenizer(text)
        raw_tags = biluo_tags_from_offsets(doc, entity_offsets)

        converted = []
        for token, tag in zip(doc, raw_tags):
            if tag.startswith("U"):
                tag = "B" + tag[1:]
            elif tag.startswith("L"):
                tag = "I" + tag[1:]
            # A lone non-alphanumeric character never carries an entity tag.
            if len(token.text) <= 1 and not token.text.isalnum():
                tag = "O"
            converted.append(tag)

        token_lists.append([token.text for token in doc])
        tag_lists.append(converted)

    return pd.DataFrame({'docs': token_lists, 'annots': tag_lists})

# Tag the cleaned result explicitly instead of relying on clean_entities
# having modified data1 in place.
df_data = bilou_tags(data)
# [(k, v) for k, v in zip(df_data["docs"][0], df_data["annots"][0])]

### Removing Mislabeled Examples

In [39]:
# Collect the row labels whose tag list contains a "-" tag, then drop them
# in one call and renumber the remaining rows.
mislabeled_rows = [
    row_label
    for row_label, tags in df_data["annots"].items()
    if "-" in tags
]
df_data.drop(index=mislabeled_rows, inplace=True)
df_data.reset_index(inplace=True)
len(df_data)

307

## Modeling

### Conditional Random Fields

#### Sentence Getter

In [41]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [42]:
from nltk import pos_tag
# One list of (word, POS tag, entity tag) triples per document. POS tags are
# computed on the full token list; afterwards tokens that are a single
# non-alphanumeric character are left out.
sentences = []
for doc_tokens, doc_tags in zip(df_data["docs"], df_data["annots"]):
    doc_pos = [pos for _, pos in pos_tag(doc_tokens)]
    sentences.append([
        (w, p, t)
        for w, p, t in zip(doc_tokens, doc_pos, doc_tags)
        if w.isalnum() or len(w) > 1
    ])

#### Feature Extraction

In [43]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of items whose first element is the word and whose
    second element is its POS tag. The features describe the token itself
    (lower-cased form, suffixes, casing, digit check, POS tag) and, where they
    exist, the same kind of information for the previous (``-1:``) and next
    (``+1:``) token. ``BOS`` / ``EOS`` mark a missing previous / next token.
    """
    token, pos = sent[i][0], sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2]
    }

    # Left context
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:postag'] = prev_pos
        features['-1:postag[:2]'] = prev_pos[:2]

    # Right context
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:postag'] = next_pos
        features['+1:postag[:2]'] = next_pos[:2]

    return features
def sent2features(sent):
    """Return the list of feature dicts, one per position of ``sent``."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]
def sent2labels(sent):
    """Return the third element (the label) of every triple in ``sent``."""
    return [tag for _word, _pos, tag in sent]
def sent2tokens(sent):
    """Return the first element (the token) of every triple in ``sent``."""
    return [word for word, _pos, _tag in sent]

#### Train-Test Split

In [44]:
%%time
from sklearn.model_selection import train_test_split

# Feature dicts and labels per document, followed by a 90/10 split with a
# fixed seed.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

CPU times: user 458 ms, sys: 57.7 ms, total: 515 ms
Wall time: 517 ms


#### Training

In [47]:
pip install python-crfsuite

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[?25l[K     |▍                               | 10 kB 23.8 MB/s eta 0:00:01[K     |▉                               | 20 kB 25.4 MB/s eta 0:00:01[K     |█▎                              | 30 kB 29.0 MB/s eta 0:00:01[K     |█▊                              | 40 kB 24.5 MB/s eta 0:00:01[K     |██▏                             | 51 kB 20.0 MB/s eta 0:00:01[K     |██▋                             | 61 kB 17.5 MB/s eta 0:00:01[K     |███                             | 71 kB 15.9 MB/s eta 0:00:01[K     |███▌                            | 81 kB 17.2 MB/s eta 0:00:01[K     |████                            | 92 kB 18.4 MB/s eta 0:00:01[K     |████▍                           | 102 kB 16.4 MB/s eta 0:00:01[K     |████▉                           | 112 kB 16.4 MB/s eta 0:00:01[K     |█████▎                          | 122 kB 16.4 MB/s eta 0:00:01[K     |█████▊                  

In [48]:
import pycrfsuite

In [49]:
%%time

trainer = pycrfsuite.Trainer(verbose=True)

# Hand every training sequence to the trainer together with its labels.
for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

CPU times: user 2.47 s, sys: 64.6 ms, total: 2.53 s
Wall time: 3.07 s


In [50]:
crf_params = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
}
trainer.set_params(crf_params)

In [51]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [52]:
%%time
# Train with the parameters set above; the file name passed here is the same
# one the tagger opens later in this notebook.
trainer.train('resume-ner.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 54579
Seconds required: 0.301

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 296578.339271
Feature norm: 1.000000
Error norm: 199925.890694
Active features: 35793
Line search trials: 1
Line search step: 0.000004
Seconds required for this iteration: 2.015

***** Iteration #2 *****
Loss: 250133.594452
Feature norm: 6.577200
Error norm: 49243.545075
Active features: 35617
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 2.046

***** Iteration #3 *****
Loss: 207250.510062
Feature norm: 5.780854
Error norm: 49231.166362
Active features: 29984
Line search trials: 1
Line search step: 1.000000
Seconds required

In [53]:
trainer.logparser.last_iteration

{'active_features': 8087,
 'error_norm': 1031.582622,
 'feature_norm': 73.411204,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 10360.639459,
 'num': 100,
 'scores': {},
 'time': 1.008}

In [54]:
# Open the model file named in the trainer.train call for tagging.
tagger = pycrfsuite.Tagger()
tagger.open('./resume-ner.crfsuite')

<contextlib.closing at 0x7f08325fa8d0>

#### Evaluation

In [55]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Token-level classification report for lists of BIO-encoded sequences.

    Both inputs are flattened into one label list each and binarized with a
    LabelBinarizer that is fitted on the gold labels. Every label found in
    ``y_true`` appears in the report; no label (including "O") is filtered
    out here.

    Returns a tuple ``(report, accuracy)``: the text produced by
    ``classification_report`` and the value of ``accuracy_score``, both
    computed on the binarized labels.
    """
    binarizer = LabelBinarizer()
    gold = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    predicted = binarizer.transform(list(chain.from_iterable(y_pred)))

    def entity_then_prefix(tag):
        # "B-Degree" -> ["Degree", "B"]: group by entity name, then by prefix.
        return tag.split('-', 1)[::-1]

    ordered_tags = sorted(set(binarizer.classes_), key=entity_then_prefix)
    column_of = {label: column for column, label in enumerate(binarizer.classes_)}

    report = classification_report(
        gold,
        predicted,
        labels=[column_of[tag] for tag in ordered_tags],
        target_names=ordered_tags
    )
    return report, accuracy_score(gold, predicted)

In [56]:
%%time
# Predict one tag sequence per held-out document.
y_pred = list(map(tagger.tag, X_test))

CPU times: user 224 ms, sys: 3.02 ms, total: 227 ms
Wall time: 226 ms


In [62]:
X_test[0]

[{'+1:postag': 'NNP',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'sharma',
  'BOS': True,
  'bias': 1.0,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'avin',
  'word[-2:]': 'in',
  'word[-3:]': 'vin'},
 {'+1:postag': 'NNP',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'senior',
  '-1:postag': 'NNP',
  '-1:postag[:2]': 'NN',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'avin',
  'bias': 1.0,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'sharma',
  'word[-2:]': 'ma',
  'word[-3:]': 'rma'},
 {'+1:postag': 'NNP',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'associate',
  '-1:postag': 'NNP',


In [57]:
# ner_report returns the report text and the accuracy as a pair.
report, accuracy = ner_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
print(report)

                       precision    recall  f1-score   support

    B-Can Relocate to       1.00      1.00      1.00         1
    I-Can Relocate to       1.00      1.00      1.00         2
            B-College       0.00      0.00      0.00         4
            I-College       0.00      0.00      0.00         8
       B-College Name       0.65      0.62      0.63        39
       I-College Name       0.73      0.57      0.64       114
B-Companies worked at       0.88      0.52      0.66       111
I-Companies worked at       0.86      0.63      0.73       150
             B-Degree       0.89      0.76      0.82        42
             I-Degree       0.78      0.83      0.80        82
        B-Designation       0.90      0.69      0.78       106
        I-Designation       0.82      0.70      0.76       173
      B-Email Address       0.71      0.88      0.79        25
      I-Email Address       0.82      0.74      0.78        19
    B-Graduation Year       0.40      0.07      0.12  

In [59]:
print(accuracy)

0.9354202986432198
