In [1]:
"""
Transformer Network Application: Resume Named-Entity Recognition (NER)
"""
from utils_ner import *
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import DistilBertTokenizerFast 
from transformers import TFDistilBertForTokenClassification
from tqdm.notebook import tqdm
import logging
import re
import json
from seqeval.metrics import classification_report

# Set TensorFlow logger to only display ERROR messages
tf.get_logger().setLevel('ERROR')

In [2]:
# GPU SETTINGS
# TensorFlow configuration to efficiently manages GPU memory by restricting 
# the default behavior of allocating all available GPU memory, ensuring fair
# resource sharing and avoiding out-of-memory (OOM) errors. 

configure_gpu_memory(memory_limit=4096, config_option=False)

No GPUs found. Configuration skipped.


In [3]:
# DATA CLEANING
# A look at the data
ner_json_path = os.path.join(os.getcwd(), "data", "ner.json")
df_data = pd.read_json(ner_json_path, lines=True) # 'lines=True' Each line is treated as an independent JSON object.
df_data.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [4]:
df_data = df_data.drop(['extras'], axis=1)
df_data['content'] = df_data['content'].str.replace("\n", " ")

In [5]:
df_data.iloc[0]['annotation']

[{'label': ['Skills'],
  'points': [{'start': 1295,
    'end': 1621,
    'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]},
 {'label': ['Skills'],
  'points': [{'start': 993,
    'end': 1153,
    'text': 'C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1 year)'}]},
 {'label': ['College Name'],
  'points': [{'start': 939, 'end': 956, 'text': 'Kendriya Vidyalaya'}]},
 {'label': ['College Name'],
  'points': [{'start': 883, 'end': 904, 'text': 'Woodbine modern school'}]},
 {'label': ['Graduation Year'],
  'points': [{'start': 856, 'end': 860, 'text': '2017\n'}]},
 {'label': ['College 

In [6]:
df_data['entities'] = get_entities(df_data)
df_data.head()

Unnamed: 0,content,annotation,entities
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12...","[(0, 12, Name), (13, 46, Designation), (49, 58..."
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta...","[(0, 14, Name), (62, 68, Location), (104, 148,..."
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37...","[(0, 21, Name), (22, 31, Location), (65, 117, ..."
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80...","[(0, 12, Name), (13, 51, Designation), (54, 60..."
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20...","[(0, 13, Name), (14, 22, Designation), (24, 41..."


In [7]:
# Convert DataTurks to SpaCy for training
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """
    Converts Dataturks JSON data into a format compatible with SpaCy training.
    
    This function processes annotated data from a Dataturks JSON file and transforms it 
    into a SpaCy-compatible format for Named Entity Recognition (NER) training. The output 
    includes a list of tuples, where each tuple contains a text string and its corresponding 
    entity annotations.

    Arguments:
        dataturks_JSON_FilePath (str): The file path to the Dataturks JSON file. This file 
                                       should contain annotations for training in JSON format.

    Returns:
        list: A list of tuples, where each tuple consists of:
              - text (str): The annotated text.
              - dict: A dictionary with a key "entities", which maps to a list of tuples.
                      Each tuple represents an entity and includes:
                        - start (int): Start index of the entity in the text.
                        - end (int): End index of the entity in the text (exclusive).
                        - label (str): The label/category of the entity.

    Exception Handling:
        In case of any error during file reading or processing, logs the exception 
        and returns None.

    Example Output:
        [
            ("John works at Microsoft.", {"entities": [(0, 4, "PERSON"), (14, 23, "ORGANIZATION")]}),
            ("Paris is beautiful.", {"entities": [(0, 5, "LOCATION")]}),
            ("Jacob lives in New York.", {"entities": [(0, 4, "PERSON"), (14, 22, "LOCATION")]})
        ]

    Steps:
        1. Open the JSON file and read each line.
        2. Parse the JSON content to extract text and annotations.
        3. Process each annotation to extract the entity label, start, and end positions.
        4. Handle whitespace adjustments using `lstrip()` and `rstrip()` to ensure accurate indexing.
        5. Append the processed text and entity annotations to the training data list.
        6. Return the final training data list in SpaCy format.
    """
    try:
        training_data = []  # List to store processed training data
        lines = []

        # Step 1: Read lines from the Dataturks JSON file
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        # Step 2: Process each line in the JSON file
        for line in lines:
            data = json.loads(line)  # Parse the JSON line
            text = data['content'].replace("\n", " ")  # Replace newlines with spaces in the text
            entities = []  # List to store entities for the current text
            data_annotations = data['annotation']

            # Step 3: Process annotations if they exist
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Process a single point in the text annotation
                    point = annotation['points'][0]
                    labels = annotation['label']

                    # Handle both a single label or a list of labels
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        point_start = point['start']
                        point_end = point['end']
                        point_text = point['text']

                        # Adjust start and end indices for leading/trailing whitespace
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
                        if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
                        entities.append((point_start, point_end + 1, label))  # Adjusted end is exclusive
            
            # Step 4: Add text and entities to training data
            training_data.append((text, {"entities": entities}))

        # Step 5: Return the final training data
        return training_data

    except Exception as e:
        # Log the exception and return None
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

In [8]:
data_spacy = convert_dataturks_to_spacy(ner_json_path)

In [9]:
pd.DataFrame(data_spacy).head(5)

Unnamed: 0,0,1
0,Abhishek Jha Application Development Associate...,"{'entities': [(1296, 1622, 'Skills'), (993, 11..."
1,Afreen Jamadar Active member of IIIT Committee...,"{'entities': [(1155, 1199, 'Email Address'), (..."
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","{'entities': [(3749, 3757, 'Skills'), (3709, 3..."
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"{'entities': [(8098, 8384, 'Skills'), (8008, 8..."
4,Ananya Chavan lecturer - oracle tutorials Mum...,"{'entities': [(2010, 2013, 'Degree'), (973, 17..."


In [None]:
# Cleans entity spans to remove leading and trailing whitespace
def trim_entity_spans(data: list) -> list:
    """
    Cleans entity spans by removing leading and trailing whitespace in the annotated text.

    This function processes a list of annotated text data in spaCy JSON format. For each entity span,
    it ensures that any leading or trailing whitespace in the text does not affect the start and end
    positions of the entity annotations. The adjusted spans are then added back to the cleaned data.

    Arguments:
        data (list): A list of annotated text data in spaCy JSON format. Each element should be a tuple
                     containing:
                     - text (str): The annotated text.
                     - annotations (dict): A dictionary with key "entities", mapping to a list of entity spans.
                       Each entity span should be a tuple of (start, end, label).

    Returns:
        list: A cleaned list of annotated text data in spaCy JSON format. Each element is a tuple containing:
              - text (str): The original text.
              - annotations (dict): A dictionary with key "entities", mapping to a list of adjusted entity spans.
                Each adjusted entity span is a list with:
                  - valid_start (int): The corrected start index of the entity in the text.
                  - valid_end (int): The corrected end index of the entity in the text (exclusive).
                  - label (str): The entity label.

    Example Input:
        data = [("  John Doe is a developer. ", {"entities": [(2, 10, "PERSON")]})]

    Example Output:
        [("  John Doe is a developer. ", {"entities": [(2, 9, "PERSON")]})]

    Functionality:
        1. Compiles a regex pattern (`r'\s'`) to match whitespace characters.
        2. Iterates over each text and its associated annotations in the input data.
        3. For each entity, adjusts the `start` and `end` positions to exclude leading and trailing whitespace.
        4. Appends the corrected entities and text back to the cleaned data list.
        5. Returns the cleaned data.

    Note:
        This function ensures that whitespace does not interfere with entity span indices, which is
        critical for accurate training in spaCy-based Named Entity Recognition (NER) pipelines.
    """

    invalid_span_tokens = re.compile(r"\s")  # Pattern to identify whitespace

    cleaned_data = []  # List to store the cleaned data
    for text, annotations in data:
        entities = annotations['entities']  # Extract entities for the current text
        valid_entities = []  # List to store valid (corrected) entities
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            
            # Adjust the start position to skip leading whitespace
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            # Adjust the end position to skip trailing whitespace
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            # Append the corrected entity
            valid_entities.append([valid_start, valid_end, label])
        # Append the cleaned text and entities to the result
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [11]:
data = trim_entity_spans(data_spacy)

In [12]:
pd.DataFrame(data).head(5)

Unnamed: 0,0,1
0,Abhishek Jha Application Development Associate...,"{'entities': [[1296, 1622, 'Skills'], [993, 11..."
1,Afreen Jamadar Active member of IIIT Committee...,"{'entities': [[1155, 1199, 'Email Address'], [..."
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","{'entities': [[3749, 3757, 'Skills'], [3709, 3..."
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"{'entities': [[8098, 8384, 'Skills'], [8008, 8..."
4,Ananya Chavan lecturer - oracle tutorials Mum...,"{'entities': [[2010, 2013, 'Degree'], [973, 17..."


In [13]:
# Clean data: associate entities to words in the sequence
def clean_dataset(data):
    """
    Processes a dataset of text and associated entity annotations to produce cleaned output.

    This function iterates through text data and entity annotations, identifies word boundaries,
    and associates entities with the corresponding words. If a word does not have an entity, it
    is marked with "Empty". The cleaned data is returned in a pandas DataFrame format.

    Arguments:
        data (list): A list of tuples, where each tuple consists of:
            - data[i][0] (str): A text string to be processed.
            - data[i][1] (dict): A dictionary where keys are entity labels and values are lists 
              containing entity spans. Each span is a list [start_index, end_index, label].

    Returns:
        pandas.DataFrame: A DataFrame with a single column, "sentences_cleaned", containing a list
        for each text entry. Each list has:
            - Entity labels: Corresponding labels for words tagged with entities.
            - "Empty": For words without associated entities.

    Example Input:
        data = [
            ("John Doe lives in Philadelphia.", 
             {"PERSON": [[0, 8, "John Doe"]], "LOCATION": [[18, 31, "Philadelphia"]]}),
            ("Apple is a technology company.", {})
        ]

    Example Output:
        cleanedDF =
            setences_cleaned
        0   ["PERSON", "Empty", "Empty", "Empty", "LOCATION"]
        1   ["Empty", "Empty", "Empty", "Empty", "Empty"]

    Functionality:
        1. Initializes an empty DataFrame to store cleaned data.
        2. Iterates over the dataset using a progress bar for large datasets.
        3. Breaks text into individual words and checks each word's span to associate entities.
        4. Handles the last word in each text separately to ensure accurate annotation.
        5. Appends processed annotations to the DataFrame.
        6. Tracks the total number of processed words for monitoring purposes.

    Notes:
        - The "Empty" placeholder ensures every word has an annotation, even if no entities are present.
        - Uses `tqdm` to display a progress bar for processing.

    Returns:
        A pandas DataFrame containing the cleaned word/entity association data for each text entry.
    """
    cleanedDF = pd.DataFrame(columns=["sentences_cleaned"])  # Initialize empty DataFrame
    sum1 = 0  # Total number of words processed
    
    # Iterate over the dataset with progress bar
    for ii in tqdm(range(len(data))):
        start = 0  # Start index for the current word
        emptyList = ["Empty"] * len(data[ii][0].split())  # Placeholder list for word annotations
        numberOfWords = 0  # Count of processed words
        lenOfString = len(data[ii][0])  # Length of the current text
        strData = data[ii][0]  # Extract the text string
        strDictData = data[ii][1]  # Extract entity annotations as dictionary
        lastIndexOfSpace = strData.rfind(' ')  # Find the last space in the text
        
        # Iterate through characters in the text
        for i in range(lenOfString):
            # Handle word boundaries identified by spaces
            if strData[i] == " " and strData[i + 1] != " ":  # Detects the end of a word
                for k, v in strDictData.items():  # Iterate through entity dictionary
                    for j in range(len(v)):  # Check spans for overlapping entities
                        entList = v[len(v) - j - 1]  # Process entity from the end of the list
                        # Checks if current word (defined by start and i) overlaps with entity span (entList[0] to entList[1]).
                        if start >= int(entList[0]) and i <= int(entList[1]):  
                            emptyList[numberOfWords] = entList[2]  # Assign entity label
                            break
                        else:
                            continue
                start = i + 1  # Update start index to the next word
                numberOfWords += 1  # Increment word count
            
            # Handle the last word in the text
            if i == lastIndexOfSpace:
                for j in range(len(v)):
                    entList = v[len(v) - j - 1]  # Process entity from the end of the list
                    if lastIndexOfSpace >= int(entList[0]) and lenOfString <= int(entList[1]):
                        emptyList[numberOfWords] = entList[2]  # Assign entity label
                        numberOfWords += 1
        
        # Append processed data to the DataFrame
        new_row = pd.DataFrame([[emptyList]], columns=cleanedDF.columns)
        cleanedDF = pd.concat([cleanedDF, new_row], ignore_index=True)
        sum1 = sum1 + numberOfWords  # Update total word count
    
    return cleanedDF

In [14]:
cleaned_DF = clean_dataset(data)

HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))




In [15]:
cleaned_DF.head()

Unnamed: 0,sentences_cleaned
0,"[Name, Name, Designation, Designation, Designa..."
1,"[Name, Name, Empty, Empty, Empty, Empty, Empty..."
2,"[Name, Name, Name, Empty, Empty, Empty, Empty,..."
3,"[Name, Name, Designation, Designation, Designa..."
4,"[Name, Name, Designation, Empty, Companies wor..."


In [16]:
# GENERATE UNIQUE TAGS AND PAD SEQUENCE
# Extract unique tags from the 'sentences_cleaned' column of the DataFrame
unique_tags = set(cleaned_DF['sentences_cleaned'].explode().unique())

# Map each tag to a unique numerical ID
tag2id = {tag: id for id, tag in enumerate(unique_tags)}

# Create a reverse mapping from IDs back to tags
id2tag = {id: tag for tag, id in tag2id.items()}

In [17]:
unique_tags

{'College Name',
 'Companies worked at',
 'Degree',
 'Designation',
 'Email Address',
 'Empty',
 'Graduation Year',
 'Location',
 'Name',
 'Skills',
 'UNKNOWN',
 'Years of Experience'}

In [18]:
# Find the maximum length across all entries in DF
max_length = cleaned_DF['sentences_cleaned'].apply(len).max()
print("Max sequence length:", max_length)  

Max sequence length: 2953


In [19]:
# Padding to ensure all input sequences have the same length. Sequences exceeding 
# the maximum length that is set will be truncated and shorter ones will be padded.
MAX_LEN = 512  # initialize maximum length
labels = cleaned_DF['sentences_cleaned'].values.tolist()
print(labels[0])

['Name', 'Name', 'Designation', 'Designation', 'Designation', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Email Address', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Designation', 'Designation', 'Empty', 'Companies worked at', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Empty', 'Designation', 'Designation', '

In [20]:
tag2id

{'Graduation Year': 0,
 'Skills': 1,
 'Companies worked at': 2,
 'College Name': 3,
 'Years of Experience': 4,
 'Name': 5,
 'Location': 6,
 'Degree': 7,
 'Email Address': 8,
 'Designation': 9,
 'Empty': 10,
 'UNKNOWN': 11}

In [21]:
# Padding sequences
# Create an empty list to store the converted sequences of tags
converted_labels = []

# Step 1: Iterate through each sequence of labels in 'labels'
for label in labels:
    current_sequence = []  # List to store the converted tags for the current sequence

    # Step 2: Iterate through each label in the current sequence
    for lab in label:
        # Convert the label to its corresponding ID using the 'tag2id' dictionary
        tag_id = tag2id.get(lab)
        current_sequence.append(tag_id)  # Add the ID to the current sequence

    # Append the fully converted sequence (list of IDs) to 'converted_labels'
    converted_labels.append(current_sequence)

# Step 3: Pad all the converted sequences to make them of uniform length
tags = pad_sequences(
    converted_labels,       # Nested list of tag IDs
    maxlen=MAX_LEN,         # Maximum length for padding or truncation
    value=tag2id["Empty"],  # Value to use for padding
    padding="post",         # Add padding at the end of the sequence
    dtype="long",           # Data type of the resulting array
    truncating="post"       # Truncate sequences that are too long from the end
)

tags

array([[ 5,  5,  9, ..., 10, 10, 10],
       [ 5,  5, 10, ..., 10, 10, 10],
       [ 5,  5,  5, ..., 10,  1, 10],
       ...,
       [ 5,  5,  9, ..., 10, 10, 10],
       [ 5,  5,  9, ..., 10, 10, 10],
       [ 5,  5,  9, ..., 10, 10, 10]])

In [22]:
# TOKENIZE AND ALIGN LABELS WITH LIBRARY
def tokenize_and_align_labels(tokenizer, examples, tags, max_tokens=512, label_all_tokens=True):
    """
    Tokenizes input text and aligns word-level labels with tokenized subwords.

    This function processes raw text data using a tokenizer, ensuring the resulting 
    subword tokens are correctly aligned with their corresponding word-level labels (tags). 
    It handles special tokens, subword alignment, and padding/truncation based on predefined parameters.

    Arguments:
        tokenizer: 
            A tokenizer object (e.g., Hugging Face tokenizer) used to process raw text into tokens.
        examples (list of str): 
            A list of raw text sequences to be tokenized.
        tags (list of list): 
            A list of word-level label sequences corresponding to `examples`.
            Each element is a list of labels for the words in the text.
        max_tokens (int): Maximum length of tokenized input. Default = 512
        label_all_tokens (bool): Boolean to label all tokens. Default is True
        
    Returns:
        dict: 
            A dictionary containing tokenized input data and aligned labels:
            - 'input_ids': Token IDs for the sequences.
            - 'attention_mask': Attention masks for distinguishing padding from actual tokens.
            - 'labels': Aligned label sequences, where special tokens are assigned a label of -100.

    Function Steps:
        1. Tokenize input text using the provided tokenizer.
        2. Retrieve word IDs to map tokens back to their original words.
        3. Align labels with tokens:
           - Assign `-100` to special tokens (e.g., [CLS], [SEP]) to exclude them from the loss computation.
           - Assign word-level labels to the first subword of each tokenized word.
           - Depending on the `label_all_tokens` flag, assign labels to all subwords or use `-100` for non-first subwords.
        4. Return tokenized inputs along with aligned labels.

    Example:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        examples = ["John lives in New York."]
        tags = [[2, 0, 0, 1]]  # Labels for each word in the sequence
        tokenized_data = tokenize_and_align_labels(tokenizer, examples, tags)
        
        Output:
        {
            'input_ids': [101, ..., 102],          # Token IDs for the text
            'attention_mask': [1, ..., 0],         # Mask to indicate valid tokens
            'labels': [-100, 0, ..., -100]         # Aligned label sequence
        }
    """
    
    # Tokenize input text
    tokenized_inputs = tokenizer(
        examples, 
        truncation=True,
        is_split_into_words=False,  # Assumes each input is a raw text sequence, not already split into words.
        padding='max_length',  # Adds padding to ensure all sequences have the same length (e.g., 512 tokens).
        max_length=max_tokens  # Maximum length for tokenized sequences.
    )
    
    labels = []
    for i, label in enumerate(tags):
        # Align word-level labels (tags) with tokenized subwords
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            # Special tokens have a word id that is None. Assign -100 to exclude them from loss computation.
            if word_idx is None:
                label_ids.append(-100)
            # Assign label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # Assign label for subwords based on label_all_tokens flag.
            else:
                # Handling subword token: Subwords of the same word (i.e., word_idx == previous_word_idx)
                # This applies to tokens that are part of a word but are not the first token (e.g., subwords
                # produced by tokenizers like BERT).
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    # Add aligned labels to the tokenized input dictionary
    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs

In [23]:
# Load the pretrained tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize and align labels
tk_inputs = tokenize_and_align_labels(tokenizer, df_data['content'].values.tolist(), tags, max_tokens=MAX_LEN)
train_dataset = tf.data.Dataset.from_tensor_slices((tk_inputs['input_ids'],
                                                    tk_inputs['labels']))
print(train_dataset.take(1))

<_TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.int32, name=None))>


In [24]:
train_dataset.cardinality().numpy()

220

In [25]:
for example in train_dataset.take(1):  # Retrieve one example
    input_ids, labels = example
    print("Input IDs:", input_ids.numpy())
    print("Labels:", labels.numpy())
    break

Input IDs: [  101 11113 24158  5369  2243  1046  3270  4646  2458  5482  1011  9669
  5397  8191 14129  1010 12092  1011 10373  2033  2006  5262  1024  5262
  1012  4012  1013  1054  1013 11113 24158  5369  2243  1011  1046  3270
  1013  2184  2063  2581  2050  2620 27421  2581 16703  9818 23777  2050
  1528  2000  2147  2005  2019  3029  2029  3640  2033  1996  4495  2000
  5335  2026  4813  1998  3716  2005  2026  3265  1998  2194  1005  1055
  3930  1999  2190  2825  3971  1012  5627  2000 20102  2000  1024 14022
  1010 12092  2147  3325  4646  2458  5482  9669  5397  1011  2281  2418
  2000  2556  2535  1024  2747  2551  2006 11834  1011 28516  1012  4975
  2067 10497 14721  7243 15794 10861  5134  2005  1996 28516  2029  2097
  2022 13330  2241  2006  2445  7953  1012  2036  1010  2731  1996 28516
  2005  2367  2825 14395 26755  1006  2119  3893  1998  4997  1007  1010
  2029  2097  2022  2445  2004  7953  2011  1996  5310  1012  2495  1038
  1012  1041  1999  2592  2671  1998  33

In [26]:
# OPTIMIZATION
# Optimizing DistilBERT model to match the tokenizer to preprocess your data.
# Model
model = TFDistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_tags))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [27]:
# Compile model
from tf_keras.optimizers.legacy import Adam
optimizer = Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy']) # can also use any keras loss fn

In [28]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 4
BUFFER_SIZE = int(train_dataset.cardinality().numpy()) 

# Properly process the dataset
train_dataset = train_dataset.shuffle(BUFFER_SIZE)  # Shuffle the dataset
train_dataset = train_dataset.batch(BATCH_SIZE)     # Batch the dataset
train_dataset = train_dataset.prefetch(AUTOTUNE)    # Add AUTOTUNE for efficient data loading

# Train the model
model.fit(train_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x30598eba0>

In [39]:
# Define the callback to save the best model based on training loss
from tf_keras.callbacks import ModelCheckpoint, EarlyStopping
check_point = ModelCheckpoint(
    filepath='best_model.h5',  # Path to save the model
    monitor='loss',            # Monitor the training loss instead of validation loss
    save_best_only=True,       # Save only when the monitored metric improves
    save_weights_only=True,    # Save weights only
    mode='min',                # Minimize the loss (the lower, the better)
    verbose=1                  # Print progress messages
)

early_stopping = EarlyStopping(monitor='loss', patience=5, verbose=1, mode='min', restore_best_weights=True)

# Model training with the callback
model.fit(train_dataset, epochs=10, callbacks=[check_point, early_stopping])

Epoch 1/10
Epoch 1: loss improved from inf to 0.21141, saving model to best_model.h5
Epoch 2/10
Epoch 2: loss improved from 0.21141 to 0.19997, saving model to best_model.h5
Epoch 3/10
Epoch 3: loss improved from 0.19997 to 0.19578, saving model to best_model.h5
Epoch 4/10
Epoch 4: loss improved from 0.19578 to 0.19363, saving model to best_model.h5
Epoch 5/10
Epoch 5: loss did not improve from 0.19363
Epoch 6/10
Epoch 6: loss improved from 0.19363 to 0.18754, saving model to best_model.h5
Epoch 7/10
Epoch 7: loss improved from 0.18754 to 0.17871, saving model to best_model.h5
Epoch 8/10
Epoch 8: loss improved from 0.17871 to 0.17280, saving model to best_model.h5
Epoch 9/10
Epoch 9: loss did not improve from 0.17280
Epoch 10/10
Epoch 10: loss did not improve from 0.17280
Restoring model weights from the end of the best epoch: 8.


<tf_keras.src.callbacks.History at 0x326596690>

Training takes time. Let's pause and assume that the model has been fully trained...
Let's write codes to make predictions on a trained model.

In [40]:
# PREDICTIONS
text = "Moe Ali, 5y Application Developer at Apple. Saurashtra University graduate. Email: moe-ali@me.com. Location: Philadelphia. Skills: problem solving (Less than 1 year), project lifecycle (Less than 1 year), project manager (Less than 1 year), technical assistance (Less than 1 year)."
text_inputs = tokenizer(text, return_tensors="tf", truncation=True, is_split_into_words=False, padding="max_length", max_length=MAX_LEN)
input_ids = text_inputs["input_ids"]

In [41]:
text_logits = model(text_inputs).logits
text_prediction = np.argmax(text_logits, axis=-1)

# Convert token IDs to readable words
tokens = tokenizer.convert_ids_to_tokens(text_inputs["input_ids"][0])
print(text_prediction)

[[ 5  5  5  5 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10  1 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10  7 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10  7 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10  1 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
  10 10 10 10 10 10 10 10 10 10 10 10 

In [42]:
text_ner_extract = extract_named_entities(text_prediction, id2tag, tokens, output_filename="ner_extracted_data.txt")

Filtered NER extracted data saved to ner_extracted_data.txt


In [43]:
model(text_inputs)

TFTokenClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 512, 12), dtype=float32, numpy=
array([[[-0.4998773 ,  0.33691654, -0.4542086 , ...,  0.14848809,
          0.7091168 , -0.28086126],
        [-0.7556005 , -0.189406  , -0.41208318, ...,  0.9605484 ,
          0.01034614, -0.776591  ],
        [-0.67618185, -0.5233857 , -0.4622652 , ...,  1.0638667 ,
         -0.07557663, -0.6665334 ],
        ...,
        [-0.39389208,  1.3292242 , -0.2599512 , ..., -0.37116396,
          4.3841424 , -0.443672  ],
        [-0.29335445,  1.659098  , -0.39312083, ..., -0.7903069 ,
          4.6886377 , -0.23070002],
        [-0.12853125,  1.822825  , -0.5919449 , ..., -0.7053789 ,
          4.423382  , -0.28691804]]], dtype=float32)>, hidden_states=None, attentions=None)

In [44]:
pred_labels, true_labels = [], []
for i in range(len(tk_inputs['labels'])):
    current_true_labels = []
    for true_index in tk_inputs['labels'][i]:
        current_true_labels.append(id2tag.get(true_index, "Empty"))
    true_labels.append(current_true_labels)
np.array(true_labels).shape

(220, 512)

In [45]:
output = model.predict(train_dataset)



In [46]:
m = len(tk_inputs['labels'])  # Number of samples
num_classes = len(unique_tags)  # Number of classes
predictions = np.argmax(output['logits'], axis=-1)

In [47]:
pred_labels = [[id2tag.get(index, "Empty") for index in predictions[i]] for i in range(len(predictions))]

In [48]:
import warnings

# Suppress specific warnings from seqeval
warnings.filterwarnings("ignore", message=".*seems not to be NE tag.*")

print(classification_report(true_labels, pred_labels, zero_division=0))

                    precision    recall  f1-score   support

            NKNOWN       0.00      0.00      0.00         1
               ame       0.19      0.19      0.19       220
ears of Experience       0.00      0.00      0.00        37
             egree       0.00      0.00      0.00       144
        esignation       0.02      0.01      0.02       430
             kills       0.05      0.06      0.05      4704
      mail Address       0.00      0.00      0.00        76
              mpty       0.93      0.94      0.93    103155
           ocation       0.00      0.00      0.00        73
       ollege Name       0.00      0.00      0.00       214
ompanies worked at       0.00      0.00      0.00       470
    raduation Year       0.00      0.00      0.00        58

         micro avg       0.88      0.89      0.88    109582
         macro avg       0.10      0.10      0.10    109582
      weighted avg       0.87      0.89      0.88    109582



Note: This completes the Resume Named-entity Recognition (NER) to help process resumes. Additional training is needed before feeding your input into the transformer model.