This notebook simply converts the manually labelled module dataset to spaCy format. The output file is in repo/Backend/Data/NER_annotated_data/Job_Mod_Descriptions/mod_annotations.spacy

In [1]:
import spacy
from spacy.tokens import DocBin
import json
import random

In [2]:
# Load the first batch of manually labelled module descriptions
# (mod_annotations_1_to_70.json) for conversion into spaCy's DocBin format.
# The file is decoded explicitly as UTF-8: relying on the platform default
# encoding could decode non-ASCII characters differently and thereby shift
# the character offsets stored with each entity.
with open("./../../../Data/NER_annotated_data/Job_Mod_Descriptions/mod_annotations_1_to_70.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Inspect the loaded annotation dict: a 'classes' list plus 'annotations',
# where each annotation pairs a text with its [start, end, label] entities.
data

{'classes': ['SKILL'],
 'annotations': [['This module introduces the fundamental concepts of problem solving by computing and programming using an imperative programming language. It is the first and foremost introductory course to computing. Topics covered include computational thinking and computational problem solving, designing and specifying an algorithm, basic problem formulation and problem solving approaches, program development, coding, testing and debugging, fundamental programming constructs (variables, types, expressions, assignments, functions, control structures, etc.), fundamental data structures (arrays, strings, composite data types), basic sorting, and recursion.\r\n',
   {'entities': [[224, 246, 'SKILL'],
     [251, 280, 'SKILL'],
     [282, 319, 'SKILL'],
     [321, 346, 'SKILL'],
     [351, 377, 'SKILL'],
     [379, 398, 'SKILL'],
     [400, 406, 'SKILL'],
     [408, 429, 'SKILL'],
     [431, 465, 'SKILL'],
     [549, 576, 'SKILL'],
     [618, 631, 'SKILL'],
     [

In [4]:
# Shared state used by the conversion cells below.
skipped = 0              # running count of entities that could not be aligned to tokens
doc_bin = DocBin()       # collects every converted Doc for serialisation
nlp = spacy.blank("en")  # blank English pipeline; only used to tokenise via make_doc

In [5]:
# Convert every labelled text in the currently loaded `data` into a spaCy Doc,
# attach the aligned entity spans, and add the Doc to `doc_bin`.
# `all_ents` collects the list of aligned spans for each converted Doc.
all_ents = []
for entry in data['annotations']:
    # Falsy (empty) entries carry no text/annotation pair and are ignored.
    if not entry:
        continue
    text, annotations = entry
    # NOTE(review): the entity offsets are applied to the stripped text; if a
    # text has leading whitespace the offsets would be shifted -- confirm
    # against the annotation tool's output.
    text = text.strip()
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when the offsets do not sit on token
        # boundaries. In that case slide the window left one character at a
        # time until it aligns. The labelled offsets are kept untouched in
        # `start`/`end` so the skip message reports what was annotated, and
        # the shift stops at 0 so a negative offset is never probed.
        shifted_start, shifted_end = start, end
        while span is None and shifted_start > 0:
            shifted_start -= 1
            shifted_end -= 1
            span = doc.char_span(shifted_start, shifted_end, label=label)
        if span is None:
            skipped += 1
            msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
    all_ents.append(ents)

In [6]:
# Load the second batch of manually labelled module descriptions
# (mod_annotations_71_to_150.json); this rebinds `data`, replacing the first batch.
# The file is decoded explicitly as UTF-8: relying on the platform default
# encoding could decode non-ASCII characters differently and thereby shift
# the character offsets stored with each entity.
with open("./../../../Data/NER_annotated_data/Job_Mod_Descriptions/mod_annotations_71_to_150.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
# Convert every labelled text of the second batch (now held in `data`) into a
# spaCy Doc, attach the aligned entity spans, and add the Doc to `doc_bin`.
# `all_ents` is deliberately NOT reset here: it keeps the spans gathered from
# the first batch so that the later skill summary covers both files, matching
# what ends up in `doc_bin`.
for entry in data['annotations']:
    # Falsy (empty) entries carry no text/annotation pair and are ignored.
    if not entry:
        continue
    text, annotations = entry
    # NOTE(review): the entity offsets are applied to the stripped text; if a
    # text has leading whitespace the offsets would be shifted -- confirm
    # against the annotation tool's output.
    text = text.strip()
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when the offsets do not sit on token
        # boundaries. In that case slide the window left one character at a
        # time until it aligns. The labelled offsets are kept untouched in
        # `start`/`end` so the skip message reports what was annotated, and
        # the shift stops at 0 so a negative offset is never probed.
        shifted_start, shifted_end = start, end
        while span is None and shifted_start > 0:
            shifted_start -= 1
            shifted_end -= 1
            span = doc.char_span(shifted_start, shifted_end, label=label)
        if span is None:
            skipped += 1
            msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
    all_ents.append(ents)

In [8]:
# Number of entities dropped so far because no token-aligned span could be found.
skipped

0

In [9]:
# Number of Docs accumulated in the DocBin by the conversion cells above.
len(doc_bin)

146

In [10]:
# Serialise all converted Docs into a single .spacy file next to the source annotations.
output_path = "./../../../Data/NER_annotated_data/Job_Mod_Descriptions/mod_annotations.spacy"
doc_bin.to_disk(output_path)

In [11]:
# Flatten the per-Doc entity lists in `all_ents` into one list of spans.
all_skills = [span for doc_ents in all_ents for span in doc_ents]
# Unique entity surface strings in sorted order. A set handles the
# de-duplication, replacing a membership scan over a growing list per span.
distinct_skills = sorted({span.text for span in all_skills})

In [12]:
# Show the sorted list of unique entity texts for a manual sanity check of the labels.
distinct_skills

["'artificial intelligence'",
 '),',
 ').',
 ',',
 '1-D design',
 '1D-CNNs',
 '3D user interaction',
 'AI',
 'AMPL',
 'ANOVA',
 'APIs',
 'Analysis of potential concurrency',
 'Android app',
 'Applied Operations Research',
 'Artificial Intelligence.',
 'Artificial intelligence',
 'Automatic methods of Information Retrieval',
 'Basic OpenGL',
 'Bitcoin',
 'Business Analytics',
 'Business Analytics Applications and Issues',
 'Business Forecasting',
 'C++',
 'C.',
 'C/C++',
 'CNN',
 'CRUD',
 'Computer Science',
 'Contract & Agency Law',
 'Convolutional Neural Network',
 'Counting methods',
 'Create, Read, Update and Destroy',
 'DDL',
 'DW/BI applications',
 'DW/BI systems.',
 'Data Definition Language',
 'Data Mining',
 'Data mining',
 'Database Management Systems',
 'Decentralized blockchain-based systems',
 'ESP32',
 'Essentials of Financial Management',
 'Ethernet',
 'Euclidean algorithm',
 'Excel',
 'Extended Reality',
 'Extended Reality.',
 'FDDI',
 'Financial and Managerial Accountin