### Import Libraries

In [1]:
import random
from pathlib import Path
import spacy
from spacy.training import Example
import os, json, warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [2]:
nlp = spacy.blank('en')
ner = nlp.add_pipe('ner', last=True)

 # Disable all piplines except NER
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# nlp.disable_pipes(*other_pipes)

### Import training dataset

In [3]:
f = open('Train_2022-02-16_01_07_24.json')
data = json.load(f)

In [4]:
TRAIN_DATA = []
for i, d in enumerate(data):
    text = data[i]['text']
    entities = []
    mapper = {}
    for ent in data[i]['ents']:
        key = f"{ent['start']}${ent['end']}${ent['label']}"
        if mapper.get(key, False) != True:
            mapper[key] = True
            entities.append((ent['start'], ent['end'], ent['label']))
    TRAIN_DATA.append((text, {
        'entities': entities
    }))

### Model Training

In [5]:
# Defining parameters for training
output_dir = Path(os.getcwd(), 'saved_model')
n_iter = 100

In [10]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

optimizer = nlp.begin_training()
for itn in range(n_iter):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA: # tqdm(TRAIN_DATA) can be used to see the model training progress
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update(
            [example],
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print(losses)

{'ner': 1004.70881741622}
{'ner': 534.55289473207}
{'ner': 443.0953762944556}
{'ner': 395.99133800218505}
{'ner': 330.59544258465917}
{'ner': 329.94082053046293}
{'ner': 523.957489516199}
{'ner': 247.3192374596662}
{'ner': 244.9045927527701}
{'ner': 205.255995108754}
{'ner': 201.78153158291022}
{'ner': 174.82024450608515}
{'ner': 157.0076633480524}
{'ner': 150.78791096416722}
{'ner': 139.2540910372134}
{'ner': 142.5756446238851}
{'ner': 117.9162394821336}
{'ner': 108.13154696229878}
{'ner': 105.14351050460984}
{'ner': 100.6451323930089}
{'ner': 86.82780310918668}
{'ner': 77.48998784476476}
{'ner': 83.57492302858128}
{'ner': 79.65592163566761}
{'ner': 53.06792261424719}
{'ner': 178.10974815218736}
{'ner': 75.98250230605782}
{'ner': 66.66531527595286}
{'ner': 63.05211813965246}
{'ner': 50.38186690904505}
{'ner': 62.514723458766234}
{'ner': 52.48319019483163}
{'ner': 43.31216586686776}
{'ner': 60.0803628568895}
{'ner': 33.09192544513113}
{'ner': 59.31353413201182}
{'ner': 39.8537197756249

### Model Validation

In [7]:
for text, _ in TRAIN_DATA:
    print(text, _, sep='****\n')
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Blacksmith Solution  Excellent Python (NumPy, pandas, scikit-learn, etc.) or R programming skills, familiar with open source libraries and tools for data science Track record of speedily and rigorously developing and deploying machine learning models to resolve industry problems Strong SQL skills Superior communication and data visualization skills Experience working in an agile development environment****
{'entities': [(0, 19, 'ORG'), (31, 37, 'SKILL'), (39, 44, 'SKILL'), (46, 52, 'SKILL'), (54, 66, 'SKILL'), (77, 90, 'SKILL'), (149, 161, 'SKILL'), (227, 243, 'SKILL'), (287, 290, 'SKILL'), (325, 343, 'SKILL')]}
Entities [('Blacksmith Solution', 'ORG'), ('Python', 'SKILL'), ('NumPy', 'SKILL'), ('pandas', 'SKILL'), ('scikit-learn', 'SKILL'), ('R programming', 'SKILL'), ('data science', 'SKILL'), ('machine learning', 'SKILL'), ('SQL', 'SKILL'), ('data visualization', 'SKILL')]
Tokens [('Blacksmith', 'ORG', 3), ('Solution', 'ORG', 1), (' ', '', 2), ('Excellent', '', 2), ('Python', 'SKILL'

In [8]:
text = TRAIN_DATA[1][0]
entities_list = TRAIN_DATA[1][1]
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Bosch', 'ORG'), ('Python', 'SKILL'), ('pandas', 'SKILL'), ('sklearn', 'SKILL'), ('numpy', 'SKILL'), ('scipy', 'SKILL'), ('matplotlib', 'SKILL'), ('Spark MLlib', 'SKILL'), ('Spark', 'SKILL'), ('Hadoop', 'SKILL'), ('Kafka', 'SKILL'), ('Hive', 'SKILL'), ('SQL', 'SKILL'), ('machine learning', 'SKILL'), ('k-NN', 'SKILL'), ('Naive Bayes', 'SKILL'), ('SVM', 'SKILL'), ('Decision Forests', 'SKILL'), ('Python', 'SKILL'), ('GIT', 'SKILL'), ('Python', 'SKILL'), ('Web2Py', 'SKILL'), ('Django/', 'SKILL'), ('Flask', 'SKILL'), ('D3.js', 'SKILL'), ('GGplot', 'SKILL'), ('Selecting features', 'SKILL'), ('optimizing classifiers', 'SKILL'), ('machine learning', 'SKILL'), ('Data mining', 'SKILL'), ('detection systems', 'SKILL'), ('Model Deployments', 'SKILL')]


### Save Model

In [17]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to D:\Nandeesh\Practice\GitHub Files\practice-files\Volunteered Projects\JD Matcher\saved_model


### Test the saved model

In [18]:
# print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
text = TRAIN_DATA[1][0]
entities_list = TRAIN_DATA[1][1]
doc = nlp2(text)
skills = {ent.text for ent in doc.ents if ent.label_ == 'SKILL'}
print(text, skills, sep='\n\n')

Blacksmith Solution  Excellent Python (NumPy, pandas, scikit-learn, etc.) or R programming skills, familiar with open source libraries and tools for data science Track record of speedily and rigorously developing and deploying machine learning models to resolve industry problems Strong SQL skills Superior communication and data visualization skills Experience working in an agile development environment

{'R programming', 'pandas', 'data visualization', 'SQL', 'NumPy', 'scikit-learn', 'machine learning', 'Python', 'data science'}
