## Setting up the work environment

In [3]:
# libraries
import json
import logging
import numpy as np
import os
import pickle
import plac
import random
import re
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import spacy
from spacy import displacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import sys
from tqdm import tqdm

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [5]:
# Versions
# Report the version of every library this notebook cares about.
# Versions are looked up through sys.modules rather than through the usual
# aliases (np, pd, sns, ...), so the cell also runs on a fresh kernel where
# some of those libraries were never imported: they are reported as
# "not available" instead of raising a NameError.
versions = {}
for label, module_name in (
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("re", "re"),
    ("Seaborn", "seaborn"),
    ("spacy", "spacy"),
):
    module = sys.modules.get(module_name)  # None when the module is not loaded
    versions[label] = getattr(module, "__version__", "not available")
    print(label + " version:", versions[label])

keras version: 2.2.4
nltk version: 3.4.5
numpy version: 1.18.1
pandas version: 1.0.1
plaidml version: 0.7.0
re version: 2.2.1
Seaborn version: 0.10.0
spacy version: 2.0.12


In [6]:
# Directories & Files
# The listing below is not the last expression of the cell and is not assigned,
# so its result is discarded.
os.listdir()

# Datasets directory
# The trailing slash is intentional: some cells build file paths as
# directory + file name.
directory = "./datasets/"

## Data Preprocessing

In [7]:
def convert_dataturks_to_spacy(JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The file is read as JSON Lines: every non-blank line is one JSON object
    with a "content" string and an "annotation" list. Each annotation carries
    a "label" (a single label or a list of labels) and a "points" list, of
    which only the first point is used.

    Args:
        JSON_FilePath: path of the Dataturks export file.

    Returns:
        A list of (text, {"entities": [(start, end, label), ...]}) tuples,
        one per record, in file order. `end` is exclusive.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "content", or an annotation lacks
            "points" or "label".
    """
    training_data = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(JSON_FilePath, "r", encoding="utf-8") as f:
        # Stream the file instead of loading every line into memory first.
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            data = json.loads(line)
            text = data["content"]
            entities = []

            # Records without labels may carry `"annotation": null` (or omit
            # the key); treat both as "no entities" instead of crashing.
            for annotation in data.get("annotation") or []:
                # only a single point in text annotation.
                point = annotation["points"][0]
                labels = annotation["label"]

                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    # Source indices are both inclusive [start, end], while
                    # spaCy expects a half-open span [start, end).
                    entities.append((point["start"], point["end"] + 1, label))

            training_data.append((text, {"entities": entities}))

    return training_data

In [2]:
# Load the annotated training set and convert it to spaCy's tuple format.
TRAIN_DATA = convert_dataturks_to_spacy(
    os.path.join(directory, "cv_traindata.json")
)

NameError: name 'convert_dataturks_to_spacy' is not defined

In [1]:
# Preview only the first training example: echoing the whole of TRAIN_DATA
# writes every document and its annotations into the notebook output.
TRAIN_DATA[:1]

NameError: name 'TRAIN_DATA' is not defined

# Preparing the model

In [10]:
%%time
# Creating a NLP object // Loading the model
# Exactly one of the lines below should be active. The commented-out lines are
# the alternative starting points, with the sizes noted by the author;
# spacy.blank("en") starts from an empty English pipeline instead.
#nlp = spacy.load("en_core_web_sm") # 11MB
#nlp = spacy.load("en_core_web_md") # 91MB
nlp = spacy.load("en_core_web_lg") # 789MB
#nlp = spacy.blank("en")

CPU times: user 6.58 s, sys: 810 ms, total: 7.39 s
Wall time: 7.71 s


In [11]:
# Fetch the pipeline's NER component, creating and appending one when the
# loaded pipeline does not ship with it.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

In [12]:
# Register every entity label that occurs in the training annotations with the
# NER component (the label is the third item of each entity tuple).
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        entity_label = ent[2]
        ner.add_label(entity_label)

In [13]:
# Every pipeline component except NER; these are switched off while training.
other_pipes = [name for name in nlp.pipe_names if name != "ner"]

In [None]:
%%time
# Training
with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        
        for itn in range(10):
            print("Statring iteration " + str(itn))
            
            random.shuffle(TRAIN_DATA)
            
            losses = {}
            
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop = 0.2,  # dropout - make it harder to memorise data
                    sgd = optimizer,  # callable to update weights
                    losses = losses)
                
            print(losses)

  0%|          | 1/200 [00:00<00:23,  8.29it/s]

Statring iteration 0


100%|██████████| 200/200 [00:29<00:00,  6.79it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

{'ner': 22648.764003465345}
Statring iteration 1


100%|██████████| 200/200 [00:30<00:00,  6.58it/s]
  0%|          | 1/200 [00:00<00:38,  5.16it/s]

{'ner': 21429.838546635037}
Statring iteration 2


100%|██████████| 200/200 [00:29<00:00,  6.68it/s]
  0%|          | 1/200 [00:00<00:20,  9.66it/s]

{'ner': 21101.109041517266}
Statring iteration 3


 40%|████      | 80/200 [00:12<00:20,  5.93it/s]

In [None]:
# Test the model and evaluate it.
#
# For every test document this cell
#   1. writes the predicted entities, grouped by label, to "resume<N>.txt";
#   2. scores each predicted label at token level (label vs. "Not <label>")
#      against the gold annotations;
# and finally prints, per label, the scores averaged over all documents in
# which the model predicted that label.
examples = convert_dataturks_to_spacy(directory + "cv_testdata.json")

# Not used by this cell; kept only in case other cells still reference them.
tp = 0
tr = 0
tf = 0
ta = 0

c = 0  # index of the current test document, used in the output file name

# Aggregated over ALL test documents (not reset per document):
# label -> [seen flag, precision sum, recall sum, F-score sum, accuracy sum,
#           number of documents scored]
d = {}

for text, annot in examples:
    doc_to_test = nlp(text)

    # Predicted entity texts grouped by label, in order of first appearance.
    predicted = {}
    for ent in doc_to_test.ents:
        predicted.setdefault(ent.label_, []).append(ent.text)

    # The context manager guarantees the report is flushed and closed; UTF-8
    # keeps non-ASCII document text from failing on platforms with a narrower
    # default encoding.
    with open("resume" + str(c) + ".txt", "w", encoding="utf-8") as report:
        for label, entity_texts in predicted.items():
            report.write("\n\n")
            report.write(label + ":" + "\n")
            # dict.fromkeys drops duplicates while keeping a stable order.
            for entity_text in dict.fromkeys(entity_texts):
                report.write(entity_text.replace('\n', '') + "\n")

    # The gold parse depends only on the document, so build it once here
    # rather than once per predicted entity.
    gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))
    # gold.ner holds one tag per token ("B-Label", "I-Label", "O", ...).
    # Strip the positional prefix so labels are compared exactly; a substring
    # test would make "Name" match "College Name" as well.
    gold_labels = [tag.split("-", 1)[-1] if tag else None for tag in gold.ner]

    for label in predicted:
        y_true = [label if gold_label == label else 'Not ' + label
                  for gold_label in gold_labels]
        y_pred = [label if token.ent_type_ == label else 'Not ' + label
                  for token in doc_to_test]

        p, r, fscore, _support = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')
        a = accuracy_score(y_true, y_pred)

        totals = d.setdefault(label, [0, 0, 0, 0, 0, 0])
        totals[0] = 1
        totals[1] += p
        totals[2] += r
        totals[3] += fscore
        totals[4] += a
        totals[5] += 1  # one more document contributed a score for this label

    c += 1

for i in d:
    print("\n For Entity "+i+"\n")
    print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
    print("Precision : "+str(d[i][1]/d[i][5]))
    print("Recall : "+str(d[i][2]/d[i][5]))
    print("F-score : "+str(d[i][3]/d[i][5]))