In [None]:
!pip install textract -q

In [3]:
import json
import pickle
import pandas as pd

def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
        
def pickle_load(file):
    
    data = pickle.load(open(file,'rb'))
    return data

data = load_data("700data_3.json")

In [3]:
import spacy
nlp=spacy.load('en_core_web_sm')

ner=nlp.get_pipe("ner")

In [4]:
# Adding labels to the `ner`

for _, annotations in data:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])


In [5]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


In [6]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(1):

    # shuufling examples  before every iteration
    random.shuffle(data)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    print("Losses", losses)

Losses {'ner': 222174.5624575764}


In [8]:
doc = nlp(data[3][0])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Vijayalakshmi Govindarajan', 'Name'), ('SAP as', 'Companies worked at'), ('SAP Basis -', 'Companies worked at')]


In [2]:
!python -m spacy download en_core_web_sm -q


[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
# Train NER from a blank spacy model
import spacy
import spacy
# Importing requirements
from spacy.util import minibatch, compounding
import random

tags_vals = ['Empty','UNKNOWN','Email Address', 'Links', 'Skills', 'Graduation Year', 'College Name', 'Degree', 'Companies worked at', 'Location', 'Name', 'Designation', 'projects', 'Years of Experience', 'Can Relocate to', 'Rewards and Achievements', 'Address', 'University', 'Relocate to', 'Certifications', 'state', 'links', 'College', 'training', 'des', 'abc']

#spacy.prefer_gpu()
nlp=spacy.load("en_core_web_sm") 

# Getting the ner component
ner=nlp.get_pipe('ner')

# Add the new label to ner
for i in tags_vals:
  ner.add_label(i)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :
  print("------STARTING-------")
  sizes = compounding(1.0, 4.0, 1.001)
  # Training for 30 iterations     
  for itn in range(30):
    # shuffle examples before training
    random.shuffle(data)
    # batch up the examples using spaCy's minibatch
    batches = minibatch(data, size=sizes)
    # ictionary to store losses
    losses = {}
    for batch in batches:
      texts, annotations = zip(*batch)
      # Calling update() over the iteration
      nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
    print("Losses", losses)

------STARTING-------
Losses {'ner': 286893.8657830385}
Losses {'ner': 273688.8629359987}
Losses {'ner': 271085.58670354635}
Losses {'ner': 272182.9842940569}
Losses {'ner': 269024.11262334883}
Losses {'ner': 270455.4656434059}
Losses {'ner': 266808.90157675743}
Losses {'ner': 268779.64239776134}
Losses {'ner': 267750.3622187376}
Losses {'ner': 266806.4804009795}
Losses {'ner': 267340.73468375206}
Losses {'ner': 265742.1821196079}
Losses {'ner': 267040.25214481354}
Losses {'ner': 268229.3312683835}
Losses {'ner': 267109.0554922819}
Losses {'ner': 269100.0800716877}
Losses {'ner': 265854.11294198036}
Losses {'ner': 267282.1326575279}
Losses {'ner': 266667.692186594}
Losses {'ner': 265838.53541493416}
Losses {'ner': 266207.21528328815}
Losses {'ner': 266613.3947337866}
Losses {'ner': 265761.3803472519}
Losses {'ner': 266707.6562460512}
Losses {'ner': 262798.4459848404}
Losses {'ner': 264642.7012481466}
Losses {'ner': 264651.99743089825}
Losses {'ner': 268844.89882114343}
Losses {'ner': 2

In [30]:
import textract
import re

text1 = str(textract.process('/content/content/Sayli Sunil Gaikwad_14625.pdf'))
text1 = re.sub("\n","",text1)

In [31]:
doc = nlp(text1)
for ent in doc.ents:
    print(f'{ent.label_.upper():{40}}-{ent.text}')

NAME                                    -b'
DESIGNATION                             -Senior Test Engineer
LOCATION                                -Mumbai
LOCATION                                -Mumbai
LOCATION                                -Mumbai
DESIGNATION                             -Senior Test Engineer
LOCATION                                -Mumbai
LOCATION                                -Mumbai
LOCATION                                -Mumbai


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [10]:
nlp.to_disk("/tensorflow-1.15.2/new_30_resume_trains")

In [None]:
!zip -r /content/new.zip /content/content/ 