In [47]:
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
import spacy
from spacy.lang.en import English 

In [48]:
from pydantic import BaseModel
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import requests

In [49]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    str = retstr.getvalue()
    retstr.close()
    return str

In [50]:
def toSpacyFormat(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

In [51]:
nlp = English()  # create blank Language class
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

In [52]:
def train_spacy():

    TRAIN_DATA = toSpacyFormat("traindata.json")

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
#     #test the model and evaluate it
#     examples = convert_dataturks_to_spacy("testdata.json")
#     tp=0
#     tr=0
#     tf=0

#     ta=0
#     c=0        
#     for text,annot in examples:

#         f=open("resume"+str(c)+".txt","w")
#         doc_to_test=nlp(text)
#         d={}
#         for ent in doc_to_test.ents:
#             d[ent.label_]=[]
#         for ent in doc_to_test.ents:
#             d[ent.label_].append(ent.text)

#         for i in set(d.keys()):

#             f.write("\n\n")
#             f.write(i +":"+"\n")
#             for j in set(d[i]):
#                 f.write(j.replace('\n','')+"\n")
#         d={}
#         for ent in doc_to_test.ents:
#             d[ent.label_]=[0,0,0,0,0,0]
#         for ent in doc_to_test.ents:
#             doc_gold_text= nlp.make_doc(text)
#             gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
#             y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
#             y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
#             if(d[ent.label_][0]==0):
#                 #f.write("For Entity "+ent.label_+"\n")   
#                 #f.write(classification_report(y_true, y_pred)+"\n")
#                 (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
#                 a=accuracy_score(y_true,y_pred)
#                 d[ent.label_][0]=1
#                 d[ent.label_][1]+=p
#                 d[ent.label_][2]+=r
#                 d[ent.label_][3]+=f
#                 d[ent.label_][4]+=a
#                 d[ent.label_][5]+=1
#         c+=1
#     for i in d:
#         print("\n For Entity "+i+"\n")
#         print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
#         print("Precision : "+str(d[i][1]/d[i][5]))
#         print("Recall : "+str(d[i][2]/d[i][5]))
#         print("F-score : "+str(d[i][3]/d[i][5]))

In [53]:
def test_spacy():
    #test the model and evaluate it
    examples = toSpacyFormat("testdata.json")
    tp=0
    tr=0
    tf=0

    ta=0
    c=0        
    for text,annot in examples:
        f=open("resume"+str(c)+".txt","w")
        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):

            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[0,0,0,0,0,0]
        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
            if(d[ent.label_][0]==0):
                #f.write("For Entity "+ent.label_+"\n")   
                #f.write(classification_report(y_true, y_pred)+"\n")
                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                d[ent.label_][0]=1
                d[ent.label_][1]+=p
                d[ent.label_][2]+=r
                d[ent.label_][3]+=f
                d[ent.label_][4]+=a
                d[ent.label_][5]+=1
        c+=1
    for i in d:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
        print("Precision : "+str(d[i][1]/d[i][5]))
        print("Recall : "+str(d[i][2]/d[i][5]))
        print("F-score : "+str(d[i][3]/d[i][5]))

In [54]:
train_spacy()

Statring iteration 0
{'ner': 6079.00515327674}
Statring iteration 1
{'ner': 3952.8180454896874}
Statring iteration 2
{'ner': 3161.1680798167044}
Statring iteration 3
{'ner': 2784.541548090144}
Statring iteration 4
{'ner': 2429.6215811912843}
Statring iteration 5
{'ner': 2133.6430278076527}
Statring iteration 6
{'ner': 1940.8564015718407}
Statring iteration 7
{'ner': 1812.774721963551}
Statring iteration 8
{'ner': 1631.7183162941312}
Statring iteration 9
{'ner': 1452.5932975334674}


In [55]:
test_spacy()

  'recall', 'true', average, warn_for)



 For Entity Name

Accuracy : 99.83805668016194%
Precision : 0.9983831936194594
Recall : 0.9983805668016195
F-score : 0.9981113185060555

 For Entity Location

Accuracy : 99.27125506072875%
Precision : 0.9927657005623397
Recall : 0.9927125506072875
F-score : 0.9897446574315648

 For Entity Companies worked at

Accuracy : 98.78542510121457%
Precision : 1.0
Recall : 0.9878542510121457
F-score : 0.9938900203665988

 For Entity Designation

Accuracy : 99.83805668016194%
Precision : 1.0
Recall : 0.9983805668016195
F-score : 0.9991896272285252

 For Entity College Name

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Graduation Year

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Skills

Accuracy : 97.4089068825911%
Precision : 0.9747631743523912
Recall : 0.974089068825911
F-score : 0.9643889853895924


In [56]:
nlp.to_disk('results1')

In [57]:
nlp1 = spacy.load("results1")

In [58]:
resume_text = convert_pdf_to_txt('Profile.pdf')
results = {}
doc_to_test=nlp1(resume_text)
d={}
for ent in doc_to_test.ents:
    d[ent.label_]=[]
for ent in doc_to_test.ents:
    d[ent.label_].append(ent.text)
for i in set(d.keys()):
    results[i] = d[i]
results

{'Companies worked at': ['Android Developer'],
 'College Name': ['SRM University'],
 'Location': ['Hyderabad', 'Kolkata'],
 'Degree': ['Bachelor of Technology (B.Tech.),\xa0Computer Science\xa0·\xa0(2015\xa0-\xa02019)'],
 'Name': ['\xa0\n\n\xa0\n\n\xa0\n\nContact'],
 'Designation': ['Mrinal Chandra',
  'Software Engineer',
  'Data Structures',
  'Software Engineer',
  'Associate Software Developer',
  'Teaching Assistant',
  'Android Developer'],
 'Graduation Year': ['2018', '2018', '2014']}

In [59]:
resume_text = convert_pdf_to_txt('Profile.pdf')
results = {}
doc_to_test=nlp(resume_text)
d={}
for ent in doc_to_test.ents:
    d[ent.label_]=[]
for ent in doc_to_test.ents:
    d[ent.label_].append(ent.text)
for i in set(d.keys()):
    results[i] = d[i]
results

{'Companies worked at': ['Android Developer'],
 'College Name': ['SRM University'],
 'Location': ['Hyderabad', 'Kolkata'],
 'Degree': ['Bachelor of Technology (B.Tech.),\xa0Computer Science\xa0·\xa0(2015\xa0-\xa02019)'],
 'Name': ['\xa0\n\n\xa0\n\n\xa0\n\nContact'],
 'Designation': ['Mrinal Chandra',
  'Software Engineer',
  'Data Structures',
  'Software Engineer',
  'Associate Software Developer',
  'Teaching Assistant',
  'Android Developer'],
 'Graduation Year': ['2018', '2018', '2014']}

In [72]:
def create_item(url: str):
    r = requests.get(url, stream = True) 
    with open("python.pdf","wb") as pdf:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                pdf.write(chunk) 
    resume_text = convert_pdf_to_txt('python.pdf')
    output_dir = 'results1'
    nlp = spacy.load(output_dir)
    doc_to_test=nlp(resume_text)
    d={}
    for ent in doc_to_test.ents:
        d[ent.label_]=[]
    for ent in doc_to_test.ents:
        d[ent.label_].append(ent.text)
    for i in set(d.keys()):
        results[i] = d[i]
    return results

In [71]:
import requests 
create_item('https://storage.googleapis.com/platform-documents/dev/demo_ee45d6c4c7834716af229011ab7bda08/Other/b1af68b756184e3abd9b7b6d18026ebe?GoogleAccessId=antelopesprinkle-ko@staging-auzmor.iam.gserviceaccount.com&Expires=1575016687&Signature=abJDKCLUozPZVIlJDYNGiAAtrAcq5t6Q4VcySk4hzwuUua2VpPVsoiUXl%2FziPaR4xwo8hE1LSCokqo3114%2FgwkWEDeQ6hc1%2Fp4Fm1nPRdO%2B4t%2Fq7oTh%2BtosBD0yCa2k7eflyEokFc9cPDGrbAc4z1No%2FeDxoT4Kj05MgvNC12B0OOHFEcbhtiIloT6kyE%2FFbgNI07dAZW31K0VcowJRt6WkfFytWM94jOYQIbHm5Z%2FU5jUxR9gXzAYHWti9Xl%2BZnqVwTrdqS3lXFo%2BzBpdh%2FH%2BdRgWi28zTR8ATE%2BWU1bSOb8KPOMhfyf7lauRk%2FW9ITQpzV9O0Gi8IfhfUaczft0Q%3D%3D')

b'%PDF-1.4\n%\xaa\xab\xac\xad\n1 0 obj\n<<\n/Title (Resume)\n/Author (LinkedIn)\n/Subject (Resume generated from profile)\n/Producer (Apache FOP Version 2.2)\n/CreationDate (D:20191125092855Z)\n>>\nendobj\n2 0 obj\n<<\n  /N 3\n  /Length 3 0 R\n  /Filter /FlateDecode\n>>\nstream\nx\x9c\x9d\x96wXS\xe7\x1e\xc7\xdfsN\xf6`$!l\x08{\x86\xa5@\x00\x91\x11\xa6\x80\x0c\xd9\xa2\x10\x92\x00\x01\x12 $\x0c\xf7@T\xb0\xa2\xa8\xc8R\x04)\x8aX\xb0Z\x86\xd4\x89(\x0e\x8a\xe2\xde\rR\x04\x94Z\xac\xe2\xc2\xd1D\x9e\xa7\xf5\xf6\xf6\xde\xdb\xdb\xef\x1f\xe7|\x9e\xdf\xfb\xfb\xbd\xe7\xfd\x8d\xf7y\x0e\x00\xa4\x80L\xae0\x17V\x01@(\x92\x88#\xfc\xbd\x19\xb1q\xf1\x0c\xec\x00\x80\x01\x1e`\x80=\x00\x1cnn\xb6WXX0\x90+\xd0\x97\xcd\xc8\x95;\x81\x7f\xd1\xab\x9b\x00R\xbc\xaf1\x15{\x81\xffO\xaa\xdcl\xb1\x04\x00(L\xce\xb3x\xfc\\\xae\x9c\x8b\xe4\x9c\x99/\xc9V\xd8\'\xe5LK\xceP0\x8cR\xb0X~@9k(8u\x86\xad?\xfb\xcc\xb0\xa7\x82yB\x11O\xce\x91r\xce\xe6\ty\n\xee\x95\xf3\x86<)_\xce\x88"\x97\xe2<\x01?_\xce\xd7\xe5l\x9c)\x15\n\xe4\xfcF\x11+\

{'Companies worked at': ['Android Developer'],
 'College Name': ['SRM University'],
 'Location': ['Hyderabad', 'Kolkata'],
 'Degree': ['Bachelor of Technology (B.Tech.),\xa0Computer Science\xa0·\xa0(2015\xa0-\xa02019)'],
 'Name': ['\xa0\n\n\xa0\n\n\xa0\n\nContact'],
 'Designation': ['Mrinal Chandra',
  'Software Engineer',
  'Data Structures',
  'Software Engineer',
  'Associate Software Developer',
  'Teaching Assistant',
  'Android Developer'],
 'Graduation Year': ['2018', '2018', '2014']}