In [1]:
import io
import os
import re
import nltk
import pandas as pd
import docx2txt
import constants as cs
from datetime import datetime
from dateutil import relativedelta
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
import csv
from spacy.matcher import Matcher

In [48]:
degree=None
companies=None
designation=None
number=None
designation=None
college=None

In [3]:

def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files
    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for page in PDFPage.get_pages(
                        fh,
                        caching=True,
                        check_extractable=True
                ):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(
                        resource_manager,
                        fake_file_handle,
                        codec='utf-8',
                        laparams=LAParams()
                    )
                    page_interpreter = PDFPageInterpreter(
                        resource_manager,
                        converter
                    )
                    page_interpreter.process_page(page)

                    text = fake_file_handle.getvalue()
                    yield text

                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return
    else:
        # extract text from remote pdf file
        try:
            for page in PDFPage.get_pages(
                    pdf_path,
                    caching=True,
                    check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)

                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return


def get_number_of_pages(file_name):
    try:
        if isinstance(file_name, io.BytesIO):
            # for remote pdf file
            count = 0
            for page in PDFPage.get_pages(
                        file_name,
                        caching=True,
                        check_extractable=True
            ):
                count += 1
            return count
        else:
            # for local pdf file
            if file_name.endswith('.pdf'):
                count = 0
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(
                            fh,
                            caching=True,
                            check_extractable=True
                    ):
                        count += 1
                return count
            else:
                return None
    except PDFSyntaxError:
        return None

def extract_text(file_path): 
    text = ''
    for page in extract_text_from_pdf(file_path):
            text += ' ' + page

    return text


In [4]:
path="/home/amogh/Forkaia/Resume-Survivor/Amogh-Sondur-Resume.pdf"

text=extract_text(path)

In [5]:
def extract_email(text):
    '''
    Helper function to extract email id from text
    :param text: plain text extracted from resume file
    '''
    email = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", text)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None

In [6]:
def extract_mobile_number(text):
        mob_num_regex = r'''(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)[-\.\s]*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'''
        phone = re.findall(re.compile(mob_num_regex), text)
        if phone:
            number = ''.join(phone[0])
        return number



In [7]:
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)


def extract_full_name(nlp_doc):
     pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
     matcher.add('FULL_NAME', None, pattern)
     matches = matcher(nlp_doc)
     for match_id, start, end in matches:
         span = nlp_doc[start:end]
         return span.text

## Skills

In [8]:
nlp = spacy.load('en_core_web_sm')
matcher=Matcher(nlp.vocab)
doc=nlp(text)
noun_chunks=doc.noun_chunks

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # reading the csv file
    data = pd.read_csv("/home/amogh/Forkaia/Resume-Survivor/skills.csv") 
    
    # extract values
    skills = list(data.columns.values)
     
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    # check for bi-grams and tri-grams (example: machine learning)
    for token in doc.noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    
    return [i.capitalize() for i in set([i.lower() for i in skillset])]



## Education

In [9]:
# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII','Masters','Bachelors','PhD'
        ]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education


## Experience

In [10]:
def extract_company(resume_text):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
            w for w in word_tokens if w not
            in stop_words and wordnet_lemmatizer.lemmatize(w)
            not in stop_words
        ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []

    for vp in list(
        cs.subtrees(filter=lambda x: x.label() == 'P')
    ):
        test.append(" ".join([
            i[0] for i in vp.leaves()
            if len(vp.leaves()) >= 2])
        )

    # Search the word 'experience' in the chunk and
    # then print out the text after it
    x = [
        x[x.lower().index('experience') + 10:]
        for i, x in enumerate(test)
        if x and 'experience' in x.lower()
    ]
    return x


## City

In [12]:
def city_finder(text_data):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text_data)
    for ents in doc.ents:
        if(ents.label_ == 'GPE'):
            return (ents.text)	


## Entities

In [13]:
RESUME_SECTIONS_GRAD = [
                    'work experience',
                    'accomplishments',
                    'WORK EXPERIENCE',
                    'experience',
                    'education',
                    'interests',
                    'projects',
                    'professional experience',
                    'publications',
                    'skills',
                    'certifications',
                    'objective',
                    'career objective',
                    'summary',
                    'leadership'
                    
                ]


def extract_entity_sections_grad(text):
    '''
    Helper function to extract all the raw text from sections of
    resume specifically for graduates and undergraduates
    :param text: Raw text of resume
    :return: dictionary of entities
    '''
    text_split = [i.strip() for i in text.split('\n')]
    # sections_in_resume = [i for i in text_split if i.lower() in sections]
    entities = {}
    key = False
    for phrase in text_split:
        if len(phrase) == 1:
            p_key = phrase
        else:
            p_key = set(phrase.lower().split()) & set(RESUME_SECTIONS_GRAD)
        try:
            p_key = list(p_key)[0]
        except IndexError:
            pass
        if p_key in RESUME_SECTIONS_GRAD:
            entities[p_key] = []
            key = p_key
        elif key and phrase.strip():
            entities[key].append(phrase)

    # entity_key = False
    # for entity in entities.keys():
    #     sub_entities = {}
    #     for entry in entities[entity]:
    #         if u'\u2022' not in entry:
    #             sub_entities[entry] = []
    #             entity_key = entry
    #         elif entity_key:
    #             sub_entities[entity_key].append(entry)
    #     entities[entity] = sub_entities

    # pprint.pprint(entities)

    # make entities that are not found None
    # for entity in cs.RESUME_SECTIONS:
    #     if entity not in entities.keys():
    #         entities[entity] = None
    return entities

entites=extract_entity_sections_grad(text)


## Custom Entities

In [14]:

"""Custom Entities"""

def extract_entities_wih_custom_model(custom_nlp_text):
    '''
    Helper function to extract different entities with custom
    trained model using SpaCy's NER
    :param custom_nlp_text: object of `spacy.tokens.doc.Doc`
    :return: dictionary of entities
    '''
    entities = {}
    for ent in custom_nlp_text.ents:
        if ent.label_ not in entities.keys():
            entities[ent.label_] = [ent.text]
        else:
            entities[ent.label_].append(ent.text)
    for key in entities.keys():
        entities[key] = list(set(entities[key]))
    return entities


custom_nlp=spacy.load(os.path.dirname(os.path.abspath(path)))
custom_nlp=custom_nlp(text)
cust_ent=extract_entities_wih_custom_model(custom_nlp)


# Outputs

In [49]:
try:
    name = cust_ent['Name'][0]
except (IndexError, KeyError):
    name=extract_full_name(nlp(text))

In [50]:
name

'Amogh Sondur'

In [51]:
email=extract_email(text)
email

'asondur@hawk.iit.edu'

In [52]:
number=extract_mobile_number(text)
number

'(317) 748-9445'

In [53]:
city=city_finder(text)
city

'CHICAGO'

In [54]:
skills=extract_skills(text)
skills

['Linux',
 'Tensorflow',
 '.net',
 'Windows',
 'Hadoop',
 'Automation',
 'Opencv',
 'C',
 'Aws',
 'Microsoft excel',
 'Java',
 'R',
 'Python',
 'Excel',
 'Javascript',
 'Keras',
 'Algorithms',
 'Numpy',
 'Machine learning',
 'System',
 'Pandas',
 'Css',
 'Engineering',
 'Green',
 'Mining',
 'Spark',
 'Matplotlib',
 'Powerpoint',
 'Computer science',
 'Database',
 'Sql',
 'Nltk',
 'Try',
 'Android',
 'Queries',
 'Tableau',
 'Word',
 'Php',
 'Hive',
 'Analytics',
 'Electrical',
 'C#',
 'Writing',
 'Customer service',
 'Html',
 'Unix',
 'Hotels',
 'C++',
 'Nosql',
 'Visual']

In [55]:
try:
    degree=cust_ent['Degree']
except KeyError:
    pass
degree

['Master of Science – Data Science',
 'Bachelor’s – Computer Science and Engineering']

In [56]:
try:
    college=entites['College Name']
except KeyError:
    pass


In [57]:
try:
    education=entites['education']
except KeyError:
    pass



In [58]:
try:
    degree=cust_ent['Degree']
except KeyError:
    pass
degree

['Master of Science – Data Science',
 'Bachelor’s – Computer Science and Engineering']

In [59]:
try:
    companies=cust_ent['Companies worked at']
except KeyError:
    pass
companies

['Amazon']

In [64]:
record={
         'Name':name,
         'Email':email,
         'Phone-no':number,
         'City':city,
         'Degree':degree,
         'Designation':designation,
         'Companies Worked At':companies,
          'Skills':skills
         }   


import json    

with open('resume.json', 'w') as outfile:
    json.dump(record, outfile)

In [65]:
record

{'Name': 'Amogh Sondur',
 'Email': 'asondur@hawk.iit.edu',
 'Phone-no': '(317) 748-9445',
 'City': 'CHICAGO',
 'Degree': ['Master of Science – Data Science',
  'Bachelor’s – Computer Science and Engineering'],
 'Designation': None,
 'Companies Worked At': ['Amazon'],
 'Skills': ['Linux',
  'Tensorflow',
  '.net',
  'Windows',
  'Hadoop',
  'Automation',
  'Opencv',
  'C',
  'Aws',
  'Microsoft excel',
  'Java',
  'R',
  'Python',
  'Excel',
  'Javascript',
  'Keras',
  'Algorithms',
  'Numpy',
  'Machine learning',
  'System',
  'Pandas',
  'Css',
  'Engineering',
  'Green',
  'Mining',
  'Spark',
  'Matplotlib',
  'Powerpoint',
  'Computer science',
  'Database',
  'Sql',
  'Nltk',
  'Try',
  'Android',
  'Queries',
  'Tableau',
  'Word',
  'Php',
  'Hive',
  'Analytics',
  'Electrical',
  'C#',
  'Writing',
  'Customer service',
  'Html',
  'Unix',
  'Hotels',
  'C++',
  'Nosql',
  'Visual']}