Read all the resumes (PDF and Word files) from the iNeuron Git repository and build a DataFrame with
- the resume file name as the index; and
- email id, linkedin id, github id, and skills as columns.

In [4]:
import pandas as pd
from docx import Document
from striprtf.striprtf import rtf_to_text
import PyPDF2
import pdfx
import re
import os

In [14]:
def match_pattern(element, pattern, text):
    """Return the first substring of ``text`` that matches ``pattern``.

    Args:
        element: Accepted for call compatibility only; its incoming value
            is never read.
        pattern: Regular expression handed to ``re.search``.
        text: String to search.

    Returns:
        The full matched substring (group 0), or ``None`` when nothing
        matches.  Errors are not raised: any exception hit while searching
        (bad pattern, non-string ``text``) is returned as the result.
    """
    try:
        found = re.search(pattern, text)
        # A match object is always truthy, even for an empty match.
        return found.group(0) if found else found
    except Exception as err:
        return err

def processDocs(text, emailpattern, linkedinpattern, githubpattern,
                email, linkedinId, githubId, skills):
    """Fill in the contact fields of one resume from its extracted text.

    A field that already has a truthy value is kept as given; otherwise
    it is looked up in ``text`` with the corresponding pattern through
    ``match_pattern``.  ``skills`` is passed through untouched.

    Args:
        text: Plain text extracted from the resume.
        emailpattern, linkedinpattern, githubpattern: Regular expressions
            used for the three look-ups.
        email, linkedinId, githubId: Values already known, or a falsy
            value to request a look-up.
        skills: Value stored under the ``"skills"`` key as-is.

    Returns:
        A dict with the keys ``"email"``, ``"linkedin id"``,
        ``"github id"`` and ``"skills"``.  Errors are not raised: any
        exception is returned in place of the dict.
    """
    try:
        # Only search for what the caller has not supplied.
        email = email or match_pattern(email, emailpattern, text)
        linkedinId = linkedinId or match_pattern(
            linkedinId, linkedinpattern, text)
        githubId = githubId or match_pattern(githubId, githubpattern, text)

        # Placeholder for skill extraction: the keyword test is evaluated
        # but nothing is done with its outcome yet.
        _ = 'skill' in text.lower()

        keys = ("email", "linkedin id", "github id", "skills")
        values = (email, linkedinId, githubId, skills)
        return dict(zip(keys, values))
    except Exception as err:
        return err
    
def processWordDocument(filepath, emailpattern,
                        linkedinpattern, githubpattern, email=None,
                        linkedinId=None, githubId=None, skills=None):
    """Extract contact fields from a Word (.docx) resume.

    The text of every paragraph in ``doc.paragraphs`` is collected and
    handed to ``processDocs``.
    NOTE(review): text held in tables, headers or hyperlink targets is
    presumably not part of ``doc.paragraphs`` - confirm against python-docx.

    Args:
        filepath: Path of the .docx file.
        emailpattern, linkedinpattern, githubpattern: Regular expressions
            forwarded to ``processDocs``.
        email, linkedinId, githubId: Optional pre-known values; a falsy
            value means "search the document".
        skills: Optional list stored under ``"skills"``; a fresh empty
            list is used when omitted.

    Returns:
        The dict produced by ``processDocs``.  Errors are not raised: any
        exception (unreadable file, etc.) is returned instead.
    """
    try:
        if skills is None:
            # A fresh list per call; a shared ``[]`` default would be the
            # very same object in every returned row.
            skills = []

        doc = Document(filepath)
        # Keep paragraph boundaries.  Gluing paragraphs together lets a
        # match run on into the first word of the following paragraph
        # (e.g. "a@b.com" + "LinkedIn" -> "a@b.comLinkedIn").
        resultText = '\n'.join(p.text for p in doc.paragraphs)

        return processDocs(resultText, emailpattern,
                           linkedinpattern, githubpattern,
                           email, linkedinId, githubId, skills)
    except Exception as e:
        return e

def getPDFLinks(filepath):
    """Collect the URI targets of the link annotations in a PDF.

    Every page's ``/Annots`` array is walked; for each annotation whose
    ``/A`` action dictionary carries a ``/URI`` entry, that URI is kept
    with any ``mailto:`` text removed.

    NOTE(review): ``PdfFileReader`` / ``getNumPages`` / ``getPage`` /
    ``getObject`` are the legacy camelCase PyPDF2 names - confirm the
    installed PyPDF2 version still provides them.

    Args:
        filepath: Path of the PDF file.

    Returns:
        A list of URI strings (possibly empty).  Errors are not raised:
        any exception (missing file, unreadable PDF, ...) is returned
        instead of the list.
    """
    try:
        key = '/Annots'
        uri = '/URI'
        ank = '/A'
        links = []

        with open(filepath, 'rb') as PDFFile:
            PDF = PyPDF2.PdfFileReader(PDFFile)

            for page in range(PDF.getNumPages()):
                pageObject = PDF.getPage(page).getObject()
                if key not in pageObject:
                    continue

                for a in pageObject[key]:
                    u = a.getObject()
                    # Not every annotation has an action dictionary;
                    # indexing u['/A'] blindly raised KeyError and threw
                    # away the links of the entire file.
                    if ank not in u:
                        continue
                    action = u[ank]
                    if uri in action:
                        links.append(re.sub('mailto:', '', action[uri]))
        return links
    except Exception as e:
        return e

def processPDFDocs(filepath, emailpattern,
                    linkedinpattern, githubpattern, email=None,
                    linkedinId=None, githubId=None, skills=None):
    """Extract contact fields from a PDF resume.

    The searchable text is the page text from ``pdfx`` followed by the
    link targets returned by ``getPDFLinks``; it is then handed to
    ``processDocs``.

    Args:
        filepath: Path of the PDF file.
        emailpattern, linkedinpattern, githubpattern: Regular expressions
            forwarded to ``processDocs``.
        email, linkedinId, githubId: Optional pre-known values; a falsy
            value means "search the document".
        skills: Optional list stored under ``"skills"``; a fresh empty
            list is used when omitted.

    Returns:
        The dict produced by ``processDocs``.  Errors are not raised: any
        exception is returned instead.
    """
    try:
        if skills is None:
            # A fresh list per call; a shared ``[]`` default would be the
            # very same object in every returned row.
            skills = []

        links = getPDFLinks(filepath)
        if isinstance(links, Exception):
            # getPDFLinks reports failure by returning the exception.
            # Link extraction is best effort: fall back to the page text.
            links = []

        pdfText = pdfx.PDFx(filepath).get_text()
        # Separate the links from the page text; without the newline the
        # last word of the text fused with the first link.
        pdfText = pdfText + '\n' + ' '.join(links)

        return processDocs(pdfText, emailpattern, linkedinpattern,
                           githubpattern, email, linkedinId, githubId, skills)
    except Exception as e:
        return e
    
def processGenericDocs(filepath, emailpattern,
                    linkedinpattern, githubpattern, email=None,
                    linkedinId=None, githubId=None, skills=None):
    """Extract contact fields from any file that is not .pdf or .docx.

    The file is read as ISO-8859-1 text, passed through ``rtf_to_text``
    and the result is handed to ``processDocs``.

    Args:
        filepath: Path of the file.
        emailpattern, linkedinpattern, githubpattern: Regular expressions
            forwarded to ``processDocs``.
        email, linkedinId, githubId: Optional pre-known values; a falsy
            value means "search the document".
        skills: Optional list stored under ``"skills"``; a fresh empty
            list is used when omitted.

    Returns:
        The dict produced by ``processDocs``.  Errors are not raised: any
        exception (e.g. a missing file) is returned instead.
    """
    try:
        if skills is None:
            # A fresh list per call; a shared ``[]`` default would be the
            # very same object in every returned row.
            skills = []

        # ISO-8859-1 maps every byte to a character, so reading never
        # fails on decoding.
        with open(filepath, 'r', encoding="ISO-8859-1") as f:
            content = f.read()

        content = rtf_to_text(content)
        return processDocs(content, emailpattern,
                           linkedinpattern, githubpattern,
                           email, linkedinId, githubId, skills)
    except Exception as e:
        return e

def cvParser(dirpath):
    """Parse every resume file in ``dirpath`` into a list of row dicts.

    Files are dispatched on their extension: ``.pdf`` to
    ``processPDFDocs``, ``.docx`` to ``processWordDocument`` and
    everything else to ``processGenericDocs``.

    Args:
        dirpath: Directory containing the resume files.

    Returns:
        A list with one dict per file, holding the keys ``"name"``,
        ``"email"``, ``"linkedin id"``, ``"github id"`` and ``"skills"``,
        ordered by file name.  A file that cannot be parsed still gets a
        row (with empty fields) and a warning is issued.  Errors at the
        directory level are not raised: the exception is returned instead
        of the list.
    """
    import warnings  # local import: not part of the file-level imports

    try:
        # '-' is placed last in each class so it is a literal hyphen;
        # '[.-_]' was a range from '.' to '_' and '[A-Z|a-z]' also
        # accepted a literal '|'.
        emailregex = r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+'
        # Profile and user names may contain hyphens; stopping at the
        # first one truncated the id.
        linkedinregex = r'(http(s?):\/\/)?(www\.)?linkedin\.com\/in\/([A-Za-z0-9_%-]{1,})'
        githubregex =   r'(http(s?):\/\/)?(www\.)?github\.com\/([A-Za-z0-9-]{1,})'
        mylist = []

        # Use the argument, not the notebook-global ``resdir``.
        for filename in sorted(os.listdir(dirpath)):
            file = os.path.join(dirpath, filename)
            if not os.path.isfile(file):
                continue  # sub-directories are not resumes

            fextention = os.path.splitext(filename)[1].lower()
            if fextention == '.pdf':
                result = processPDFDocs(
                    file, emailregex, linkedinregex, githubregex)
            elif fextention == '.docx':
                result = processWordDocument(
                    file, emailregex, linkedinregex, githubregex)
            else:
                result = processGenericDocs(
                    file, emailregex, linkedinregex, githubregex)

            obj = {"name": filename}
            if isinstance(result, dict):
                obj.update(result)
            else:
                # The per-format parsers report failure by returning the
                # exception; one bad file must not abort the whole run.
                warnings.warn(
                    "Could not parse %r: %r" % (filename, result))
                obj.update({"email": None, "linkedin id": None,
                            "github id": None, "skills": []})

            mylist.append(obj)
        return mylist
    except Exception as e:
        return e


In [15]:
# Folder holding the resumes to parse (machine-specific path).
resdir = 'C:\\Users\\dell\\iNeuron\\python\\Pandas\\pandas_assessment2\\resumes'
cvlist = cvParser(resdir)

# cvParser reports a directory-level failure by returning the exception;
# surface it here instead of feeding it to DataFrame().
if isinstance(cvlist, Exception):
    raise cvlist

# set_index returns a new frame, so keep the result: the resume name is
# then really the index of ``dframe`` and not just of the displayed copy.
dframe = pd.DataFrame(cvlist).set_index('name')
dframe



Unnamed: 0_level_0,email,linkedin id,github id,skills
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12+ (2).docx,,,,[]
12+.docx,,,,[]
15+ (1).pdf,,,,[]
15+.pdf,,,,[]
20.pdf,,,,[]
3+ (2).docx,,,,[]
3+.docx,,,,[]
3+.pdf,,,,[]
4 .doc,,,,[]
4+.docx,,,,[]


In [4]:
# Scratch cell: builds a PDFx object for one sample resume and passes its
# ``uri`` attribute to pdfx.extract_urls, keeping the result in ``pdf``.
# NOTE(review): ``pdf`` is not read by any other cell shown here.
pdf = pdfx.extract_urls(pdfx.PDFx('resumes/mteh fresher.pdf').uri)
# Presumably another sample file tried with the same call: 'resumes/5+ .pdf'

In [12]:
# Patterns for the three contact fields.
# '-' is placed last in each character class so it is a literal hyphen:
# '[.-_]' was a range from '.' to '_' (it excluded '-' itself), and
# '[A-Z|a-z]' also accepted a literal '|' inside the top-level domain.
emailregex = r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+'
# Profile and user names may contain hyphens; stopping at the first one
# truncated the id (e.g. '.../in/john-doe' -> '.../in/john').
linkedinregex = r'(http(s?):\/\/)?(www\.)?linkedin\.com\/in\/([A-Za-z0-9_%-]{1,})'
githubregex =   r'(http(s?):\/\/)?(www\.)?github\.com\/([A-Za-z0-9-]{1,})'

In [16]:
# Spot-check the Word parser on one resume; the notebook echoes the
# returned value below the cell.
processWordDocument(
    filepath='resumes/3+ (2).docx',
    emailpattern=emailregex,
    linkedinpattern=linkedinregex,
    githubpattern=githubregex,
)

{'email': None, 'linkedin id': None, 'github id': None, 'skills': []}