Read all the resumes (PDF and Word files) from the iNeuron Git repository and build a DataFrame with
- the resume file name as the index; and
- email ID, LinkedIn ID, GitHub ID, and skills as columns.

In [63]:
import pandas as pd
from striprtf.striprtf import rtf_to_text
import textract
import PyPDF2
import re
import os

In [68]:
def match_pattern(element, pattern, text):
    """Return the first substring of *text* that matches *pattern*.

    Args:
        element: Unused. Kept only so existing call sites that pass the
            current field value as the first argument keep working.
        pattern: Regular expression (string or compiled pattern) to search for.
        text: String to search in.

    Returns:
        The matched substring (``group(0)``), or ``None`` when nothing
        matches, when *text* is not searchable (e.g. ``None``), or when
        *pattern* is not a valid regular expression.
    """
    try:
        found = re.search(pattern, text)
    except (TypeError, re.error):
        # Report "no match" instead of handing the exception object back to
        # the caller, where it would be stored as if it were a field value.
        return None
    return found.group(0) if found else None

def processDocs(text, emailpattern, linkedinpattern, githubpattern,
                email=None, linkedinId=None, githubId=None, skills=None):
    """Extract contact fields from the text of one resume.

    Each of *email*, *linkedinId* and *githubId* that is not already
    supplied (i.e. is falsy) is looked up in *text* with its pattern via
    ``match_pattern``.

    Args:
        text: Full text of the resume. Anything that is not a ``str`` is
            treated as "no text available" and no lookup is attempted.
        emailpattern: Regular expression used to find the email address.
        linkedinpattern: Regular expression used to find the LinkedIn URL.
        githubpattern: Regular expression used to find the GitHub URL.
        email: Already-known email address; skips the lookup when truthy.
        linkedinId: Already-known LinkedIn URL; skips the lookup when truthy.
        githubId: Already-known GitHub URL; skips the lookup when truthy.
        skills: List of skills to report. A fresh empty list is used when
            omitted, so results of separate calls never share one list.

    Returns:
        A dict with the keys ``"email"``, ``"linkedin id"``, ``"github id"``
        and ``"skills"``.
    """
    # A literal [] default would be created once and shared by every result.
    skills = [] if skills is None else skills

    if isinstance(text, str):
        if not email:
            email = match_pattern(email, emailpattern, text)
        if not linkedinId:
            linkedinId = match_pattern(linkedinId, linkedinpattern, text)
        if not githubId:
            githubId = match_pattern(githubId, githubpattern, text)

    # TODO: skills are not extracted from the text yet; the list is returned
    # exactly as supplied (or empty).
    return {"email": email,
            "linkedin id": linkedinId,
            "github id": githubId, "skills": skills}
    

def getPDFLinks(filepath):
    """Collect the URI targets of the link annotations in a PDF file.

    Every page's ``/Annots`` entries are inspected; for each annotation
    whose ``/A`` action dictionary has a ``/URI`` entry, that URI is
    collected with any ``mailto:`` text removed.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        A list of link strings in page order. The list is empty when the
        file has no such links or cannot be opened; if reading fails part
        way through, the links found up to that point are returned and a
        warning is issued.
    """
    import warnings

    # NOTE(review): PdfFileReader / getNumPages / getPage / getObject are the
    # legacy camelCase PyPDF2 names - confirm the installed PyPDF2 version
    # still provides them.
    key = '/Annots'
    uri = '/URI'
    ank = '/A'
    links = []

    try:
        with open(filepath, 'rb') as PDFFile:
            PDF = PyPDF2.PdfFileReader(PDFFile, strict=False)
            for page in range(PDF.getNumPages()):
                pageObject = PDF.getPage(page).getObject()
                if key not in pageObject.keys():
                    continue
                for a in pageObject[key]:
                    u = a.getObject()
                    # An annotation without an action entry is skipped here;
                    # indexing u[ank] unconditionally would raise KeyError
                    # and lose every link in the file.
                    if ank not in u.keys():
                        continue
                    action = u[ank]
                    if uri in action.keys():
                        links.append(re.sub('mailto:', '', action[uri]))
    except Exception as exc:
        # Best effort: always hand back a list so callers can join it.
        warnings.warn("could not read links from %r: %s" % (filepath, exc))
    return links

    
def getFileText(file):
    """Return the text content of a resume file.

    The file extension (case-insensitive) selects the extraction method:

    * ``.pdf``  - ``textract`` with the ``pdftotext`` method, followed by the
      link targets reported by ``getPDFLinks`` (space separated).
    * ``.docx`` - ``textract`` with the ``docx`` extension.
    * anything else - the file is read as ISO-8859-1 text and passed through
      ``rtf_to_text``.

    Args:
        file: Path of the file to read.

    Returns:
        The extracted text, or an empty string (with a warning) when the
        file cannot be read or converted.
    """
    import warnings

    try:
        _, fextention = os.path.splitext(file)
        suffix = fextention.lower()

        if suffix == '.pdf':
            text = textract.process(file, method='pdftotext').decode('utf-8')
            links = getPDFLinks(file)
            if isinstance(links, list) and links:
                # The leading space keeps the last word of the document from
                # fusing with the first link (which could corrupt a match).
                text = text + ' ' + ' '.join(str(link) for link in links)
            return text

        if suffix == '.docx':
            return textract.process(file, extension='docx').decode('utf-8')

        # NOTE(review): every other extension, including binary .doc files,
        # is treated as RTF here - confirm that is intended.
        with open(file, 'r', encoding="ISO-8859-1") as f:
            return rtf_to_text(f.read())
    except Exception as exc:
        # Best effort: callers search the result as text, so report failure
        # as "no text" rather than returning the exception object.
        warnings.warn("could not extract text from %r: %s" % (file, exc))
        return ''

# Local part: alphanumeric runs joined by '.', '_' or '-'. The separator class
# is written [._-] because [.-_] is a *range* from '.' to '_' that also
# accepts digits, upper-case letters and characters such as '/', ':' and '@'.
EMAIL_REGEX = r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+'
# Profile IDs may contain hyphens, so '-' is part of the ID character class.
LINKEDIN_REGEX = r'(http(s?):\/\/)?(www\.)?linkedin\.com\/in\/([A-Za-z0-9_-]{1,})'
GITHUB_REGEX = r'(http(s?):\/\/)?(www\.)?github\.com\/([A-Za-z0-9-]{1,})'


def cvParser(dirpath):
    """Build one record per resume file found directly inside *dirpath*.

    Each regular file is converted to text with ``getFileText`` and searched
    with ``processDocs`` using the module-level regular expressions.
    Sub-directories are skipped and files are processed in sorted name order.

    Args:
        dirpath: Directory that contains the resume files.

    Returns:
        A list of dicts, each holding ``"name"`` (the file name) plus the
        fields returned by ``processDocs``. A file whose processing fails
        still gets a record containing only ``"name"``. The list is empty
        (with a warning) when *dirpath* cannot be listed.
    """
    import warnings

    try:
        filenames = sorted(os.listdir(dirpath))
    except OSError as exc:
        warnings.warn("could not list %r: %s" % (dirpath, exc))
        return []

    records = []
    for filename in filenames:
        file = os.path.join(dirpath, filename)
        if not os.path.isfile(file):
            continue

        record = {"name": filename}
        try:
            text = getFileText(file)
            fields = processDocs(text, EMAIL_REGEX, LINKEDIN_REGEX, GITHUB_REGEX)
        except Exception as exc:
            # One unreadable resume must not discard the whole batch.
            warnings.warn("could not process %r: %s" % (file, exc))
            fields = None

        # Only merge real results; anything else cannot be merged by update().
        if isinstance(fields, dict):
            record.update(fields)
        records.append(record)
    return records

In [69]:
# Parse every resume in the directory and tabulate the extracted fields.
resdir = 'C:\\Users\\dell\\iNeuron\\python\\Pandas\\pandas_assessment2\\resumes'
cvlist = cvParser(resdir)
dframe = pd.DataFrame(cvlist)
# set_index returns a new DataFrame; keep it so that dframe itself is indexed
# by resume file name rather than only the displayed copy.
dframe = dframe.set_index('name')
dframe

Unnamed: 0_level_0,email,linkedin id,github id,skills
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12+ (2).docx,,,,[]
12+.docx,sandeep.dasc1@gmail.com,www.linkedin.com/in/pulavarthy,https://github.com/sandeepdasc1,[]
15+ (1).pdf,,,,[]
15+.pdf,,,,[]
20.pdf,,,,[]
3+ (2).docx,fsdsnov21@ineuron.ai,https://www.linkedin.com/in/fsdsnov21,https://www.github.com/fsdsnov21,[]
3+.docx,,,,[]
3+.pdf,,,,[]
4 .doc,,,,[]
4+.docx,,,,[]
