In [11]:
import os
import fitz
import docx2txt
import pandas as pd
import re
import yaml
yaml_file_path= 'config.yaml'

with open(yaml_file_path, 'r') as yaml_file:
    config = yaml.safe_load(yaml_file)

In [13]:
def scan_and_extract(folder_path, output_path):
    try:
        # Lists all the files in the folder
        files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
        print(f"Total number of CVs in the folder: {len(files)}")

        pdf_contents = {}
        docx_contents = {}

        for file in files:
            file_path = os.path.join(folder_path, file)

            if file.lower().endswith('.pdf'):
                pdf_contents[file] = extract_pdf_contents(file_path)
                save_to_file(pdf_contents[file], output_path, file.replace('.pdf', '_output.txt'))

            elif file.lower().endswith('.docx'):
                docx_contents[file] = extract_docx_contents(file_path)
                save_to_file(docx_contents[file], output_path, file.replace('.docx', '_output.txt'))
        
    except FileNotFoundError:
        print(f"The specified folder '{folder_path}' does not exist.")
    except PermissionError:
        print(f"Permission error while accessing folder '{folder_path}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

def extract_pdf_contents(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += page.get_text()
        doc.close()
    except Exception as e:
        print(f"Error extracting PDF contents from '{pdf_path}': {e}")
    return text

def extract_docx_contents(docx_path):
    text = ""
    try:
        text = docx2txt.process(docx_path)
    except Exception as e:
        print(f"Error extracting DOCX contents from '{docx_path}': {e}")
    return text

def save_to_file(content, folder, filename):
    output_path = os.path.join(folder, filename)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(content)

cv_inp = config['path']['cv_inp']
cv_out = config['path']['cv_out']
scan_and_extract(cv_inp, cv_out)

def extract_information_from_txt(txt_path):
    try:
        with open(txt_path, 'r', encoding='utf-8') as txt_file:
            text = txt_file.read()

        # Define regular expressions for extracting information
        name_pattern = re.compile(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+)*)\b')
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        phone_pattern = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')
        address_pattern = re.compile(r'\b\d{1,5}[^a-zA-Z0-9]*[a-zA-Z0-9\s,.#]+\b')
        skills_pattern = re.compile(r'\b(?:skills)\b[^:]*:\s*([^\n]*)', re.IGNORECASE)


        # Initialize variables
        name = address = phone = email = skills = ""

        # Extract information using regular expressions
        match_name = re.search(name_pattern, text)
        if match_name:
            name = match_name.group(1).strip()

        match_address = re.search(address_pattern, text)
        if match_address:
            address = match_address.group(0).strip()

        match_phone = re.search(phone_pattern, text)
        if match_phone:
            phone = match_phone.group(0).strip()

        match_email = re.search(email_pattern, text)
        if match_email:
            email = match_email.group(0).strip()

        match_skills = re.findall(skills_pattern, text)
        if match_skills:
            skills = match_skills


        return {'Name': name, 'Address': address, 'Phone': phone, 'Email': email, 'Skills': skills}

    except Exception as e:
        print(f"Error extracting information from '{txt_path}': {e}")
        return None

def process_output_folder(output_folder):
    data = []

    for file_name in os.listdir(output_folder):
        if file_name.lower().endswith('_output.txt'):
            txt_path = os.path.join(output_folder, file_name)
            information = extract_information_from_txt(txt_path)
            
            if information:
                information['File'] = file_name.replace('_output.txt', '')
                data.append(information)

    df = pd.DataFrame(data)
    return df

ext_inf = config['path']['ext_inf']
df = process_output_folder(ext_inf)
print("Extracted Information:")
df

Total number of CVs in the folder: 9
Extracted Information:


Unnamed: 0,Name,Address,Phone,Email,Skills,File
0,Manager,617.555.0123\n\nvictoria,617.555.0123,victoria@example.com,,cv1
1,Kushal Bhattarai,18\n\n+977 9847207123\n\nkushalbhattarai58,+977 9847207123,kushalbhattarai58@gmail.com,,ResumeKushal
2,Jack,07123 456 789\n\nEmail\n\n\n\njack.mclaughlin,7123 456 789,jack.mclaughlin@gmail.com,[How to write a CV],cv3
3,Roger Nadal,07123 456 789\nEmail\nrogernadal,7123 456 789,rogernadal@gmail.com,[],cv4
4,Python,977-9860476116\nSinamangal,+977-9860476116,pratik.pudasaini13@gmail.com,[//github.com/prateekgh],PRateek-Pudasainee
5,Python,977-9862699011,+977-9862699011,sushan074bex@ioepc.edu.np,,CV_sushan
6,Kushal Bhattarai,13th IOE Graduate Conference proceedings,,kushal.bhattarai@fusemachines.com,[Python],KushalBhattaraiCV
7,Helena Maguire,"07123 456 789\n\n17 Priory Road, London, N22 6...",7123 456 789,helena.maguire@gmail.com,[How to write a CV],cv5
8,Dedicated Sales Associate,5+ years of experience in retail environments....,5911 135476,olivia.davies@gmail.com,,cv2


In [43]:
#Saving the dataframe in a folder
pd2csv = config['path']['pd2csv']
df.to_csv(pd2csv)