# IMPLIMENTTION
Candidate: Doan Nguyen


# PDF Content Extraction and Organization:
This component is implemented using Python to read and extract content from candidate CVs stored in PDF format and organize the extracted information into an Excel file.

The PDF files are stored in a folder on Google Drive.

The Excel file, named Output.xlsx, is created and stored in the same folder.

To run this file, you need copy my OpenAI key from the MS Word file (I sent via email) and use it in the following cell.

In [13]:
# Please copy my OpenAI key from the MS Word file (I sent via email)

# Set up OpenAI API key
openai.api_key =

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import os

# solution
folder_path = "/content/drive/MyDrive/UNI/VU/PDFs/"

In [9]:
# Change Python's current working directory
os.chdir(folder_path)
# Print the name and contents of the current working directory
!pwd
!ls -al

/content/drive/MyDrive/UNI/VU/PDFs
total 520
-rw------- 1 root root   8642 Oct  7 06:00 output.xlsx
-rw------- 1 root root 325424 Oct  4 17:17 sample1.pdf
-rw------- 1 root root 197629 Oct  4 17:17 sample2.pdf


In [10]:
# Hide output for his cell
%%capture
!pip install openai==0.28.0


In [11]:
# Hide output for his cell
%%capture
!pip install PyPDF2

In [12]:
import openai
import PyPDF2
import pandas as pd


In [14]:
# Function to read PDFs and extract text
def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(folder_path, file_name)

            print(f"Processing file: {file_name}")

            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ''
                for page in range(len(reader.pages)):
                    text += reader.pages[page].extract_text()
                pdf_texts.append((file_name, text))
    return pdf_texts

In [15]:
import json
import re

# Function to process text with GPT-4o and parse JSON output
def process_text_with_llm(text):
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Extract CV details and organize them into valid JSON format."},
            {"role": "user", "content": text}
        ]
    )

    structured_info = response['choices'][0]['message']['content']

    # Remove code block markers if they exist
    cleaned_info = re.sub(r"```json|```", "", structured_info)

    # Debug print statement to see the cleaned output
    print("Cleaned model output:", cleaned_info)

    # Attempt to parse the cleaned JSON output
    try:
        structured_info_dict = json.loads(cleaned_info)
    except json.JSONDecodeError:
        print("Error: Failed to parse JSON after cleaning. Please check the model's response format.")
        structured_info_dict = {}  # Use an empty dictionary to avoid further errors

    return structured_info_dict


In [16]:
# Function to save data dictionary to Excel
def save_to_excel(data, excel_path):
    # Convert the dictionary to a DataFrame
    dataframe = pd.DataFrame(data)

    # Save the DataFrame to an Excel file
    dataframe.to_excel(excel_path, index=False)



In [17]:
def flatten_data_for_excel(structured_info):
    flattened_data = {}

    # Helper function to process nested content
    def process_nested(item):
        if isinstance(item, dict):
            return "; ".join([f"{k.capitalize()}: {process_nested(v)}" for k, v in item.items()])
        elif isinstance(item, list):
            return ", ".join([process_nested(sub_item) for sub_item in item])
        else:
            return str(item)

    # Personal Details
    personal_info = structured_info.get('personal_details', {})
    for key, value in personal_info.items():
        flattened_data[f'Personal - {key.capitalize()}'] = process_nested(value)

    # Process other sections dynamically
    for section, content in structured_info.items():
        if section != 'personal_details':  # Skip personal details as already processed
            flattened_data[section.capitalize()] = process_nested(content)

    return flattened_data


In [18]:
def main(folder_path):
    # List all files in the folder
    pdf_texts = extract_text_from_pdfs(folder_path)
    output_file_path = os.path.join(folder_path, 'output.xlsx')

    # Open ExcelWriter for writing to Excel file
    with pd.ExcelWriter(output_file_path) as writer:
        for i, (file_name, pdf_text) in enumerate(pdf_texts):
            # Split text into chunks to avoid exceeding token limits
            chunks = split_text_into_chunks(pdf_text, max_tokens=1000)  # Adjust chunk size to fit token limits
            structured_info = []
            for chunk in chunks:
                structured_info.append(process_text_with_llm(chunk))  # Process each chunk separately

            # Combine structured info and flatten for Excel output
            combined_info = combine_chunks(structured_info)
            flattened_data = flatten_data_for_excel(combined_info)  # Dynamically flatten based on structure

            # Convert flattened data to DataFrame with each key-value as a row for flexibility
            df = pd.DataFrame(list(flattened_data.items()), columns=['Section', 'Content'])
            sheet_name = f"Candidate_{i+1}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel file 'output.xlsx' with each candidate in a separate sheet has been saved to {output_file_path}")


In [19]:
def split_text_into_chunks(text, max_tokens=1000):
    """Splits text into chunks small enough for the LLM API limits."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_tokens = 0
    for word in words:
        current_tokens += len(word)  # Approximate token count with word length
        if current_tokens > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_tokens = len(word)
        else:
            current_chunk.append(word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


In [20]:
def combine_chunks(chunks):
    """Combines processed chunks back into a single structure."""
    combined_info = {}
    for chunk in chunks:
        for key, value in chunk.items():
            if key not in combined_info:
                combined_info[key] = value
            else:
                # If it's a list, append new values, otherwise overwrite
                if isinstance(combined_info[key], list):
                    combined_info[key].extend(value)
                else:
                    combined_info[key] = value
    return combined_info


main(folder_path)

Processing file: sample1.pdf
Processing file: sample2.pdf
Processing file: CV_DoanNguyen.pdf
Cleaned model output: 
{
  "name": "John Doe",
  "contact": {
    "location": "Your Location",
    "email": "youremail@yourdomain.com",
    "phone": "0541 999 99 99",
    "website": "yourwebsite.com",
    "linkedin": "yourusername",
    "github": "yourusername"
  },
  "education": [
    {
      "degree": "BS",
      "institution": "University of Pennsylvania",
      "field": "Computer Science",
      "gpa": "3.9/4.0",
      "coursework": [
        "Computer Architecture",
        "Comparison of Learning Algorithms",
        "Computational Theory"
      ],
      "dates": "Sept 2000 – May 2005"
    }
  ],
  "experience": [
    {
      "company": "Apple",
      "position": "Software"
    }
  ]
}

Cleaned model output: 
{
  "name": "John Doe",
  "experiences": [
    {
      "position": "Engineer",
      "location": "Cupertino, CA",
      "tasks": [
        "Reduced time to render user buddy lists b