In [1]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# Folder on the mounted Drive that is scanned for PDF files; the generated
# output.xlsx is written into this same folder.
folder_path = "/content/drive/MyDrive/UNI/VU/PDFs/"

In [3]:
# Change Python's current working directory to the PDF folder
os.chdir(folder_path)
# Print the name and contents of the current working directory
# (`!` runs the rest of the line as a shell command)
!pwd
!ls -al

/content/drive/MyDrive/UNI/VU/PDFs
total 683
-rw------- 1 root root 154251 Aug 24 11:02 CV_DoanNguyen.pdf
-rw------- 1 root root  21484 Oct  4 17:57 output.xlsx
-rw------- 1 root root 325424 Oct  4 17:17 sample1.pdf
-rw------- 1 root root 197629 Oct  4 17:17 sample2.pdf


In [4]:
# Hide output for his cell
%%capture
!pip install openai==0.28.0


In [5]:
# Hide output for his cell
%%capture
!pip install PyPDF2

In [6]:

import openai
import PyPDF2
import pandas as pd


In [7]:
# Set up OpenAI API key without storing the secret in the notebook.
# The key is read from the OPENAI_API_KEY environment variable when it is set;
# otherwise the user is prompted for it (the input is not echoed).
# NOTE: any key that was previously saved in plain text in this notebook
# should be treated as compromised and revoked.
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [8]:
# Function to read PDFs and extract text
def extract_text_from_pdfs(folder_path):
    """Extract the text of every PDF located directly inside ``folder_path``.

    Entries are visited in sorted name order so repeated runs yield the same
    ordering. The ``.pdf`` extension is matched case-insensitively and only
    regular files are opened.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        list[tuple[str, str]]: One ``(file_name, text)`` pair per PDF, where
        ``text`` is the concatenated text of all pages of that file.
    """
    pdf_texts = []
    # os.listdir gives no ordering guarantee; sort for reproducible results.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".pdf" cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue

        print(f"Processing file: {file_name}")

        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ''` guards against a page yielding None instead of a string.
            text = ''.join(page.extract_text() or '' for page in reader.pages)
        pdf_texts.append((file_name, text))
    return pdf_texts

In [9]:
import json
import re

# Helper: turn a (fence-stripped) model reply into a dict, or None on failure
def _parse_json_object(cleaned_output):
    """Return the JSON object contained in ``cleaned_output``, or ``None``.

    The whole string is tried first. If that fails, or does not produce a
    JSON object, the span from the first ``{`` to the last ``}`` is tried,
    which covers replies where the JSON is surrounded by explanatory prose.
    """
    candidates = [cleaned_output]
    start, end = cleaned_output.find("{"), cleaned_output.rfind("}")
    if 0 <= start < end:
        candidates.append(cleaned_output[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use .get()/.items(), so only a JSON object is acceptable.
        if isinstance(parsed, dict):
            return parsed
    return None


# Function to process text with GPT-4o and parse JSON output
def process_text_with_llm(text):
    """Ask the chat model to structure CV text and return the result as a dict.

    Args:
        text: Raw text extracted from one CV.

    Returns:
        dict: The JSON object found in the model reply. An empty dict is
        returned when ``text`` is blank (no request is sent) or when no JSON
        object can be recovered from the reply.
    """
    # Nothing to extract from blank input (e.g. a PDF without a text layer).
    if not text or not text.strip():
        print("Warning: empty input text; skipping the model call.")
        return {}

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Extract CV details and organize them into valid JSON format."},
            {"role": "user", "content": text}
        ]
    )

    structured_info = response['choices'][0]['message']['content']

    # Remove code block markers if they exist
    cleaned_info = re.sub(r"```json|```", "", structured_info or "")

    # Debug print statement to see the cleaned output
    print("Cleaned model output:", cleaned_info)

    # Attempt to parse the cleaned JSON output
    structured_info_dict = _parse_json_object(cleaned_info)
    if structured_info_dict is None:
        print("Error: Failed to parse JSON after cleaning. Please check the model's response format.")
        structured_info_dict = {}  # Use an empty dictionary to avoid further errors

    return structured_info_dict


# Function to convert structured data to DataFrame
def extract_to_dataframe(pdf_texts):
    """Build a one-row-per-CV DataFrame from extracted PDF texts.

    This is the single definition of the function; it keeps the behaviour of
    the definition that was previously in effect (DataFrame result, including
    a 'Skills' column).

    Args:
        pdf_texts: Iterable of ``(file_name, text)`` tuples.

    Returns:
        pandas.DataFrame: Columns 'File Name', 'Name', 'Job Experience',
        'Education' and 'Skills'. A field that is absent from the structured
        result returned by ``process_text_with_llm`` is stored as ``None``.
    """
    fields = ('Name', 'Job Experience', 'Education', 'Skills')
    data = {'File Name': []}
    data.update({field: [] for field in fields})

    # Iterate over list of (file_name, pdf_text) tuples
    for file_name, text in pdf_texts:
        structured_info = process_text_with_llm(text)
        data['File Name'].append(file_name)
        for field in fields:
            data[field].append(structured_info.get(field))

    return pd.DataFrame(data)

In [10]:
# Function to save data dictionary to Excel
def save_to_excel(data, excel_path):
    """Write ``data`` to an Excel file at ``excel_path``.

    ``data`` is first turned into a DataFrame; the index column is not
    written to the file.
    """
    pd.DataFrame(data).to_excel(excel_path, index=False)





In [11]:
def flatten_data_for_excel(structured_info, personal_keys=('personal_details',)):
    """Flatten a nested CV dictionary into a flat ``{label: text}`` mapping.

    Args:
        structured_info: Parsed CV data. Anything that is not a dict yields
            an empty result.
        personal_keys: Top-level section names whose entries are expanded
            into individual 'Personal - <Field>' rows. A listed section is
            only expanded when its value is a dict; otherwise it is handled
            like any other section.

    Returns:
        dict[str, str]: Expanded personal fields first, followed by one entry
        per remaining section (label is the capitalized section name). Nested
        dicts are rendered as 'Key: value' pairs joined by '; ' and lists as
        items joined by ', '.
    """
    if not isinstance(structured_info, dict):
        return {}

    flattened_data = {}

    # Helper function to process nested content
    def process_nested(item):
        if isinstance(item, dict):
            return "; ".join([f"{str(k).capitalize()}: {process_nested(v)}" for k, v in item.items()])
        elif isinstance(item, list):
            return ", ".join([process_nested(sub_item) for sub_item in item])
        else:
            return str(item)

    # Personal Details
    expanded_sections = set()
    for personal_key in personal_keys:
        personal_info = structured_info.get(personal_key)
        # Only a mapping can be expanded field by field; a string, list or
        # null value falls through to the generic handling below.
        if isinstance(personal_info, dict):
            expanded_sections.add(personal_key)
            for key, value in personal_info.items():
                flattened_data[f'Personal - {str(key).capitalize()}'] = process_nested(value)

    # Process other sections dynamically
    for section, content in structured_info.items():
        if section not in expanded_sections:  # Skip sections already expanded
            flattened_data[str(section).capitalize()] = process_nested(content)

    return flattened_data


New code: write each CV to its own sheet of a single Excel workbook

In [12]:

def main(folder_path):
    """Convert every PDF in ``folder_path`` into one sheet of 'output.xlsx'.

    Each PDF is read, structured by ``process_text_with_llm``, flattened by
    ``flatten_data_for_excel`` and written as a two-column sheet
    ('Section', 'Content') named 'CV_<n>', numbered in processing order.

    Args:
        folder_path: Directory that holds the PDFs; the workbook is written
            into the same directory.

    Returns:
        None. When no PDF text was extracted, a message is printed and no
        workbook is created.
    """
    pdf_texts = extract_text_from_pdfs(folder_path)
    output_file_path = os.path.join(folder_path, 'output.xlsx')

    # Closing an ExcelWriter that received no sheet fails on save, so stop
    # before opening the writer when there is nothing to export.
    if not pdf_texts:
        print(f"No PDF files found in {folder_path}; nothing was written.")
        return

    with pd.ExcelWriter(output_file_path) as writer:
        for i, (file_name, pdf_text) in enumerate(pdf_texts, start=1):
            structured_info = process_text_with_llm(pdf_text)  # Process PDF to get structured data
            flattened_data = flatten_data_for_excel(structured_info)  # Dynamically flatten based on structure

            # Convert flattened data to DataFrame with each key-value as a row for flexibility
            df = pd.DataFrame(list(flattened_data.items()), columns=['Section', 'Content'])
            sheet_name = f"CV_{i}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel file 'output.xlsx' with each CV in a separate sheet has been saved to {output_file_path}")

# Run the updated process on the configured folder; main() writes output.xlsx
# into that same folder.
main(folder_path)


Processing file: CV_DoanNguyen.pdf
Processing file: sample1.pdf
Processing file: sample2.pdf
Cleaned model output: 
{
  "personal_information": {
    "name": "Doan Nguyen",
    "location": "Melbourne, Australia",
    "phone": "H0452463137",
    "email": "bdoan310a@gmail.com",
    "website": "www.latrobe.edu.au/onguyen"
  },
  "summary": "Dedicated educator and researcher with a PhD and a strong foundation in Information Technology and Mathematics. Over 20 years of experience in lecturing, coordinating, and designing courses for both postgraduate and undergraduate levels. Research spans disciplines such as Machine Learning, Data Science, Cybersecurity, and Recommender Systems.",
  "education": [
    {
      "degree": "Doctor of Philosophy",
      "institution": "Japan Advanced Institute of Science and Technology",
      "location": "Japan",
      "date": "Mar. 2017",
      "dissertation": "A Study on Recommender Systems Based on Dempster-Shafer Theory"
    },
    {
      "degree": "Mast